Implement CE-WP-0002 T03-T09: ingest, anchor resolution, engine, UI, persistence, e2e

Completes the PDF review slice end-to-end. After this commit a user can
open a fixture, select text, save an evidence item with commentary, see
it in the sidebar, reload the page, click the item, and the viewer
scrolls to the passage.

- T03 src/source/pdf/{fingerprint,extract,ingest}.ts + 39 fixture tests
  - SHA-256 fingerprint over a fresh ArrayBuffer (TS BufferSource-safe)
  - PDF.js text extract; per-page normalize then join with "\n\n"
  - PageMap + OffsetMap (gap-free coverage); pageLength = end - start
  - Updated the manifest's Betriebskosten quote to one that PDF.js extracts cleanly
- T04 src/anchor/selectors/{create,resolve}.ts + 25 unit + 7 fixture tests
  - createSelectors emits the maximal redundant set (TextQuote +
    TextPosition + PdfRect + PdfPageText when available)
  - resolveSelectors implements the SharedContracts §7 ladder; confidence
    1.0 (pos+quote) → 0.7 (rect-only) → 0 (unresolved)
  - Cross-module integration test moved to tests/integration/ to honor
    the anchor↛source boundary lint rule
- T05 engine: sync event bus over the closed §4 vocabulary, Map-backed
  repos, services, createEngine() composition root, 12 tests
- T06 work + app: three-pane shell (CollectionList | ViewerShell |
  EvidenceSidebar) wired through EngineProvider; EngineContext lives in
  src/work/ to respect the work↛app boundary; SpikeApp deleted
- T07 AnnotationToolbar: pendingSelection in context; Save runs
  createSelectors → engine.annotations.create → engine.evidence.create
- T08 click-to-reopen + localStorage persistence
  - scrollToAnnotation state in context with a version counter so a
    second click on the same item re-fires the viewer scroll
  - captureSnapshot/restoreSnapshot/attachPersister/restoreFromStorage;
    restore bypasses services to avoid event-loops
  - active-document id persisted alongside the snapshot so reload lands
    on the same fixture; ADR-0005 written
  - 9 persistence tests
- T09 tests/integration/app-prd-scenario.dom.test.tsx
  - end-to-end happy-dom test of PRD scenario steps 1-8 through the real
    React tree; viewer + ingest mocked per ADR-0004's headless-Chromium
    limitation. Fixed memo-deps bug in EvidenceSidebar/ViewerShell where
    useEngineEventTick values were not included in the useMemo deps,
    leaving stale memoization across event-driven re-renders
- vitest.config.ts: happy-dom for *.dom.test.{ts,tsx} files
- noEmit added to tsconfig so tsc -b doesn't litter src/ with .js outputs

Gates: typecheck ✓ lint ✓ test 109/109 across 11 files ✓ build ✓

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-25 10:58:11 +02:00
parent 2a7b05c190
commit d54daf2e61
45 changed files with 3655 additions and 277 deletions

View File

@@ -5,3 +5,9 @@ export {
type PdfSpikeViewerProps,
type StoredAnnotation,
} from "./pdf-viewer-adapter-spike";
export {
createSelectors,
resolveSelectors,
DEFAULT_CONTEXT_CHARS,
type CreateSelectorsOptions,
} from "./selectors";

View File

@@ -0,0 +1,136 @@
import { describe, expect, it } from "vitest";
import type { DocumentRepresentation } from "@shared/document";
import type { DocumentId, RepresentationId } from "@shared/ids";
import type {
PdfPageTextSelector,
PdfRectSelector,
TextPositionSelector,
TextQuoteSelector,
} from "@shared/selector";
import { createSelectors } from "./create";
import type { PdfSelectionCapture } from "../types";
/**
 * Test fixture: wraps `canonicalText` in a single-page `pdf-text`
 * representation whose only OffsetMap entry spans the whole text.
 */
function repr(canonicalText: string): DocumentRepresentation {
  const wholeTextRange = {
    page: 1,
    globalStart: 0,
    globalEnd: canonicalText.length,
    pageLength: canonicalText.length,
  };
  const representation: DocumentRepresentation = {
    id: "rep_test" as RepresentationId,
    documentId: "doc_test" as DocumentId,
    representationType: "pdf-text",
    contentHash: "test",
    canonicalText,
    pageMap: [{ page: 1, width: 595, height: 842 }],
    offsetMap: [wholeTextRange],
    generatedAt: "2026-05-25T00:00:00.000Z",
  };
  return representation;
}
/**
 * Test fixture: fabricates a PDF selection capture for `text` on `page`
 * with `rectsCount` stacked line rectangles and a bounding rect whose
 * height scales with the number of rectangles.
 */
function capture(text: string, page = 1, rectsCount = 1): PdfSelectionCapture {
  // Each successive rectangle sits 0.05 lower than the previous one.
  const lineRect = (_: unknown, line: number) => ({
    x: 0.1,
    y: 0.2 + line * 0.05,
    width: 0.5,
    height: 0.04,
  });
  const rects = Array.from({ length: rectsCount }, lineRect);
  const boundingRect = { x: 0.1, y: 0.2, width: 0.5, height: 0.04 * rectsCount };
  return { kind: "pdf", text, page, rects, boundingRect };
}
// Suite for createSelectors: which selector types are emitted, how the
// prefix/suffix context is clamped, and which occurrence is chosen when a
// quote repeats across pages.
describe("createSelectors", () => {
  const text = "The quick brown fox jumps over the lazy dog near the river bank.";
  const representation = repr(text);
  const brownFoxOffset = text.indexOf("brown fox");

  // Small typed lookups so each test reads as "emit, pick, assert".
  type Emitted = ReturnType<typeof createSelectors>;
  const quoteIn = (sels: Emitted) =>
    sels.find((s): s is TextQuoteSelector => s.type === "TextQuoteSelector");
  const positionIn = (sels: Emitted) =>
    sels.find((s): s is TextPositionSelector => s.type === "TextPositionSelector");
  const rectIn = (sels: Emitted) =>
    sels.find((s): s is PdfRectSelector => s.type === "PdfRectSelector");
  const pageTextIn = (sels: Emitted) =>
    sels.find((s): s is PdfPageTextSelector => s.type === "PdfPageTextSelector");

  it("always includes a TextQuoteSelector with prefix and suffix from canonical text", () => {
    const quote = quoteIn(createSelectors(capture("brown fox"), representation));
    expect(quote).toBeDefined();
    expect(quote!.exact).toBe("brown fox");
    expect(quote!.prefix).toBe("The quick ");
    expect(quote!.suffix).toBe(" jumps over the lazy dog near th");
  });

  it("includes a TextPositionSelector pointing at the matched offset", () => {
    const pos = positionIn(createSelectors(capture("brown fox"), representation));
    expect(pos).toBeDefined();
    expect(pos!.start).toBe(brownFoxOffset);
    expect(pos!.end).toBe(brownFoxOffset + "brown fox".length);
  });

  it("includes a PdfRectSelector mirroring the capture's page and rects", () => {
    const twoLineCapture = capture("brown fox", 1, 2);
    const rect = rectIn(createSelectors(twoLineCapture, representation));
    expect(rect).toBeDefined();
    expect(rect!.page).toBe(1);
    expect(rect!.rects).toEqual(twoLineCapture.rects);
  });

  it("includes a PdfPageTextSelector when the match falls inside the capture's page range", () => {
    const pageText = pageTextIn(createSelectors(capture("brown fox"), representation));
    expect(pageText).toBeDefined();
    expect(pageText!.page).toBe(1);
    expect(pageText!.start).toBe(brownFoxOffset);
  });

  it("omits the TextPositionSelector when the quote cannot be found in canonical text", () => {
    const sels = createSelectors(capture("nonexistent phrase"), representation);
    expect(positionIn(sels)).toBeUndefined();
    const quote = quoteIn(sels);
    expect(quote!.exact).toBe("nonexistent phrase");
    expect(quote!.prefix).toBeUndefined();
    expect(quote!.suffix).toBeUndefined();
  });

  it("clamps prefix at the start of the canonical text", () => {
    const quote = quoteIn(createSelectors(capture("The quick"), representation))!;
    expect(quote.prefix).toBeUndefined();
    expect(quote.suffix).toBe(" brown fox jumps over the lazy d");
  });

  it("clamps suffix at the end of the canonical text", () => {
    const quote = quoteIn(createSelectors(capture("river bank."), representation))!;
    expect(quote.prefix).toBe("umps over the lazy dog near the ");
    expect(quote.suffix).toBeUndefined();
  });

  it("honors a custom contextChars option", () => {
    const sels = createSelectors(capture("brown fox"), representation, { contextChars: 4 });
    const quote = quoteIn(sels)!;
    expect(quote.prefix).toBe("ick ");
    expect(quote.suffix).toBe(" jum");
  });

  it("prefers the on-page match when the quote appears on multiple pages", () => {
    // Two-page representation where the quote appears once per page; the
    // "\n\n" page separator is attributed to page 1, so page 2 starts at 18.
    const canonical = "alpha echo bravo" + "\n\n" + "charlie echo delta";
    const pageTwoStart = 18;
    const rep: DocumentRepresentation = {
      id: "rep_multi" as RepresentationId,
      documentId: "doc_multi" as DocumentId,
      representationType: "pdf-text",
      contentHash: "h",
      canonicalText: canonical,
      pageMap: [
        { page: 1, width: 100, height: 100 },
        { page: 2, width: 100, height: 100 },
      ],
      offsetMap: [
        { page: 1, globalStart: 0, globalEnd: pageTwoStart, pageLength: pageTwoStart },
        {
          page: 2,
          globalStart: pageTwoStart,
          globalEnd: canonical.length,
          pageLength: canonical.length - pageTwoStart,
        },
      ],
      generatedAt: "2026-05-25T00:00:00.000Z",
    };
    const pos = positionIn(createSelectors(capture("echo", 2), rep))!;
    expect(pos.start).toBe(canonical.indexOf("echo", pageTwoStart));
  });
});

View File

@@ -0,0 +1,157 @@
/**
* Build the maximal `Selector[]` from a viewer's `SelectionCapture`.
*
* Implements the "always store all selector types that are available" rule
* from `wiki/SharedContracts.md` §3 (selector redundancy) and the create
* half of the `AnchorAdapter` contract in
* `wiki/ArchitectureOverview.md` §3.3.
*
* Output guarantee: every returned `Selector[]` includes a
* `TextQuoteSelector` whenever the normalized selection text is non-empty,
* and adds `TextPositionSelector`, `PdfRectSelector`, `PdfPageTextSelector`
* only when the underlying data actually supports them. Resolvers can rely
* on the union being trimmed — a missing selector means "not available",
* not "skipped".
*/
import type { DocumentRepresentation } from "@shared/document";
import { normalize } from "@shared/text/normalize";
import type {
PdfPageTextSelector,
PdfRectSelector,
Selector,
TextPositionSelector,
TextQuoteSelector,
} from "@shared/selector";
import type { PdfSelectionCapture, SelectionCapture } from "../types";
/**
 * Default number of characters of surrounding text stored on each side of
 * the quote (`prefix` / `suffix`) of a TextQuoteSelector. Applied when
 * `CreateSelectorsOptions.contextChars` is not supplied.
 */
export const DEFAULT_CONTEXT_CHARS = 32;
/** Optional tuning for `createSelectors`. */
export interface CreateSelectorsOptions {
/**
 * Maximum number of characters of prefix and of suffix context to capture
 * around the quote. Falls back to `DEFAULT_CONTEXT_CHARS` when omitted.
 */
readonly contextChars?: number;
}
/**
 * Builds the redundant selector set for a viewer selection.
 *
 * @param capture - The selection as reported by the viewer.
 * @param representation - The document representation the selectors are
 *   anchored against.
 * @param options - Optional tuning; see `CreateSelectorsOptions`.
 * @returns The selectors produced for the capture.
 */
export function createSelectors(
  capture: SelectionCapture,
  representation: DocumentRepresentation,
  options: CreateSelectorsOptions = {},
): Selector[] {
  // `SelectionCapture` is a discriminated union whose DOM branch is `never`
  // in MVP, which leaves `PdfSelectionCapture` as the only runtime shape.
  const pdfCapture: PdfSelectionCapture = capture;
  return createSelectorsFromPdfCapture(pdfCapture, representation, options);
}
/**
 * Derives every selector the data supports from a PDF selection capture.
 *
 * Emission order is fixed: TextQuoteSelector, TextPositionSelector,
 * PdfRectSelector, PdfPageTextSelector. A selector that cannot be backed by
 * the available data is left out.
 *
 * @param capture - PDF selection (text, page, rects).
 * @param representation - Representation whose canonical text and OffsetMap
 *   are searched.
 * @param options - `contextChars` overrides `DEFAULT_CONTEXT_CHARS`.
 */
function createSelectorsFromPdfCapture(
  capture: PdfSelectionCapture,
  representation: DocumentRepresentation,
  options: CreateSelectorsOptions,
): Selector[] {
  const contextChars = options.contextChars ?? DEFAULT_CONTEXT_CHARS;
  const exact = normalize(capture.text).text;
  const text = representation.canonicalText ?? "";
  const hasQuote = exact.length > 0;

  // Prefer the occurrence on the capture's page (when the OffsetMap knows
  // that page); otherwise the first occurrence. `null` means no occurrence.
  const pageRange = representation.offsetMap?.find((range) => range.page === capture.page);
  const occurrences = text.length > 0 && hasQuote ? findAllOccurrences(text, exact) : [];
  const matchOffset = pickMatch(occurrences, pageRange);

  const selectors: Selector[] = [];

  // 1. TextQuoteSelector: emitted for any non-empty quote. Without a match
  //    it carries no context, but keeps the annotation recoverable if the
  //    representation is rebuilt later.
  if (hasQuote) {
    if (matchOffset === null) {
      const bareQuote: TextQuoteSelector = { type: "TextQuoteSelector", exact };
      selectors.push(bareQuote);
    } else {
      selectors.push(buildQuoteSelectorWithContext(text, matchOffset, exact, contextChars));
    }
  }

  // 2. TextPositionSelector: emitted whenever an occurrence was picked.
  if (matchOffset !== null) {
    const position: TextPositionSelector = {
      type: "TextPositionSelector",
      start: matchOffset,
      end: matchOffset + exact.length,
    };
    selectors.push(position);
  }

  // 3. PdfRectSelector: copied straight from the viewer's capture.
  if (capture.rects.length > 0) {
    const rect: PdfRectSelector = {
      type: "PdfRectSelector",
      page: capture.page,
      rects: capture.rects,
    };
    selectors.push(rect);
  }

  // 4. PdfPageTextSelector: only when the picked occurrence lies entirely
  //    inside the capture page's global range.
  if (matchOffset !== null && pageRange) {
    const fitsOnPage =
      matchOffset >= pageRange.globalStart &&
      matchOffset + exact.length <= pageRange.globalEnd;
    if (fitsOnPage) {
      const localStart = matchOffset - pageRange.globalStart;
      const pageText: PdfPageTextSelector = {
        type: "PdfPageTextSelector",
        page: capture.page,
        start: localStart,
        end: localStart + exact.length,
      };
      selectors.push(pageText);
    }
  }

  return selectors;
}
/**
 * Returns the start offset of every occurrence of `needle` in `haystack`,
 * in ascending order. Overlapping occurrences are included; an empty
 * `needle` yields no occurrences.
 */
function findAllOccurrences(haystack: string, needle: string): number[] {
  const hits: number[] = [];
  if (needle.length === 0) return hits;
  let idx = haystack.indexOf(needle);
  while (idx !== -1) {
    hits.push(idx);
    // Advance by one (not by needle length) so overlapping hits are kept.
    idx = haystack.indexOf(needle, idx + 1);
  }
  return hits;
}
/**
 * Chooses which occurrence of the quote the selectors should point at.
 *
 * @param positions - Start offsets of all occurrences, ascending.
 * @param pageRange - Global offset range of the capture's page, if known.
 * @returns `null` when there are no occurrences; the sole occurrence when
 *   there is exactly one; otherwise the first occurrence starting inside
 *   `pageRange`, falling back to the first occurrence overall.
 */
function pickMatch(
  positions: readonly number[],
  pageRange: { globalStart: number; globalEnd: number } | undefined,
): number | null {
  const first = positions[0];
  if (first === undefined) return null;
  if (positions.length === 1 || !pageRange) return first;

  const { globalStart, globalEnd } = pageRange;
  const onPage = positions.find((offset) => offset >= globalStart && offset < globalEnd);
  // With several candidates and none on the page, the first one is used;
  // the resolver then needs prefix/suffix context to disambiguate.
  return onPage ?? first;
}
/**
 * Builds a TextQuoteSelector for the occurrence of `exact` at `matchOffset`,
 * attaching up to `contextChars` characters of surrounding canonical text as
 * `prefix` and `suffix`.
 *
 * Context is clamped at both ends of the text; an empty prefix or suffix is
 * omitted from the result rather than stored as "".
 *
 * @param canonicalText - Text the quote was found in.
 * @param matchOffset - Start offset of the occurrence in `canonicalText`.
 * @param exact - The (normalized) quoted text.
 * @param contextChars - Requested context width per side. Negative or `NaN`
 *   values yield no context; fractional values are rounded down so prefix
 *   and suffix get the same width.
 */
function buildQuoteSelectorWithContext(
  canonicalText: string,
  matchOffset: number,
  exact: string,
  contextChars: number,
): TextQuoteSelector {
  // Sanitize once. A raw NaN would flow through Math.max/Math.min into
  // slice(), which coerces it to 0 and would store the *entire* text before
  // the match as prefix. A fractional width would truncate differently on
  // each side.
  const width = Number.isNaN(contextChars) ? 0 : Math.max(0, Math.floor(contextChars));
  const matchEnd = matchOffset + exact.length;

  const prefix = canonicalText.slice(Math.max(0, matchOffset - width), matchOffset);
  const suffix = canonicalText.slice(matchEnd, Math.min(canonicalText.length, matchEnd + width));

  return {
    type: "TextQuoteSelector",
    exact,
    ...(prefix.length > 0 ? { prefix } : {}),
    ...(suffix.length > 0 ? { suffix } : {}),
  };
}

View File

@@ -0,0 +1,6 @@
export {
createSelectors,
DEFAULT_CONTEXT_CHARS,
type CreateSelectorsOptions,
} from "./create";
export { resolveSelectors } from "./resolve";

View File

@@ -0,0 +1,137 @@
import { describe, expect, it } from "vitest";
import type { DocumentRepresentation } from "@shared/document";
import type { DocumentId, RepresentationId } from "@shared/ids";
import type { Selector } from "@shared/selector";
import { resolveSelectors } from "./resolve";
/**
 * Test fixture: builds a `pdf-text` representation that splits
 * `canonicalText` into `pages` contiguous ranges. Every page gets
 * `floor(length / pages)` characters except the last, which absorbs the
 * remainder so the OffsetMap covers the text without gaps.
 */
function repr(canonicalText: string, pages = 1): DocumentRepresentation {
  const totalLength = canonicalText.length;
  const segmentLen = pages === 1 ? totalLength : Math.floor(totalLength / pages);
  const pageNumbers = Array.from({ length: pages }, (_, i) => i + 1);

  const offsetMap = pageNumbers.map((page) => {
    const globalStart = (page - 1) * segmentLen;
    const globalEnd = page === pages ? totalLength : globalStart + segmentLen;
    return { page, globalStart, globalEnd, pageLength: globalEnd - globalStart };
  });
  const pageMap = pageNumbers.map((page) => ({ page, width: 595, height: 842 }));

  return {
    id: "rep_test" as RepresentationId,
    documentId: "doc_test" as DocumentId,
    representationType: "pdf-text",
    contentHash: "test",
    canonicalText,
    pageMap,
    offsetMap,
    generatedAt: "2026-05-25T00:00:00.000Z",
  };
}
// Suite for resolveSelectors: one test per rung of the confidence ladder,
// plus the ambiguous and unresolved outcomes.
describe("resolveSelectors", () => {
  const text = "The quick brown fox jumps over the lazy dog.";
  const representation = repr(text);

  // Shared fixtures for the "brown fox" passage (offsets 10..19 in `text`).
  const brownFoxStart = text.indexOf("brown fox");
  const brownFoxSpan = { start: brownFoxStart, end: brownFoxStart + "brown fox".length };
  const brownFoxPosition: Selector = { type: "TextPositionSelector", ...brownFoxSpan };
  const brownFoxQuote: Selector = { type: "TextQuoteSelector", exact: "brown fox" };

  it("returns 1.0 confidence when position and quote agree exactly", () => {
    const r = resolveSelectors([brownFoxPosition, brownFoxQuote], representation);
    expect(r.status).toBe("resolved");
    expect(r.confidence).toBe(1.0);
    expect(r.candidates[0]?.textPosition).toEqual(brownFoxSpan);
    expect(r.candidates[0]?.page).toBe(1);
    expect(r.usedSelectorTypes).toEqual(["TextPositionSelector", "TextQuoteSelector"]);
  });

  it("falls back to quote search when position is stale, and records a warning", () => {
    // The stale position covers "The quick" instead of "brown fox".
    const stalePosition: Selector = { type: "TextPositionSelector", start: 0, end: 9 };
    const r = resolveSelectors([stalePosition, brownFoxQuote], representation);
    expect(r.status).toBe("resolved");
    expect(r.confidence).toBe(0.95);
    expect(r.candidates[0]?.textPosition).toEqual(brownFoxSpan);
    expect(r.warnings?.[0]).toMatch(/did not match/);
    expect(r.usedSelectorTypes).toEqual(["TextQuoteSelector"]);
  });

  it("returns 0.85 for a position-only selector with no quote to verify", () => {
    const r = resolveSelectors([brownFoxPosition], representation);
    expect(r.status).toBe("resolved");
    expect(r.confidence).toBe(0.85);
  });

  it("returns 0.95 when only TextQuoteSelector is present and the quote is unique", () => {
    const r = resolveSelectors([brownFoxQuote], representation);
    expect(r.status).toBe("resolved");
    expect(r.confidence).toBe(0.95);
  });

  it("returns 0.9 when a duplicated quote is disambiguated by prefix/suffix", () => {
    const dup = "alpha echo bravo charlie echo delta";
    const contextualQuote: Selector = {
      type: "TextQuoteSelector",
      exact: "echo",
      prefix: "charlie ",
      suffix: " delta",
    };
    const r = resolveSelectors([contextualQuote], repr(dup));
    expect(r.status).toBe("resolved");
    expect(r.confidence).toBe(0.9);
    expect(r.candidates[0]?.textPosition?.start).toBe(dup.indexOf("echo", 10));
  });

  it("returns ambiguous when a duplicated quote cannot be disambiguated", () => {
    const dup = "echo and echo";
    const bareQuote: Selector = { type: "TextQuoteSelector", exact: "echo" };
    const r = resolveSelectors([bareQuote], repr(dup));
    expect(r.status).toBe("ambiguous");
    expect(r.confidence).toBe(0.5);
  });

  it("falls back to PdfPageTextSelector via the OffsetMap", () => {
    // Single page, so page-local offsets equal global offsets.
    const pageText: Selector = { type: "PdfPageTextSelector", page: 1, ...brownFoxSpan };
    const r = resolveSelectors([pageText], representation);
    expect(r.status).toBe("resolved");
    expect(r.confidence).toBe(0.8);
    expect(r.candidates[0]?.textPosition).toEqual(brownFoxSpan);
    expect(r.candidates[0]?.page).toBe(1);
  });

  it("falls back to PdfRectSelector with page+rects only at 0.7 confidence", () => {
    const rectOnly: Selector = {
      type: "PdfRectSelector",
      page: 2,
      rects: [{ x: 0.1, y: 0.2, width: 0.3, height: 0.04 }],
    };
    const r = resolveSelectors([rectOnly], repr(text, 1));
    expect(r.status).toBe("resolved");
    expect(r.confidence).toBe(0.7);
    expect(r.candidates[0]?.page).toBe(2);
    expect(r.candidates[0]?.textPosition).toBeUndefined();
    expect(r.candidates[0]?.rects).toHaveLength(1);
  });

  it("returns unresolved when nothing matches", () => {
    const missingQuote: Selector = { type: "TextQuoteSelector", exact: "missing string" };
    const r = resolveSelectors([missingQuote], representation);
    expect(r.status).toBe("unresolved");
    expect(r.confidence).toBe(0);
    expect(r.candidates).toEqual([]);
  });
});

View File

@@ -0,0 +1,260 @@
/**
* Resolve a `Selector[]` against a `DocumentRepresentation`.
*
* Implements the resolution strategy from `wiki/ArchitectureOverview.md` §7,
* MVP-trimmed:
*
* 1. Try `TextPositionSelector` (cheapest — direct slice).
* 2. Verify with `TextQuoteSelector` at that position.
* 3. Try `TextQuoteSelector` on its own. If multiple matches, disambiguate
* by prefix/suffix.
* 4. Try `PdfPageTextSelector` (page-local offsets through the OffsetMap).
* 5. Fall back to `PdfRectSelector` for a page+rects-only target.
* 6. Return `unresolved` if nothing above succeeds.
*
* Fuzzy matching is out of scope here; a later workplan owns it.
*
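* Confidence ladder (0..1):
* 1.00 — TextPosition + TextQuote agree exactly
* 0.95 — TextQuote unique match (no position to cross-check)
* 0.90 — TextQuote disambiguated by prefix/suffix
* 0.85 — TextPosition only (no quote to cross-check)
* 0.80 — PdfPageTextSelector resolved via OffsetMap
* 0.70 — PdfRectSelector only (page+rects, no text verification)
* 0.50 — TextQuote matched several times and prefix/suffix could not single
*        one out; status is `ambiguous` and the first candidate is returned
* 0.00 — nothing resolved; status is `unresolved`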
*/
import type { DocumentRepresentation } from "@shared/document";
import type {
PdfPageTextSelector,
PdfRectSelector,
Selector,
SelectorType,
TextPositionSelector,
TextQuoteSelector,
} from "@shared/selector";
import type { AnchorResolution, ResolvedAnchorTarget } from "../types";
/**
 * Resolves a stored selector set against a document representation by
 * walking the rungs in order and returning at the first one that succeeds:
 * position verified by quote, position alone, quote search, page-local text
 * offsets, then page rectangles.
 *
 * @param selectors - Selectors stored for one annotation. If a type appears
 *   more than once, the last one is used.
 * @param representation - Representation to resolve against; a missing
 *   canonical text or OffsetMap disables the rungs that need it.
 * @returns The resolution, with a single candidate unless unresolved.
 */
export function resolveSelectors(
  selectors: readonly Selector[],
  representation: DocumentRepresentation,
): AnchorResolution {
  const text = representation.canonicalText ?? "";
  const pageRanges = representation.offsetMap ?? [];
  const representationId = representation.id;
  const hasText = text.length > 0;
  const {
    TextPositionSelector: positionSel,
    TextQuoteSelector: quoteSel,
    PdfPageTextSelector: pageTextSel,
    PdfRectSelector: rectSel,
  } = indexByType(selectors);
  const warnings: string[] = [];

  // Rungs 1 & 2: stored position, cross-checked against the quote if any.
  if (positionSel && hasText) {
    const slice = sliceSafely(text, positionSel.start, positionSel.end);
    if (slice !== null) {
      const positionTarget = (): ResolvedAnchorTarget => ({
        representationId,
        textPosition: { start: positionSel.start, end: positionSel.end },
        ...pageFor(positionSel, pageRanges),
      });
      if (!quoteSel) {
        // Nothing to verify against: accept at lower confidence.
        return resolved(positionTarget(), 0.85, ["TextPositionSelector"], warnings);
      }
      if (slice === quoteSel.exact) {
        return resolved(
          positionTarget(),
          1.0,
          ["TextPositionSelector", "TextQuoteSelector"],
          warnings,
        );
      }
      // Stale position: remember why and let the quote search take over.
      warnings.push(
        "TextPositionSelector slice did not match TextQuoteSelector.exact; falling back to quote search.",
      );
    }
  }

  // Rung 3: search for the quote itself.
  if (quoteSel && hasText) {
    const hit = resolveByQuote(text, quoteSel);
    if (hit) {
      const span = { start: hit.offset, end: hit.offset + quoteSel.exact.length };
      return resolved(
        { representationId, textPosition: span, ...pageFor(span, pageRanges) },
        hit.confidence,
        ["TextQuoteSelector"],
        warnings,
        hit.status,
      );
    }
  }

  // Rung 4: page-local offsets translated to global ones via the OffsetMap.
  if (pageTextSel && pageRanges.length > 0) {
    const range = pageRanges.find((r) => r.page === pageTextSel.page);
    if (
      range &&
      pageTextSel.start >= 0 &&
      pageTextSel.end <= range.pageLength &&
      pageTextSel.start < pageTextSel.end
    ) {
      return resolved(
        {
          representationId,
          page: pageTextSel.page,
          textPosition: {
            start: range.globalStart + pageTextSel.start,
            end: range.globalStart + pageTextSel.end,
          },
        },
        0.8,
        ["PdfPageTextSelector"],
        warnings,
      );
    }
  }

  // Rung 5: rectangles only; the text cannot be verified here.
  if (rectSel) {
    return resolved(
      { representationId, page: rectSel.page, rects: rectSel.rects },
      0.7,
      ["PdfRectSelector"],
      warnings,
    );
  }

  return unresolved(warnings);
}
/** Result of locating a TextQuoteSelector in the canonical text (see `resolveByQuote`). */
interface QuoteResolutionResult {
/** Start offset, in the canonical text, of the occurrence that was chosen. */
readonly offset: number;
/** 0.95 for a unique match, 0.9 when prefix/suffix singled one out, 0.5 when ambiguous. */
readonly confidence: number;
/** "ambiguous" when several occurrences remained and the first one was returned. */
readonly status: "resolved" | "ambiguous";
}
/**
 * Locates `quote.exact` in `canonicalText`.
 *
 * @returns `null` when the quote does not occur; a 0.95 "resolved" result
 *   for a single occurrence; a 0.9 "resolved" result when exactly one
 *   occurrence matches the stored prefix/suffix; otherwise a 0.5
 *   "ambiguous" result pointing at the first context-matching occurrence,
 *   or at the first occurrence overall if none matches the context.
 */
function resolveByQuote(canonicalText: string, quote: TextQuoteSelector): QuoteResolutionResult | null {
  const occurrences = findAllOccurrences(canonicalText, quote.exact);
  const firstOccurrence = occurrences[0];
  if (firstOccurrence === undefined) return null;
  if (occurrences.length === 1) {
    return { offset: firstOccurrence, confidence: 0.95, status: "resolved" };
  }

  // Several occurrences: keep only those whose surroundings match the
  // stored prefix/suffix.
  const inContext = occurrences.filter((offset) => prefixSuffixMatches(canonicalText, offset, quote));
  const firstInContext = inContext[0];
  if (firstInContext !== undefined && inContext.length === 1) {
    return { offset: firstInContext, confidence: 0.9, status: "resolved" };
  }

  // Either the context still leaves several candidates, or none of them
  // matches it (or there was no context to begin with).
  return { offset: firstInContext ?? firstOccurrence, confidence: 0.5, status: "ambiguous" };
}
/**
 * Checks whether the occurrence of `quote.exact` at `offset` is surrounded
 * by the selector's stored context.
 *
 * A side whose context is `undefined` is not checked. A stored prefix must
 * appear immediately before `offset`, and a stored suffix immediately after
 * the end of the quote; context that would extend past either end of the
 * text does not match.
 */
function prefixSuffixMatches(
  canonicalText: string,
  offset: number,
  quote: TextQuoteSelector,
): boolean {
  const { prefix, suffix, exact } = quote;
  // endsWith(search, endPosition) treats the text as if it ended at `offset`.
  if (prefix !== undefined && !canonicalText.endsWith(prefix, offset)) return false;
  if (suffix === undefined) return true;
  // startsWith(search, position) anchors the comparison right after the quote.
  return canonicalText.startsWith(suffix, offset + exact.length);
}
/**
 * Lookup of at most one selector per supported type, as built by
 * `indexByType`. A key is absent when no selector of that type was supplied.
 */
interface SelectorIndex {
TextQuoteSelector?: TextQuoteSelector;
TextPositionSelector?: TextPositionSelector;
PdfRectSelector?: PdfRectSelector;
PdfPageTextSelector?: PdfPageTextSelector;
}
/**
 * Groups `selectors` by their `type` tag so each rung of the resolver can
 * look up its selector directly. When a type occurs more than once the last
 * occurrence wins; selectors of any other type are ignored.
 */
function indexByType(selectors: readonly Selector[]): SelectorIndex {
  const index: SelectorIndex = {};
  for (const selector of selectors) {
    if (selector.type === "TextQuoteSelector") {
      index.TextQuoteSelector = selector;
    } else if (selector.type === "TextPositionSelector") {
      index.TextPositionSelector = selector;
    } else if (selector.type === "PdfRectSelector") {
      index.PdfRectSelector = selector;
    } else if (selector.type === "PdfPageTextSelector") {
      index.PdfPageTextSelector = selector;
    }
  }
  return index;
}
/**
 * Returns `text.slice(start, end)` when `[start, end)` is a valid, non-empty
 * range of integer offsets inside `text`; otherwise `null`.
 *
 * @param text - Text to slice.
 * @param start - Inclusive start offset.
 * @param end - Exclusive end offset.
 */
function sliceSafely(text: string, start: number, end: number): string | null {
  // Every comparison against NaN is false, so without this check a NaN (or
  // fractional) offset would slip through the range guard below and be
  // silently coerced by slice() into a different range.
  if (!Number.isInteger(start) || !Number.isInteger(end)) return null;
  if (start < 0 || end > text.length || start >= end) return null;
  return text.slice(start, end);
}
/**
 * Finds the page whose global offset range fully contains `span`.
 *
 * @returns `{ page }` for the first containing range in `offsetMap`, or `{}`
 *   when no range contains the span (including an empty `offsetMap` or a
 *   span that crosses a page boundary). The result is meant to be spread
 *   into a target object.
 */
function pageFor(
  span: { start: number; end: number },
  offsetMap: readonly { page: number; globalStart: number; globalEnd: number }[],
): { page?: number } {
  for (const { page, globalStart, globalEnd } of offsetMap) {
    if (globalStart <= span.start && span.end <= globalEnd) return { page };
  }
  return {};
}
/**
 * Collects the start offset of every occurrence of `needle` in `haystack`,
 * in ascending order, overlapping occurrences included. An empty `needle`
 * produces an empty list.
 */
function findAllOccurrences(haystack: string, needle: string): number[] {
  const offsets: number[] = [];
  if (needle.length === 0) return offsets;
  // Resume one character after each hit so overlapping matches are found.
  for (
    let at = haystack.indexOf(needle);
    at !== -1;
    at = haystack.indexOf(needle, at + 1)
  ) {
    offsets.push(at);
  }
  return offsets;
}
/**
 * Assembles a single-candidate resolution.
 *
 * @param target - The resolved target; becomes the only candidate.
 * @param confidence - Confidence for this rung of the ladder.
 * @param used - Selector types that produced the result.
 * @param warnings - Notes gathered on the way; the `warnings` key is left
 *   out of the result entirely when this list is empty.
 * @param status - "resolved" by default, "ambiguous" for an uncertain pick.
 */
function resolved(
  target: ResolvedAnchorTarget,
  confidence: number,
  used: readonly SelectorType[],
  warnings: readonly string[],
  status: "resolved" | "ambiguous" = "resolved",
): AnchorResolution {
  const optionalWarnings = warnings.length > 0 ? { warnings } : {};
  return {
    status,
    confidence,
    candidates: [target],
    usedSelectorTypes: used,
    ...optionalWarnings,
  };
}
/**
 * Assembles the "nothing matched" resolution: zero confidence, no
 * candidates, no selector types used. The `warnings` key is included only
 * when at least one warning was collected.
 */
function unresolved(warnings: readonly string[]): AnchorResolution {
  const optionalWarnings = warnings.length > 0 ? { warnings } : {};
  return {
    status: "unresolved",
    confidence: 0,
    candidates: [],
    usedSelectorTypes: [],
    ...optionalWarnings,
  };
}