generated from coulomb/repo-seed
Implement CE-WP-0002 T03-T09: ingest, anchor resolution, engine, UI, persistence, e2e
Completes the PDF review slice end-to-end. After this commit a user can
open a fixture, select text, save an evidence item with commentary, see
it in the sidebar, reload the page, click the item, and the viewer
scrolls to the passage.
- T03 src/source/pdf/{fingerprint,extract,ingest}.ts + 39 fixture tests
- SHA-256 fingerprint over a fresh ArrayBuffer (TS BufferSource-safe)
- PDF.js text extract; per-page normalize then join with "\n\n"
- PageMap + OffsetMap (gap-free coverage); pageLength = end - start
- Updated manifest's Betriebskosten quote to one PDF.js extracts cleanly
- T04 src/anchor/selectors/{create,resolve}.ts + 25 unit + 7 fixture tests
- createSelectors emits the maximal redundant set (TextQuote +
TextPosition + PdfRect + PdfPageText when available)
- resolveSelectors implements the SharedContracts §7 ladder; confidence
1.0 (pos+quote) → 0.7 (rect-only) → 0 (unresolved)
- Cross-module integration test moved to tests/integration/ to honor
the anchor↛source boundary lint rule
- T05 engine: sync event bus over the closed §4 vocabulary, Map-backed
repos, services, createEngine() composition root, 12 tests
- T06 work + app: three-pane shell (CollectionList | ViewerShell |
EvidenceSidebar) wired through EngineProvider; EngineContext lives in
src/work/ to respect the work↛app boundary; SpikeApp deleted
- T07 AnnotationToolbar: pendingSelection in context; Save runs
createSelectors → engine.annotations.create → engine.evidence.create
- T08 click-to-reopen + localStorage persistence
- scrollToAnnotation state in context with a version counter so a
second click on the same item re-fires the viewer scroll
- captureSnapshot/restoreSnapshot/attachPersister/restoreFromStorage;
restore bypasses services to avoid event-loops
- active-document id persisted alongside the snapshot so reload lands
on the same fixture; ADR-0005 written
- 9 persistence tests
- T09 tests/integration/app-prd-scenario.dom.test.tsx
- end-to-end happy-dom test of PRD scenario steps 1-8 through the real
React tree; viewer + ingest mocked per ADR-0004's headless-Chromium
limitation. Fixed memo-deps bug in EvidenceSidebar/ViewerShell where
useEngineEventTick values were not included in the useMemo deps,
leaving stale memoization across event-driven re-renders
- vitest.config.ts: happy-dom for *.dom.test.{ts,tsx} files
- noEmit added to tsconfig so tsc -b doesn't litter src/ with .js outputs
Gates: typecheck ✓ lint ✓ test 109/109 across 11 files ✓ build ✓
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -5,3 +5,9 @@ export {
|
||||
type PdfSpikeViewerProps,
|
||||
type StoredAnnotation,
|
||||
} from "./pdf-viewer-adapter-spike";
|
||||
export {
|
||||
createSelectors,
|
||||
resolveSelectors,
|
||||
DEFAULT_CONTEXT_CHARS,
|
||||
type CreateSelectorsOptions,
|
||||
} from "./selectors";
|
||||
|
||||
136
src/anchor/selectors/create.test.ts
Normal file
136
src/anchor/selectors/create.test.ts
Normal file
@@ -0,0 +1,136 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import type { DocumentRepresentation } from "@shared/document";
|
||||
import type { DocumentId, RepresentationId } from "@shared/ids";
|
||||
import type {
|
||||
PdfPageTextSelector,
|
||||
PdfRectSelector,
|
||||
TextPositionSelector,
|
||||
TextQuoteSelector,
|
||||
} from "@shared/selector";
|
||||
import { createSelectors } from "./create";
|
||||
import type { PdfSelectionCapture } from "../types";
|
||||
|
||||
/**
 * Test fixture: wraps `canonicalText` in a one-page `pdf-text`
 * representation whose single offset range covers the whole text.
 */
function repr(canonicalText: string): DocumentRepresentation {
  const totalChars = canonicalText.length;
  // One page spanning [0, totalChars); pageLength equals the text length.
  const wholeTextRange = {
    page: 1,
    globalStart: 0,
    globalEnd: totalChars,
    pageLength: totalChars,
  };
  const representation: DocumentRepresentation = {
    id: "rep_test" as RepresentationId,
    documentId: "doc_test" as DocumentId,
    representationType: "pdf-text",
    contentHash: "test",
    canonicalText,
    pageMap: [{ page: 1, width: 595, height: 842 }],
    offsetMap: [wholeTextRange],
    generatedAt: "2026-05-25T00:00:00.000Z",
  };
  return representation;
}
|
||||
|
||||
/**
 * Test fixture: fabricates a PDF selection capture for `text` on `page`
 * with `rectsCount` stacked line rects (each 0.05 lower than the previous)
 * and a bounding rect whose height is 0.04 per line.
 */
function capture(text: string, page = 1, rectsCount = 1): PdfSelectionCapture {
  const lineRect = (_: unknown, lineIndex: number) => ({
    x: 0.1,
    y: 0.2 + lineIndex * 0.05,
    width: 0.5,
    height: 0.04,
  });
  const rects = Array.from({ length: rectsCount }).map(lineRect);
  const boundingRect = { x: 0.1, y: 0.2, width: 0.5, height: 0.04 * rectsCount };
  return { kind: "pdf", text, page, rects, boundingRect };
}
|
||||
|
||||
describe("createSelectors", () => {
|
||||
const text = "The quick brown fox jumps over the lazy dog near the river bank.";
|
||||
const representation = repr(text);
|
||||
|
||||
it("always includes a TextQuoteSelector with prefix and suffix from canonical text", () => {
|
||||
const sels = createSelectors(capture("brown fox"), representation);
|
||||
const quote = sels.find((s): s is TextQuoteSelector => s.type === "TextQuoteSelector");
|
||||
expect(quote).toBeDefined();
|
||||
expect(quote!.exact).toBe("brown fox");
|
||||
expect(quote!.prefix).toBe("The quick ");
|
||||
expect(quote!.suffix).toBe(" jumps over the lazy dog near th");
|
||||
});
|
||||
|
||||
it("includes a TextPositionSelector pointing at the matched offset", () => {
|
||||
const sels = createSelectors(capture("brown fox"), representation);
|
||||
const pos = sels.find((s): s is TextPositionSelector => s.type === "TextPositionSelector");
|
||||
expect(pos).toBeDefined();
|
||||
expect(pos!.start).toBe(text.indexOf("brown fox"));
|
||||
expect(pos!.end).toBe(text.indexOf("brown fox") + "brown fox".length);
|
||||
});
|
||||
|
||||
it("includes a PdfRectSelector mirroring the capture's page and rects", () => {
|
||||
const c = capture("brown fox", 1, 2);
|
||||
const sels = createSelectors(c, representation);
|
||||
const rect = sels.find((s): s is PdfRectSelector => s.type === "PdfRectSelector");
|
||||
expect(rect).toBeDefined();
|
||||
expect(rect!.page).toBe(1);
|
||||
expect(rect!.rects).toEqual(c.rects);
|
||||
});
|
||||
|
||||
it("includes a PdfPageTextSelector when the match falls inside the capture's page range", () => {
|
||||
const sels = createSelectors(capture("brown fox"), representation);
|
||||
const pageText = sels.find((s): s is PdfPageTextSelector => s.type === "PdfPageTextSelector");
|
||||
expect(pageText).toBeDefined();
|
||||
expect(pageText!.page).toBe(1);
|
||||
expect(pageText!.start).toBe(text.indexOf("brown fox"));
|
||||
});
|
||||
|
||||
it("omits the TextPositionSelector when the quote cannot be found in canonical text", () => {
|
||||
const sels = createSelectors(capture("nonexistent phrase"), representation);
|
||||
const pos = sels.find((s) => s.type === "TextPositionSelector");
|
||||
expect(pos).toBeUndefined();
|
||||
const quote = sels.find((s): s is TextQuoteSelector => s.type === "TextQuoteSelector");
|
||||
expect(quote!.exact).toBe("nonexistent phrase");
|
||||
expect(quote!.prefix).toBeUndefined();
|
||||
expect(quote!.suffix).toBeUndefined();
|
||||
});
|
||||
|
||||
it("clamps prefix at the start of the canonical text", () => {
|
||||
const sels = createSelectors(capture("The quick"), representation);
|
||||
const quote = sels.find((s): s is TextQuoteSelector => s.type === "TextQuoteSelector")!;
|
||||
expect(quote.prefix).toBeUndefined();
|
||||
expect(quote.suffix).toBe(" brown fox jumps over the lazy d");
|
||||
});
|
||||
|
||||
it("clamps suffix at the end of the canonical text", () => {
|
||||
const sels = createSelectors(capture("river bank."), representation);
|
||||
const quote = sels.find((s): s is TextQuoteSelector => s.type === "TextQuoteSelector")!;
|
||||
expect(quote.prefix).toBe("umps over the lazy dog near the ");
|
||||
expect(quote.suffix).toBeUndefined();
|
||||
});
|
||||
|
||||
it("honors a custom contextChars option", () => {
|
||||
const sels = createSelectors(capture("brown fox"), representation, { contextChars: 4 });
|
||||
const quote = sels.find((s): s is TextQuoteSelector => s.type === "TextQuoteSelector")!;
|
||||
expect(quote.prefix).toBe("ick ");
|
||||
expect(quote.suffix).toBe(" jum");
|
||||
});
|
||||
|
||||
it("prefers the on-page match when the quote appears on multiple pages", () => {
|
||||
// Two-page representation where the quote appears once per page.
|
||||
const canonical = "alpha echo bravo" + "\n\n" + "charlie echo delta";
|
||||
const rep: DocumentRepresentation = {
|
||||
id: "rep_multi" as RepresentationId,
|
||||
documentId: "doc_multi" as DocumentId,
|
||||
representationType: "pdf-text",
|
||||
contentHash: "h",
|
||||
canonicalText: canonical,
|
||||
pageMap: [
|
||||
{ page: 1, width: 100, height: 100 },
|
||||
{ page: 2, width: 100, height: 100 },
|
||||
],
|
||||
offsetMap: [
|
||||
{ page: 1, globalStart: 0, globalEnd: 18, pageLength: 18 },
|
||||
{ page: 2, globalStart: 18, globalEnd: canonical.length, pageLength: canonical.length - 18 },
|
||||
],
|
||||
generatedAt: "2026-05-25T00:00:00.000Z",
|
||||
};
|
||||
const sels = createSelectors(capture("echo", 2), rep);
|
||||
const pos = sels.find((s): s is TextPositionSelector => s.type === "TextPositionSelector")!;
|
||||
expect(pos.start).toBe(canonical.indexOf("echo", 18));
|
||||
});
|
||||
});
|
||||
157
src/anchor/selectors/create.ts
Normal file
157
src/anchor/selectors/create.ts
Normal file
@@ -0,0 +1,157 @@
|
||||
/**
|
||||
* Build the maximal `Selector[]` from a viewer's `SelectionCapture`.
|
||||
*
|
||||
* Implements the "always store all selector types that are available" rule
|
||||
* from `wiki/SharedContracts.md` §3 (selector redundancy) and the create
|
||||
* half of the `AnchorAdapter` contract in
|
||||
* `wiki/ArchitectureOverview.md` §3.3.
|
||||
*
|
||||
* Output guarantee: every returned `Selector[]` includes a
* `TextQuoteSelector` whenever the normalized selection text is non-empty,
* and adds `TextPositionSelector`, `PdfRectSelector`, `PdfPageTextSelector`
* only when the underlying data actually supports them. Resolvers can rely
* on the union being trimmed — a missing selector means "not available",
* not "skipped".
|
||||
*/
|
||||
|
||||
import type { DocumentRepresentation } from "@shared/document";
|
||||
import { normalize } from "@shared/text/normalize";
|
||||
import type {
|
||||
PdfPageTextSelector,
|
||||
PdfRectSelector,
|
||||
Selector,
|
||||
TextPositionSelector,
|
||||
TextQuoteSelector,
|
||||
} from "@shared/selector";
|
||||
|
||||
import type { PdfSelectionCapture, SelectionCapture } from "../types";
|
||||
|
||||
/** Default characters of prefix/suffix context stored on TextQuoteSelector. */
|
||||
export const DEFAULT_CONTEXT_CHARS = 32;
|
||||
|
||||
export interface CreateSelectorsOptions {
|
||||
readonly contextChars?: number;
|
||||
}
|
||||
|
||||
/**
 * Public entry point: turn a viewer selection into the redundant selector
 * set for `representation`.
 *
 * @param capture - The selection reported by the viewer.
 * @param representation - The document representation the selection was made against.
 * @param options - Optional tuning (e.g. `contextChars` for prefix/suffix length).
 * @returns The selectors produced by the PDF capture path.
 */
export function createSelectors(
  capture: SelectionCapture,
  representation: DocumentRepresentation,
  options: CreateSelectorsOptions = {},
): Selector[] {
  // `SelectionCapture` is a discriminated union whose DOM branch is `never`
  // in MVP, so every capture reaching this point is a PDF capture.
  const pdfCapture: PdfSelectionCapture = capture;
  const selectors = createSelectorsFromPdfCapture(pdfCapture, representation, options);
  return selectors;
}
|
||||
|
||||
/**
 * Build every selector a PDF capture can support against `representation`.
 *
 * Selectors are emitted in this order, each only when its data exists:
 *  1. `TextQuoteSelector` — when the normalized selection text is non-empty;
 *     carries prefix/suffix context only if the quote was located in the
 *     canonical text.
 *  2. `TextPositionSelector` — when the quote was located in the canonical text.
 *  3. `PdfRectSelector` — when the capture carries at least one rect.
 *  4. `PdfPageTextSelector` — when the located match lies entirely inside the
 *     offset range of the capture's page.
 *
 * @param capture - The PDF selection (text, page, rects).
 * @param representation - Representation providing `canonicalText` and `offsetMap`.
 * @param options - `contextChars` overrides `DEFAULT_CONTEXT_CHARS`.
 * @returns The selectors, possibly empty when the capture has neither text nor rects.
 */
function createSelectorsFromPdfCapture(
  capture: PdfSelectionCapture,
  representation: DocumentRepresentation,
  options: CreateSelectorsOptions,
): Selector[] {
  const contextChars = options.contextChars ?? DEFAULT_CONTEXT_CHARS;
  const normalizedQuote = normalize(capture.text).text;
  const out: Selector[] = [];

  const canonicalText = representation.canonicalText ?? "";
  const positions = canonicalText.length > 0 && normalizedQuote.length > 0
    ? findAllOccurrences(canonicalText, normalizedQuote)
    : [];

  // Prefer the occurrence that starts on the capture's page (when the
  // offsetMap knows that page); otherwise fall back to the first occurrence.
  // With no occurrence at all we still emit a quote-only TextQuoteSelector so
  // the annotation is recoverable later if the representation is rebuilt.
  const pageRange = representation.offsetMap?.find((r) => r.page === capture.page);
  const matchOffset = pickMatch(positions, pageRange);

  // 1. TextQuoteSelector — emitted whenever there is any selected text.
  if (normalizedQuote.length > 0) {
    const quote = matchOffset !== null
      ? buildQuoteSelectorWithContext(canonicalText, matchOffset, normalizedQuote, contextChars)
      : ({ type: "TextQuoteSelector", exact: normalizedQuote } satisfies TextQuoteSelector);
    out.push(quote);
  }

  // 2. TextPositionSelector — whenever the quote was located. The chosen
  //    occurrence is not guaranteed unique; the quote's prefix/suffix carry
  //    the disambiguating context.
  if (matchOffset !== null) {
    const pos: TextPositionSelector = {
      type: "TextPositionSelector",
      start: matchOffset,
      end: matchOffset + normalizedQuote.length,
    };
    out.push(pos);
  }

  // 3. PdfRectSelector — straight from the capture; viewer-coordinate truth.
  //    The array is copied so later mutation of the capture's `rects` array
  //    cannot change the selector after it has been stored.
  if (capture.rects.length > 0) {
    const rect: PdfRectSelector = {
      type: "PdfRectSelector",
      page: capture.page,
      rects: [...capture.rects],
    };
    out.push(rect);
  }

  // 4. PdfPageTextSelector — only when the located match lies entirely within
  //    the capture page's global range; offsets are stored page-local.
  if (matchOffset !== null && pageRange) {
    const matchEnd = matchOffset + normalizedQuote.length;
    if (matchOffset >= pageRange.globalStart && matchEnd <= pageRange.globalEnd) {
      const pageText: PdfPageTextSelector = {
        type: "PdfPageTextSelector",
        page: capture.page,
        start: matchOffset - pageRange.globalStart,
        end: matchEnd - pageRange.globalStart,
      };
      out.push(pageText);
    }
  }

  return out;
}
|
||||
|
||||
/**
 * Collect the start index of every occurrence of `needle` in `haystack`,
 * in ascending order. Overlapping occurrences are included because the
 * search resumes one character after each hit. An empty needle yields `[]`.
 */
function findAllOccurrences(haystack: string, needle: string): number[] {
  const hits: number[] = [];
  if (needle.length === 0) return hits;
  let cursor = haystack.indexOf(needle, 0);
  while (cursor !== -1) {
    hits.push(cursor);
    cursor = haystack.indexOf(needle, cursor + 1);
  }
  return hits;
}
|
||||
|
||||
/**
 * Choose which occurrence of the quote to anchor on.
 *
 * @param positions - Start offsets of every occurrence, in ascending order.
 * @param pageRange - Global offset range of the capture's page, if known.
 * @returns `null` when there are no occurrences; otherwise, when several
 *   occurrences exist and a page range is given, the first one that starts
 *   inside that range; in every other case the first occurrence.
 */
function pickMatch(
  positions: readonly number[],
  pageRange: { globalStart: number; globalEnd: number } | undefined,
): number | null {
  if (positions.length === 0) return null;
  const firstOccurrence = positions[0]!;

  // The page hint only matters when there is something to choose between.
  if (positions.length > 1 && pageRange) {
    for (const candidate of positions) {
      const startsOnPage = candidate >= pageRange.globalStart && candidate < pageRange.globalEnd;
      if (startsOnPage) return candidate;
    }
  }

  // Single occurrence, no page hint, or nothing on the page: take the first
  // occurrence; the resolver relies on prefix/suffix to disambiguate.
  return firstOccurrence;
}
|
||||
|
||||
/**
 * Build a `TextQuoteSelector` for the occurrence of `exact` at `matchOffset`,
 * attaching up to `contextChars` characters of surrounding canonical text.
 *
 * `contextChars` is sanitized before use: NaN is treated as 0, negative
 * values are clamped to 0, and fractional values are floored. Without this,
 * NaN would capture the whole preceding document as the prefix, a negative
 * value could produce a bogus suffix (a negative `slice` end index counts
 * from the end of the string), and a fraction would give prefix and suffix
 * different lengths.
 *
 * @param canonicalText - Full canonical text of the representation.
 * @param matchOffset - Start offset of the matched quote in `canonicalText`.
 * @param exact - The matched (normalized) quote text.
 * @param contextChars - Maximum characters of prefix and of suffix to store.
 * @returns The selector; `prefix` / `suffix` are omitted when they would be
 *   empty (e.g. the match touches the start or end of the text).
 */
function buildQuoteSelectorWithContext(
  canonicalText: string,
  matchOffset: number,
  exact: string,
  contextChars: number,
): TextQuoteSelector {
  const span = Number.isNaN(contextChars) ? 0 : Math.max(0, Math.floor(contextChars));
  const matchEnd = matchOffset + exact.length;
  // Clamp both windows to the bounds of the canonical text.
  const prefixStart = Math.max(0, matchOffset - span);
  const suffixEnd = Math.min(canonicalText.length, matchEnd + span);
  const prefix = canonicalText.slice(prefixStart, matchOffset);
  const suffix = canonicalText.slice(matchEnd, suffixEnd);
  return {
    type: "TextQuoteSelector",
    exact,
    ...(prefix.length > 0 ? { prefix } : {}),
    ...(suffix.length > 0 ? { suffix } : {}),
  };
}
|
||||
6
src/anchor/selectors/index.ts
Normal file
6
src/anchor/selectors/index.ts
Normal file
@@ -0,0 +1,6 @@
|
||||
export {
|
||||
createSelectors,
|
||||
DEFAULT_CONTEXT_CHARS,
|
||||
type CreateSelectorsOptions,
|
||||
} from "./create";
|
||||
export { resolveSelectors } from "./resolve";
|
||||
137
src/anchor/selectors/resolve.test.ts
Normal file
137
src/anchor/selectors/resolve.test.ts
Normal file
@@ -0,0 +1,137 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import type { DocumentRepresentation } from "@shared/document";
|
||||
import type { DocumentId, RepresentationId } from "@shared/ids";
|
||||
import type { Selector } from "@shared/selector";
|
||||
import { resolveSelectors } from "./resolve";
|
||||
|
||||
/**
 * Test fixture: wraps `canonicalText` in a `pdf-text` representation split
 * into `pages` consecutive offset ranges. Every page but the last gets
 * `floor(length / pages)` characters; the last page absorbs the remainder.
 */
function repr(canonicalText: string, pages = 1): DocumentRepresentation {
  const totalChars = canonicalText.length;
  const charsPerPage = pages === 1 ? totalChars : Math.floor(totalChars / pages);

  const offsetMap = [];
  let pageIndex = 0;
  while (pageIndex < pages) {
    const globalStart = pageIndex * charsPerPage;
    const isLastPage = pageIndex === pages - 1;
    const globalEnd = isLastPage ? totalChars : globalStart + charsPerPage;
    offsetMap.push({
      page: pageIndex + 1,
      globalStart,
      globalEnd,
      pageLength: globalEnd - globalStart,
    });
    pageIndex += 1;
  }

  const pageMap = Array.from({ length: pages }).map((_, i) => ({
    page: i + 1,
    width: 595,
    height: 842,
  }));

  return {
    id: "rep_test" as RepresentationId,
    documentId: "doc_test" as DocumentId,
    representationType: "pdf-text",
    contentHash: "test",
    canonicalText,
    pageMap,
    offsetMap,
    generatedAt: "2026-05-25T00:00:00.000Z",
  };
}
|
||||
|
||||
describe("resolveSelectors", () => {
|
||||
const text = "The quick brown fox jumps over the lazy dog.";
|
||||
const representation = repr(text);
|
||||
const brownFoxStart = text.indexOf("brown fox");
|
||||
const brownFoxEnd = brownFoxStart + "brown fox".length;
|
||||
|
||||
it("returns 1.0 confidence when position and quote agree exactly", () => {
|
||||
const selectors: Selector[] = [
|
||||
{ type: "TextPositionSelector", start: brownFoxStart, end: brownFoxEnd },
|
||||
{ type: "TextQuoteSelector", exact: "brown fox" },
|
||||
];
|
||||
const r = resolveSelectors(selectors, representation);
|
||||
expect(r.status).toBe("resolved");
|
||||
expect(r.confidence).toBe(1.0);
|
||||
expect(r.candidates[0]?.textPosition).toEqual({ start: brownFoxStart, end: brownFoxEnd });
|
||||
expect(r.candidates[0]?.page).toBe(1);
|
||||
expect(r.usedSelectorTypes).toEqual(["TextPositionSelector", "TextQuoteSelector"]);
|
||||
});
|
||||
|
||||
it("falls back to quote search when position is stale, and records a warning", () => {
|
||||
const selectors: Selector[] = [
|
||||
{ type: "TextPositionSelector", start: 0, end: 9 }, // "The quick"
|
||||
{ type: "TextQuoteSelector", exact: "brown fox" },
|
||||
];
|
||||
const r = resolveSelectors(selectors, representation);
|
||||
expect(r.status).toBe("resolved");
|
||||
expect(r.confidence).toBe(0.95);
|
||||
expect(r.candidates[0]?.textPosition).toEqual({ start: brownFoxStart, end: brownFoxEnd });
|
||||
expect(r.warnings?.[0]).toMatch(/did not match/);
|
||||
expect(r.usedSelectorTypes).toEqual(["TextQuoteSelector"]);
|
||||
});
|
||||
|
||||
it("returns 0.85 for a position-only selector with no quote to verify", () => {
|
||||
const selectors: Selector[] = [
|
||||
{ type: "TextPositionSelector", start: brownFoxStart, end: brownFoxEnd },
|
||||
];
|
||||
const r = resolveSelectors(selectors, representation);
|
||||
expect(r.status).toBe("resolved");
|
||||
expect(r.confidence).toBe(0.85);
|
||||
});
|
||||
|
||||
it("returns 0.95 when only TextQuoteSelector is present and the quote is unique", () => {
|
||||
const r = resolveSelectors(
|
||||
[{ type: "TextQuoteSelector", exact: "brown fox" }],
|
||||
representation,
|
||||
);
|
||||
expect(r.status).toBe("resolved");
|
||||
expect(r.confidence).toBe(0.95);
|
||||
});
|
||||
|
||||
it("returns 0.9 when a duplicated quote is disambiguated by prefix/suffix", () => {
|
||||
const dup = "alpha echo bravo charlie echo delta";
|
||||
const r = resolveSelectors(
|
||||
[{ type: "TextQuoteSelector", exact: "echo", prefix: "charlie ", suffix: " delta" }],
|
||||
repr(dup),
|
||||
);
|
||||
expect(r.status).toBe("resolved");
|
||||
expect(r.confidence).toBe(0.9);
|
||||
expect(r.candidates[0]?.textPosition?.start).toBe(dup.indexOf("echo", 10));
|
||||
});
|
||||
|
||||
it("returns ambiguous when a duplicated quote cannot be disambiguated", () => {
|
||||
const dup = "echo and echo";
|
||||
const r = resolveSelectors(
|
||||
[{ type: "TextQuoteSelector", exact: "echo" }],
|
||||
repr(dup),
|
||||
);
|
||||
expect(r.status).toBe("ambiguous");
|
||||
expect(r.confidence).toBe(0.5);
|
||||
});
|
||||
|
||||
it("falls back to PdfPageTextSelector via the OffsetMap", () => {
|
||||
// Single page, "brown fox" at offset 10..19.
|
||||
const r = resolveSelectors(
|
||||
[{ type: "PdfPageTextSelector", page: 1, start: brownFoxStart, end: brownFoxEnd }],
|
||||
representation,
|
||||
);
|
||||
expect(r.status).toBe("resolved");
|
||||
expect(r.confidence).toBe(0.8);
|
||||
expect(r.candidates[0]?.textPosition).toEqual({ start: brownFoxStart, end: brownFoxEnd });
|
||||
expect(r.candidates[0]?.page).toBe(1);
|
||||
});
|
||||
|
||||
it("falls back to PdfRectSelector with page+rects only at 0.7 confidence", () => {
|
||||
const r = resolveSelectors(
|
||||
[{
|
||||
type: "PdfRectSelector",
|
||||
page: 2,
|
||||
rects: [{ x: 0.1, y: 0.2, width: 0.3, height: 0.04 }],
|
||||
}],
|
||||
repr(text, 1),
|
||||
);
|
||||
expect(r.status).toBe("resolved");
|
||||
expect(r.confidence).toBe(0.7);
|
||||
expect(r.candidates[0]?.page).toBe(2);
|
||||
expect(r.candidates[0]?.textPosition).toBeUndefined();
|
||||
expect(r.candidates[0]?.rects).toHaveLength(1);
|
||||
});
|
||||
|
||||
it("returns unresolved when nothing matches", () => {
|
||||
const r = resolveSelectors(
|
||||
[{ type: "TextQuoteSelector", exact: "missing string" }],
|
||||
representation,
|
||||
);
|
||||
expect(r.status).toBe("unresolved");
|
||||
expect(r.confidence).toBe(0);
|
||||
expect(r.candidates).toEqual([]);
|
||||
});
|
||||
});
|
||||
260
src/anchor/selectors/resolve.ts
Normal file
260
src/anchor/selectors/resolve.ts
Normal file
@@ -0,0 +1,260 @@
|
||||
/**
|
||||
* Resolve a `Selector[]` against a `DocumentRepresentation`.
|
||||
*
|
||||
* Implements the resolution strategy from `wiki/ArchitectureOverview.md` §7,
|
||||
* MVP-trimmed:
|
||||
*
|
||||
* 1. Try `TextPositionSelector` (cheapest — direct slice).
|
||||
* 2. Verify with `TextQuoteSelector` at that position.
|
||||
* 3. Try `TextQuoteSelector` on its own. If multiple matches, disambiguate
|
||||
* by prefix/suffix.
|
||||
* 4. Try `PdfPageTextSelector` (page-local offsets through the OffsetMap).
|
||||
* 5. Fall back to `PdfRectSelector` for a page+rects-only target.
|
||||
* 6. Return `unresolved` if nothing above succeeds.
|
||||
*
|
||||
* Fuzzy matching is out of scope here; a later workplan owns it.
|
||||
*
|
||||
* Confidence ladder (0..1):
* 1.00 — TextPosition + TextQuote agree exactly
* 0.95 — TextQuote unique match (no position to cross-check, or the position was stale)
* 0.90 — TextQuote disambiguated by prefix/suffix
* 0.85 — TextPosition only (no quote to cross-check)
* 0.80 — PdfPageTextSelector resolved via OffsetMap
* 0.70 — PdfRectSelector only (page+rects, no text verification)
* 0.50 — TextQuote matched in several places and could not be disambiguated (status `ambiguous`)
* 0.00 — nothing resolved (status `unresolved`)
|
||||
*/
|
||||
|
||||
import type { DocumentRepresentation } from "@shared/document";
|
||||
import type {
|
||||
PdfPageTextSelector,
|
||||
PdfRectSelector,
|
||||
Selector,
|
||||
SelectorType,
|
||||
TextPositionSelector,
|
||||
TextQuoteSelector,
|
||||
} from "@shared/selector";
|
||||
|
||||
import type { AnchorResolution, ResolvedAnchorTarget } from "../types";
|
||||
|
||||
/**
 * Resolve a stored selector set against `representation`, walking the
 * fallback ladder from the most to the least trustworthy selector.
 *
 * @param selectors - The selectors stored with an annotation.
 * @param representation - The representation to resolve against.
 * @returns A resolution whose status is `resolved`, `ambiguous` or
 *   `unresolved`, with the confidence of the rung that produced it and the
 *   selector types that were actually used.
 */
export function resolveSelectors(
  selectors: readonly Selector[],
  representation: DocumentRepresentation,
): AnchorResolution {
  const text = representation.canonicalText ?? "";
  const pageRanges = representation.offsetMap ?? [];
  const representationId = representation.id;
  const hasText = text.length > 0;

  const {
    TextPositionSelector: position,
    TextQuoteSelector: quote,
    PdfPageTextSelector: pageText,
    PdfRectSelector: rect,
  } = indexByType(selectors);

  const used: SelectorType[] = [];
  const warnings: string[] = [];

  // Target for a global text span, tagged with its page when the span fits
  // inside a single offset range.
  const textTarget = (start: number, end: number): ResolvedAnchorTarget => ({
    representationId,
    textPosition: { start, end },
    ...pageFor({ start, end }, pageRanges),
  });

  // Rungs 1 & 2: direct slice at the stored position, verified by the quote.
  if (position && hasText) {
    const slice = sliceSafely(text, position.start, position.end);
    if (slice !== null) {
      if (!quote) {
        // Nothing to verify against — accept the position at lower confidence.
        used.push("TextPositionSelector");
        return resolved(textTarget(position.start, position.end), 0.85, used, warnings);
      }
      if (slice === quote.exact) {
        used.push("TextPositionSelector", "TextQuoteSelector");
        return resolved(textTarget(position.start, position.end), 1.0, used, warnings);
      }
      // Stale position: remember why, then let the quote search take over.
      warnings.push(
        "TextPositionSelector slice did not match TextQuoteSelector.exact; falling back to quote search.",
      );
    }
  }

  // Rung 3: search for the quote itself (also the fallback for a stale position).
  if (quote && hasText) {
    const hit = resolveByQuote(text, quote);
    if (hit) {
      used.push("TextQuoteSelector");
      const hitEnd = hit.offset + quote.exact.length;
      return resolved(textTarget(hit.offset, hitEnd), hit.confidence, used, warnings, hit.status);
    }
  }

  // Rung 4: page-local offsets translated to global ones through the OffsetMap.
  if (pageText && pageRanges.length > 0) {
    const range = pageRanges.find((r) => r.page === pageText.page);
    if (range && pageText.start >= 0 && pageText.end <= range.pageLength && pageText.start < pageText.end) {
      used.push("PdfPageTextSelector");
      const target: ResolvedAnchorTarget = {
        representationId,
        page: pageText.page,
        textPosition: {
          start: range.globalStart + pageText.start,
          end: range.globalStart + pageText.end,
        },
      };
      return resolved(target, 0.8, used, warnings);
    }
  }

  // Rung 5: geometry only — no text verification is possible.
  if (rect) {
    used.push("PdfRectSelector");
    return resolved({ representationId, page: rect.page, rects: rect.rects }, 0.7, used, warnings);
  }

  return unresolved(warnings);
}
|
||||
|
||||
interface QuoteResolutionResult {
|
||||
readonly offset: number;
|
||||
readonly confidence: number;
|
||||
readonly status: "resolved" | "ambiguous";
|
||||
}
|
||||
|
||||
/**
 * Locate `quote.exact` in `canonicalText`.
 *
 * @returns `null` when the text does not occur. Otherwise:
 *   - exactly one occurrence → that offset, confidence 0.95, `resolved`;
 *   - several occurrences, exactly one consistent with the stored
 *     prefix/suffix → that offset, confidence 0.9, `resolved`;
 *   - anything else → confidence 0.5, `ambiguous`, pointing at the first
 *     context-consistent occurrence, or the first occurrence overall when
 *     none is consistent.
 */
function resolveByQuote(canonicalText: string, quote: TextQuoteSelector): QuoteResolutionResult | null {
  const occurrences = findAllOccurrences(canonicalText, quote.exact);
  if (occurrences.length === 0) return null;
  if (occurrences.length === 1) {
    return { offset: occurrences[0]!, confidence: 0.95, status: "resolved" };
  }

  // Several occurrences: keep those whose surroundings match the stored context.
  const consistent = occurrences.filter((at) => prefixSuffixMatches(canonicalText, at, quote));
  if (consistent.length === 1) {
    return { offset: consistent[0]!, confidence: 0.9, status: "resolved" };
  }

  // Either the context could not narrow the field (including the case where
  // no prefix/suffix was stored) or it ruled every occurrence out.
  const bestGuess = consistent.length > 1 ? consistent[0]! : occurrences[0]!;
  return { offset: bestGuess, confidence: 0.5, status: "ambiguous" };
}
|
||||
|
||||
/**
 * Check whether the occurrence of `quote.exact` at `offset` is surrounded by
 * the selector's stored context. A side with no stored context (`undefined`)
 * is not checked; a stored context longer than the text available on that
 * side does not match.
 */
function prefixSuffixMatches(
  canonicalText: string,
  offset: number,
  quote: TextQuoteSelector,
): boolean {
  const { prefix, suffix, exact } = quote;

  const prefixAgrees = prefix === undefined
    || canonicalText.slice(Math.max(0, offset - prefix.length), offset).endsWith(prefix);
  if (!prefixAgrees) return false;

  if (suffix === undefined) return true;
  const afterMatch = offset + exact.length;
  const windowEnd = Math.min(canonicalText.length, afterMatch + suffix.length);
  return canonicalText.slice(afterMatch, windowEnd).startsWith(suffix);
}
|
||||
|
||||
interface SelectorIndex {
|
||||
TextQuoteSelector?: TextQuoteSelector;
|
||||
TextPositionSelector?: TextPositionSelector;
|
||||
PdfRectSelector?: PdfRectSelector;
|
||||
PdfPageTextSelector?: PdfPageTextSelector;
|
||||
}
|
||||
|
||||
/**
 * Bucket selectors by their `type` tag for direct lookup. When the input
 * contains several selectors of the same type, the last one wins; selectors
 * of any other type are ignored.
 */
function indexByType(selectors: readonly Selector[]): SelectorIndex {
  const index: SelectorIndex = {};
  selectors.forEach((selector) => {
    if (selector.type === "TextQuoteSelector") {
      index.TextQuoteSelector = selector;
    } else if (selector.type === "TextPositionSelector") {
      index.TextPositionSelector = selector;
    } else if (selector.type === "PdfRectSelector") {
      index.PdfRectSelector = selector;
    } else if (selector.type === "PdfPageTextSelector") {
      index.PdfPageTextSelector = selector;
    }
  });
  return index;
}
|
||||
|
||||
/**
 * Return `text.slice(start, end)` only when `[start, end)` is a valid,
 * non-empty span of `text`; otherwise return `null`.
 *
 * Offsets must be integers. NaN or fractional values are rejected up front:
 * NaN slips through ordinary range comparisons (they all evaluate to false)
 * and `String.prototype.slice` would silently coerce such values, yielding a
 * slice that does not correspond to the stored position.
 *
 * @param text - The text to slice.
 * @param start - Inclusive start offset.
 * @param end - Exclusive end offset.
 * @returns The slice, or `null` when the span is malformed or out of range.
 */
function sliceSafely(text: string, start: number, end: number): string | null {
  if (!Number.isInteger(start) || !Number.isInteger(end)) return null;
  if (start < 0 || end > text.length || start >= end) return null;
  return text.slice(start, end);
}
|
||||
|
||||
/**
 * Find the page whose global offset range fully contains `span`.
 *
 * @returns `{ page }` for the first containing range, or `{}` when the map is
 *   empty or no single range contains the span (e.g. it crosses a page
 *   boundary). The result is shaped for spreading into a target object.
 */
function pageFor(
  span: { start: number; end: number },
  offsetMap: readonly { page: number; globalStart: number; globalEnd: number }[],
): { page?: number } {
  for (const range of offsetMap) {
    const containsSpan = span.start >= range.globalStart && span.end <= range.globalEnd;
    if (containsSpan) return { page: range.page };
  }
  return {};
}
|
||||
|
||||
/**
 * List the start offsets of all occurrences of `needle` within `haystack`,
 * ascending. The scan advances by one character after each hit, so
 * overlapping occurrences are reported. An empty needle matches nothing.
 */
function findAllOccurrences(haystack: string, needle: string): number[] {
  const offsets: number[] = [];
  if (needle.length === 0) return offsets;
  for (
    let at = haystack.indexOf(needle, 0);
    at !== -1;
    at = haystack.indexOf(needle, at + 1)
  ) {
    offsets.push(at);
  }
  return offsets;
}
|
||||
|
||||
/**
 * Assemble a single-candidate `AnchorResolution`.
 *
 * @param target - The resolved target; becomes the only candidate.
 * @param confidence - Confidence of the rung that produced the target.
 * @param used - Selector types that contributed to the result.
 * @param warnings - Notes gathered on the way; the `warnings` property is
 *   omitted entirely when this list is empty.
 * @param status - `"resolved"` by default; `"ambiguous"` for a best guess.
 */
function resolved(
  target: ResolvedAnchorTarget,
  confidence: number,
  used: readonly SelectorType[],
  warnings: readonly string[],
  status: "resolved" | "ambiguous" = "resolved",
): AnchorResolution {
  const outcome: AnchorResolution = {
    status,
    confidence,
    candidates: [target],
    usedSelectorTypes: used,
  };
  if (warnings.length === 0) return outcome;
  return { ...outcome, warnings };
}
|
||||
|
||||
/**
 * Assemble the `AnchorResolution` for a selector set that matched nothing:
 * status `unresolved`, confidence 0, no candidates, no used selector types.
 * The `warnings` property is present only when `warnings` is non-empty.
 */
function unresolved(warnings: readonly string[]): AnchorResolution {
  const outcome: AnchorResolution = {
    status: "unresolved",
    confidence: 0,
    candidates: [],
    usedSelectorTypes: [],
  };
  if (warnings.length === 0) return outcome;
  return { ...outcome, warnings };
}
|
||||
Reference in New Issue
Block a user