import { describe, expect, it } from "vitest";
import type { DocumentRepresentation } from "@shared/document";
import type { DocumentId, RepresentationId } from "@shared/ids";
import type {
  PdfPageTextSelector,
  PdfRectSelector,
  TextPositionSelector,
  TextQuoteSelector,
} from "@shared/selector";
import { createSelectors } from "./create";
import type { PdfSelectionCapture } from "../types";

/** Height of each synthetic selection rect produced by {@link capture}. */
const RECT_HEIGHT = 0.04;
/** Vertical distance between the tops of consecutive synthetic rects. */
const RECT_PITCH = 0.05;

/**
 * Builds a single-page `pdf-text` representation for tests.
 *
 * The one page in `offsetMap` spans the entire canonical text, so global
 * offsets and page-local offsets coincide.
 *
 * @param canonicalText - Text stored as the representation's canonical text.
 * @returns A representation with fixed ids, hash and timestamp.
 */
function repr(canonicalText: string): DocumentRepresentation {
  const pageLength = canonicalText.length;
  return {
    id: "rep_test" as RepresentationId,
    documentId: "doc_test" as DocumentId,
    representationType: "pdf-text",
    contentHash: "test",
    canonicalText,
    pageMap: [{ page: 1, width: 595, height: 842 }],
    offsetMap: [
      { page: 1, globalStart: 0, globalEnd: pageLength, pageLength },
    ],
    generatedAt: "2026-05-25T00:00:00.000Z",
  };
}

/**
 * Builds a PDF selection capture with `rectsCount` vertically stacked rects.
 *
 * @param text - The selected text.
 * @param page - Page number the selection is reported on (defaults to 1).
 * @param rectsCount - Number of rects to generate (defaults to 1).
 * @returns A capture whose `boundingRect` encloses every generated rect.
 */
function capture(text: string, page = 1, rectsCount = 1): PdfSelectionCapture {
  // Rects start RECT_PITCH apart, so the enclosing box runs from the top of
  // the first rect to the bottom of the last one.
  const boundingHeight =
    rectsCount > 0 ? (rectsCount - 1) * RECT_PITCH + RECT_HEIGHT : 0;
  return {
    kind: "pdf",
    text,
    page,
    rects: Array.from({ length: rectsCount }, (_, i) => ({
      x: 0.1,
      y: 0.2 + i * RECT_PITCH,
      width: 0.5,
      height: RECT_HEIGHT,
    })),
    boundingRect: { x: 0.1, y: 0.2, width: 0.5, height: boundingHeight },
  };
}

describe("createSelectors", () => {
  const text = "The quick brown fox jumps over the lazy dog near the river bank.";
  const representation = repr(text);

  it("always includes a TextQuoteSelector with prefix and suffix from canonical text", () => {
    const sels = createSelectors(capture("brown fox"), representation);
    const quote = sels.find((s): s is TextQuoteSelector => s.type === "TextQuoteSelector");
    expect(quote).toBeDefined();
    expect(quote!.exact).toBe("brown fox");
    expect(quote!.prefix).toBe("The quick ");
    // The expected suffix is 32 characters long — presumably the default
    // context window when no `contextChars` option is passed.
    expect(quote!.suffix).toBe(" jumps over the lazy dog near th");
  });

  it("includes a TextPositionSelector pointing at the matched offset", () => {
    const sels = createSelectors(capture("brown fox"), representation);
    const pos = sels.find((s): s is TextPositionSelector => s.type === "TextPositionSelector");
    expect(pos).toBeDefined();
    expect(pos!.start).toBe(text.indexOf("brown fox"));
    expect(pos!.end).toBe(text.indexOf("brown fox") + "brown fox".length);
  });

  it("includes a PdfRectSelector mirroring the capture's page and rects", () => {
    const c = capture("brown fox", 1, 2);
    const sels = createSelectors(c, representation);
    const rect = sels.find((s): s is PdfRectSelector => s.type === "PdfRectSelector");
    expect(rect).toBeDefined();
    expect(rect!.page).toBe(1);
    expect(rect!.rects).toEqual(c.rects);
  });

  it("includes a PdfPageTextSelector when the match falls inside the capture's page range", () => {
    const sels = createSelectors(capture("brown fox"), representation);
    const pageText = sels.find((s): s is PdfPageTextSelector => s.type === "PdfPageTextSelector");
    expect(pageText).toBeDefined();
    expect(pageText!.page).toBe(1);
    // Page 1 starts at global offset 0 in this fixture, so the expected start
    // is the same number as the global index.
    expect(pageText!.start).toBe(text.indexOf("brown fox"));
  });

  it("omits the TextPositionSelector when the quote cannot be found in canonical text", () => {
    const sels = createSelectors(capture("nonexistent phrase"), representation);
    const pos = sels.find((s) => s.type === "TextPositionSelector");
    expect(pos).toBeUndefined();
    const quote = sels.find((s): s is TextQuoteSelector => s.type === "TextQuoteSelector");
    // Fail with an assertion (not a TypeError) if the quote selector is missing.
    expect(quote).toBeDefined();
    expect(quote!.exact).toBe("nonexistent phrase");
    expect(quote!.prefix).toBeUndefined();
    expect(quote!.suffix).toBeUndefined();
  });

  it("clamps prefix at the start of the canonical text", () => {
    const sels = createSelectors(capture("The quick"), representation);
    const quote = sels.find((s): s is TextQuoteSelector => s.type === "TextQuoteSelector")!;
    expect(quote.prefix).toBeUndefined();
    expect(quote.suffix).toBe(" brown fox jumps over the lazy d");
  });

  it("clamps suffix at the end of the canonical text", () => {
    const sels = createSelectors(capture("river bank."), representation);
    const quote = sels.find((s): s is TextQuoteSelector => s.type === "TextQuoteSelector")!;
    expect(quote.prefix).toBe("umps over the lazy dog near the ");
    expect(quote.suffix).toBeUndefined();
  });

  it("honors a custom contextChars option", () => {
    const sels = createSelectors(capture("brown fox"), representation, { contextChars: 4 });
    const quote = sels.find((s): s is TextQuoteSelector => s.type === "TextQuoteSelector")!;
    expect(quote.prefix).toBe("ick ");
    expect(quote.suffix).toBe(" jum");
  });

  it("prefers the on-page match when the quote appears on multiple pages", () => {
    // Two-page representation where the quote appears once per page.
    const pageOneText = "alpha echo bravo" + "\n\n";
    const canonical = pageOneText + "charlie echo delta";
    // Page 2 begins right after page 1's text (including its separator).
    const pageTwoStart = pageOneText.length;
    const rep: DocumentRepresentation = {
      id: "rep_multi" as RepresentationId,
      documentId: "doc_multi" as DocumentId,
      representationType: "pdf-text",
      contentHash: "h",
      canonicalText: canonical,
      pageMap: [
        { page: 1, width: 100, height: 100 },
        { page: 2, width: 100, height: 100 },
      ],
      offsetMap: [
        { page: 1, globalStart: 0, globalEnd: pageTwoStart, pageLength: pageTwoStart },
        {
          page: 2,
          globalStart: pageTwoStart,
          globalEnd: canonical.length,
          pageLength: canonical.length - pageTwoStart,
        },
      ],
      generatedAt: "2026-05-25T00:00:00.000Z",
    };
    // The capture is reported on page 2, so the page-2 occurrence must win
    // over the earlier page-1 occurrence.
    const sels = createSelectors(capture("echo", 2), rep);
    const pos = sels.find((s): s is TextPositionSelector => s.type === "TextPositionSelector")!;
    expect(pos.start).toBe(canonical.indexOf("echo", pageTwoStart));
  });
});