{
  "_schema_version": 1,
  "_description": "PDF fixture corpus for citation-evidence selector tests. Each entry binds a stable id (used by test code) to a file path, page count, and a verbatim known-good quote with its 1-indexed physical PDF page number. The quote is short, unique within the document, and chosen to round-trip cleanly through the canonical text normalizer.",
  "_provenance": "Page counts and quotes extracted on 2026-05-24 by reading each PDF directly, then re-verified on 2026-05-25 against the PDF.js v4 text extractor used by src/source/pdf/extract.ts. The Betriebskosten file is a scanned/handwritten form with noisy OCR text — its known-good quote was updated 2026-05-25 from 'Ich bitte um Überweisung auf das Konto bei' to 'Auf der Rückseite finden Sie Ihre Abrechnung' because PDF.js drops the capital-Ü in the original (the lowercase-ü in 'Rückseite' survives, so the new quote still exercises the umlaut code path).",
  "fixtures": [
    {
      "id": "betriebskosten-2024",
      "filename": "031-Kemal Güldag Betriebskosten 2024.pdf",
      "description": "German Betriebskostenabrechnung (utility-cost statement) for a Seeheim apartment — scanned cover letter + filled-in Abrechnung form. OCR-noisy text and handwritten field values. Useful for stress-testing canonical normalization and selector resolution on imperfect extraction.",
      "page_count": 2,
      "known_good_quote": "Auf der Rückseite finden Sie Ihre Abrechnung",
      "known_good_quote_page": 1,
      "characteristics": ["german", "umlauts", "scanned", "ocr-noisy", "form", "handwritten"]
    },
    {
      "id": "brief-trennung-angebot",
      "filename": "061-260215-brief-trennungRoxanaAngebot_v1_final.pdf",
      "description": "German correspondence — three-page settlement proposal letter with bullet lists, embedded amounts, and a signature on the final page. Single-column prose; representative of typical legal/personal letters.",
      "page_count": 3,
      "known_good_quote": "Dieser Vorschlag ist befristet bis zum 31.03.2026",
      "known_good_quote_page": 3,
      "characteristics": ["german", "umlauts", "single-column", "prose", "multi-page", "bullet-lists"]
    },
    {
      "id": "sonderkosten",
      "filename": "063-26.01_Sonderkosten.pdf",
      "description": "German Sonderkosten ledger — tabular expense statement across multiple years and people. Three pages of column-heavy tables; representative of spreadsheet-exported PDFs where text extraction order is column-driven.",
      "page_count": 3,
      "known_good_quote": "Einzahlung Bernd 23.09.24",
      "known_good_quote_page": 2,
      "characteristics": ["german", "tables", "spreadsheet-export", "multi-column", "amounts"]
    },
    {
      "id": "vollstaendigkeitserklaerung-2024",
      "filename": "61595286_Vollständigkeitserklärung_2024.pdf",
      "description": "Single-page German Vollständigkeitserklärung tax-prep form (VLH). Dense form with checkboxes, labelled fields, and small-print legal text — a good test of selector creation on form-heavy layouts.",
      "page_count": 1,
      "known_good_quote": "Mitglied beim Lohnsteuerhilfeverein Vereinigte Lohnsteuerhilfe e.V.",
      "known_good_quote_page": 1,
      "characteristics": ["german", "umlauts", "form", "checkboxes", "single-page", "dense"]
    },
    {
      "id": "aufnahmeschein-naturfriedhof",
      "filename": "Aufnahmeschein Naturfriedhof.pdf",
      "description": "Three-page German admission form (Aufnahmeschein) for the Mühltal natural cemetery: fillable form (p1-2) plus an excerpt of the Friedhofsordnung statute (p3). Tests selectors that must distinguish form labels from underline-fields and prose.",
      "page_count": 3,
      "known_good_quote": "Mehrere Verpflichtete haften als Gesamtschuldner",
      "known_good_quote_page": 3,
      "characteristics": ["german", "umlauts", "form", "statute", "mixed-layout"]
    },
    {
      "id": "fristsetzung-bezifferung",
      "filename": "Fristsetzung zur Bezifferung GÜ an Gegenseite 3 Wochen.pdf",
      "description": "Single-page formal court letter from the Amtsgericht Darmstadt — header block, addressed block, a one-sentence ruling, and a signature block. Excellent for clean selector round-trip tests.",
      "page_count": 1,
      "known_good_quote": "wird der Antragsgegnerin eine Frist von 3 Wochen zur Bezifferung gesetzt",
      "known_good_quote_page": 1,
      "characteristics": ["german", "umlauts", "legal", "single-page", "clean-text"]
    },
    {
      "id": "zeugnisspruche-klasse-7-8",
      "filename": "Zeugnissprüche_Klasse_7_8.pdf",
      "description": "Long-form reference document (29 PDF pages): title page + 28 pages of curated quotes for German school-year reports, each with author, dates, and short biography. Multi-language inclusions (English, Spanish, Greek). Ideal for cross-page selector and heading-hierarchy tests.",
      "page_count": 29,
      "known_good_quote": "Der Friede der Welt beginnt in den Herzen der Menschen",
      "known_good_quote_page": 2,
      "characteristics": ["german", "umlauts", "long-form", "multi-language", "hierarchy", "structured"]
    }
  ]
}