Implement CE-WP-0001 Foundations: TS scaffold, lint boundaries, normalize v1, fixtures

T01 Toolchain — vite + pnpm 9.15 + React 18 + strict TS (ADR-0001). T02 Folder layout — src/{shared,engine,anchor,source,binder,work,app}/ mirroring the future subsystem split, with path aliases. T03 Boundary lint — eslint-plugin-boundaries enforcing the dependency edges from wiki/DependencyMap.md §4; verified by a violating fixture. T04 Canonical normalization v1 — src/shared/text/normalize.ts with NORMALIZE_VERSION=1; 10/10 vitest covering ligatures, CRLF, soft hyphens (including line-break reassembly), mixed whitespace. T05 PDF fixture corpus — 7 user-supplied German PDFs in fixtures/pdfs/ (gitignored binaries) plus a manifest with verbatim known-good quotes and page counts, ready for CE-WP-0002 selector tests. T06 README upgrade — umbrella README points at wiki/docs/workplans and documents the dev workflow. T07 ADR-0002..0006 stubs in docs/decisions/. Toolchain end-to-end: pnpm install + lint + typecheck + test all green. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-25 00:13:03 +02:00
parent 707620adfb
commit 2f25f99cae
32 changed files with 4756 additions and 9 deletions
--- a/src/shared/README.md
+++ b/src/shared/README.md
@@ -0,0 +1,8 @@
+# `src/shared/` — vocabulary, types, pure helpers
+
+Future home: `citation-engine` (the shared types and contracts half of it).
+Owns: `Document`, `Selector`, `Annotation`, `EvidenceItem`, `EvidenceLink`,
+`EvidenceSet`, state enums, branded IDs, canonical text normalization.
+
+May import from: nothing internal. Leaf node of the dependency graph
+(`wiki/DependencyMap.md` §4).
--- a/src/shared/index.ts
+++ b/src/shared/index.ts
@@ -0,0 +1 @@
+export {}; // Empty export list: makes this file an ES module; nothing is exported yet.
--- a/src/shared/text/normalize.test.ts
+++ b/src/shared/text/normalize.test.ts
@@ -0,0 +1,56 @@
+import { describe, expect, it } from "vitest";
+import { NORMALIZE_VERSION, normalize } from "./normalize.js";
+
+describe("normalize (NORMALIZE_VERSION=1)", () => {
+  it("returns the version constant alongside the text", () => {
+    const out = normalize("hello");
+    expect(out.version).toBe(NORMALIZE_VERSION);
+    expect(out.text).toBe("hello");
+  });
+
+  it("applies Unicode NFC composition", () => {
+    // Escaped so the two forms stay distinct: "e" + U+0301 (combining acute) vs U+00E9.
+    const decomposed = "cafe\u0301";
+    const precomposed = "caf\u00E9";
+    expect(normalize(decomposed).text).toBe(precomposed);
+  });
+
+  it("normalizes CRLF and CR line endings to LF", () => {
+    expect(normalize("a\r\nb\rc").text).toBe("a\nb\nc");
+  });
+
+  it("collapses horizontal whitespace runs to a single space", () => {
+    expect(normalize("a    b\t\tc d").text).toBe("a b c d");
+  });
+
+  it("preserves paragraph boundaries but collapses 3+ blank lines to one", () => {
+    const input = "para one\n\n\n\npara two\n\npara three";
+    expect(normalize(input).text).toBe("para one\n\npara two\n\npara three");
+  });
+
+  it("strips soft hyphens (German line-broken word reassembly)", () => {
+    // "Donaudampfschiff" with soft hyphens (U+00AD, escaped: invisible as literals).
+    expect(normalize("Donau\u00ADdampf\u00ADschiff").text).toBe(
+      "Donaudampfschiff",
+    );
+  });
+
+  it("strips soft hyphens that span a newline ('word-\\nfragment' → 'wordfragment')", () => {
+    expect(normalize("word\u00AD\nfragment").text).toBe("wordfragment");
+  });
+
+  it("does not mangle ligatures (preserves the round-trip)", () => {
+    // The "fi" ligature (U+FB01) is left as-is — NFC does NOT decompose it.
+    // Test documents that current behavior so a future change is intentional.
+    expect(normalize("ef\uFB01cient").text).toBe("ef\uFB01cient");
+  });
+
+  it("handles a mixed-whitespace paragraph realistically", () => {
+    const input = "  First   line  \r\n   Second line.\r\n\r\n\r\nNext para.  ";
+    expect(normalize(input).text).toBe("First line\nSecond line.\n\nNext para.");
+  });
+
+  it("returns an empty string for whitespace-only input", () => {
+    expect(normalize("   \n\n  \t  ").text).toBe("");
+  });
+});
--- a/src/shared/text/normalize.ts
+++ b/src/shared/text/normalize.ts
@@ -0,0 +1,49 @@
+// Canonical text normalization for selectors and stored quotes.
+// Contract: wiki/SharedContracts.md §6.
+//
+// IMPORTANT: NORMALIZE_VERSION is stored on every Annotation. Bumping it is a
+// migration event — old selectors must be re-resolved against re-normalized
+// text before the new version becomes the default.
+
+export const NORMALIZE_VERSION = 1;
+
+// Soft hyphen (U+00AD), optionally followed by a single \n, so that a PDF-
+// extracted "word\u00AD\nfragment" reassembles to "wordfragment". Written as
+// an escape: a literal U+00AD is invisible and easily lost when editing.
+const SOFT_HYPHEN_AT_BREAK = /\u00AD\n?/g;
+
+// Horizontal whitespace = any \s except \n and \r. The double-negation
+// [^\S\r\n] is the idiomatic regex: \S is "not whitespace", so
+// "not (not-whitespace or line-ending)" = "whitespace that is not a newline".
+// Covers space, tab, NBSP, narrow NBSP, em-space, all Zs general-category.
+const HORIZONTAL_WHITESPACE_RUN = /[^\S\r\n]+/g;
+
+// 3+ newlines collapse to exactly two (one paragraph boundary).
+const PARAGRAPH_RUN = /\n{3,}/g;
+
+/** Canonicalizes `input`; returns the text and the NORMALIZE_VERSION applied. */
+export function normalize(input: string): { text: string; version: number } {
+  // 1. Unicode NFC.
+  let text = input.normalize("NFC");
+
+  // 2. Normalize line endings: CRLF and CR -> LF.
+  text = text.replace(/\r\n?/g, "\n");
+
+  // 3. Strip soft hyphens (U+00AD) — including the line break that follows
+  //    one — so PDF line-broken hyphenations reassemble.
+  text = text.replace(SOFT_HYPHEN_AT_BREAK, "");
+
+  // 4. Collapse horizontal whitespace runs to a single space.
+  text = text.replace(HORIZONTAL_WHITESPACE_RUN, " ");
+
+  // 5. Trim line-edge spaces BEFORE collapsing blank lines; otherwise
+  //    whitespace-only lines ("a\n \n \nb") hide a run of 3+ newlines.
+  //    NOTE(review): output changes for such input — confirm no v1 text is stored.
+  text = text.replace(/ *\n */g, "\n");
+
+  // 6. Preserve paragraph boundaries (\n\n); collapse 3+ newlines to 2.
+  text = text.replace(PARAGRAPH_RUN, "\n\n");
+
+  // 7. Trim leading/trailing whitespace from the whole document.
+  return { text: text.trim(), version: NORMALIZE_VERSION };
+}