Implement CE-WP-0001 Foundations: TS scaffold, lint boundaries, normalize v1, fixtures

T01 Toolchain — vite + pnpm 9.15 + React 18 + strict TS (ADR-0001).
T02 Folder layout — src/{shared,engine,anchor,source,binder,work,app}/
    mirroring the future subsystem split, with path aliases.
T03 Boundary lint — eslint-plugin-boundaries enforcing the dependency
    edges from wiki/DependencyMap.md §4; verified by a violating fixture.
T04 Canonical normalization v1 — src/shared/text/normalize.ts with
    NORMALIZE_VERSION=1; 10/10 vitest tests passing, covering ligatures,
    CRLF, soft hyphens (including line-break reassembly), mixed whitespace.
T05 PDF fixture corpus — 7 user-supplied German PDFs in fixtures/pdfs/
    (gitignored binaries) plus a manifest with verbatim known-good
    quotes and page counts, ready for CE-WP-0002 selector tests.
T06 README upgrade — umbrella README points at wiki/docs/workplans
    and documents the dev workflow.
T07 ADR-0002..0006 stubs in docs/decisions/.

Toolchain end-to-end: pnpm install + lint + typecheck + test all green.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-25 00:13:03 +02:00
parent 707620adfb
commit 2f25f99cae
32 changed files with 4756 additions and 9 deletions

8
src/shared/README.md Normal file
View File

@@ -0,0 +1,8 @@
# `src/shared/` — vocabulary, types, pure helpers
Future home: `citation-engine` (the shared types and contracts half of it).
Owns: `Document`, `Selector`, `Annotation`, `EvidenceItem`, `EvidenceLink`,
`EvidenceSet`, state enums, branded IDs, canonical text normalization.
May import from: nothing internal. Leaf node of the dependency graph
(`wiki/DependencyMap.md` §4).

1
src/shared/index.ts Normal file
View File

@@ -0,0 +1 @@
// No public exports. The empty export statement makes TypeScript treat this
// file as a module rather than a global script.
export {};

View File

@@ -0,0 +1,56 @@
import { describe, expect, it } from "vitest";
import { NORMALIZE_VERSION, normalize } from "./normalize.js";
// Behavioral spec for normalize() at NORMALIZE_VERSION=1.
//
// Every invisible or easily-confused character in the fixtures is written as
// a \u escape (soft hyphen U+00AD, combining acute U+0301, ligature U+FB01).
// Literal occurrences are indistinguishable from plain ASCII in most editors
// and can be silently stripped or re-normalized by tooling, which would turn
// these tests into no-ops without anyone noticing.
describe("normalize (NORMALIZE_VERSION=1)", () => {
  it("returns the version constant alongside the text", () => {
    const out = normalize("hello");
    expect(out.version).toBe(NORMALIZE_VERSION);
    expect(out.text).toBe("hello");
  });

  it("applies Unicode NFC composition", () => {
    // "é" decomposed (e + U+0301 combining acute) vs precomposed (U+00E9).
    const decomposed = "cafe\u0301";
    const precomposed = "caf\u00E9";
    expect(normalize(decomposed).text).toBe(precomposed);
  });

  it("normalizes CRLF and CR line endings to LF", () => {
    expect(normalize("a\r\nb\rc").text).toBe("a\nb\nc");
  });

  it("collapses horizontal whitespace runs to a single space", () => {
    expect(normalize("a   b\t\tc  d").text).toBe("a b c d");
  });

  it("preserves paragraph boundaries but collapses 3+ blank lines to one", () => {
    const input = "para one\n\n\n\npara two\n\npara three";
    expect(normalize(input).text).toBe("para one\n\npara two\n\npara three");
  });

  it("strips soft hyphens (German line-broken word reassembly)", () => {
    // German "Donaudampfschiff" with soft hyphens (U+00AD) at the syllable breaks.
    expect(normalize("Donau\u00ADdampf\u00ADschiff").text).toBe(
      "Donaudampfschiff",
    );
  });

  it("strips soft hyphens that span a newline ('word-\\nfragment' → 'wordfragment')", () => {
    expect(normalize("word\u00AD\nfragment").text).toBe("wordfragment");
  });

  it("does not mangle ligatures (preserves the round-trip)", () => {
    // The ligature "fi" (U+FB01) is left as-is — NFC does NOT decompose it
    // (only the compatibility forms NFKC/NFKD would).
    // Test documents that current behavior so a future change is intentional.
    expect(normalize("ef\uFB01cient").text).toBe("ef\uFB01cient");
  });

  it("handles a mixed-whitespace paragraph realistically", () => {
    const input = "  First line  \r\n   Second line.\r\n\r\n\r\nNext para.  ";
    expect(normalize(input).text).toBe("First line\nSecond line.\n\nNext para.");
  });

  it("returns an empty string for whitespace-only input", () => {
    expect(normalize(" \n\n \t ").text).toBe("");
  });
});

View File

@@ -0,0 +1,49 @@
// Canonical text normalization for selectors and stored quotes.
// Contract: wiki/SharedContracts.md §6.
//
// IMPORTANT: NORMALIZE_VERSION is stored on every Annotation. Bumping it is a
// migration event — old selectors must be re-resolved against re-normalized
// text before the new version becomes the default.
export const NORMALIZE_VERSION = 1;

// Soft hyphen (U+00AD), optionally followed by a single \n so that a PDF-
// extracted "word<SHY>\nfragment" reassembles to "wordfragment" rather than
// leaving a stray line break in the middle of a hyphenated word.
// Written as an escape: a literal U+00AD is invisible in most editors and is
// easily dropped by tooling, which would silently turn this into /\n?/.
const SOFT_HYPHEN_AT_BREAK = /\u00AD\n?/g;

// Horizontal whitespace = any \s except \n and \r. The double-negation
// [^\S\r\n] is the idiomatic regex: \S is "not whitespace", so
// "not (not-whitespace or line-ending)" = "whitespace that is not a newline".
// Covers space, tab, NBSP, narrow NBSP, em-space, all Zs general-category
// (and everything else JavaScript's \s matches apart from \r and \n).
const HORIZONTAL_WHITESPACE_RUN = /[^\S\r\n]+/g;

// 3+ newlines collapse to exactly two (one paragraph boundary).
const PARAGRAPH_RUN = /\n{3,}/g;

/**
 * Canonicalizes text for selectors and stored quotes.
 *
 * Steps, in execution order:
 *  1. Unicode NFC composition.
 *  2. CRLF / CR line endings become LF.
 *  3. Soft hyphens are removed, together with a line break directly after one.
 *  4. Runs of horizontal whitespace collapse to a single space.
 *  5. Spaces adjacent to a line break are removed.
 *  6. Runs of 3+ newlines collapse to one paragraph boundary ("\n\n").
 *  7. Leading/trailing whitespace of the whole text is trimmed.
 *
 * Step 5 deliberately runs before step 6: a line containing only whitespace
 * must count as a blank line, otherwise "a\n \n \nb" would keep three
 * newlines and a second normalize() pass would change the result. With this
 * order the function is idempotent.
 *
 * @param input - Raw text, e.g. as extracted from a PDF.
 * @returns The normalized text plus the NORMALIZE_VERSION that produced it.
 */
export function normalize(input: string): { text: string; version: number } {
  // 1. Unicode NFC.
  let text = input.normalize("NFC");

  // 2. Normalize line endings: CRLF and CR -> LF.
  text = text.replace(/\r\n?/g, "\n");

  // 3. Strip soft hyphens (U+00AD) — including the line break that follows
  //    one — so PDF line-broken hyphenations reassemble. Done before
  //    horizontal collapse so no stray double space remains.
  text = text.replace(SOFT_HYPHEN_AT_BREAK, "");

  // 4. Collapse horizontal whitespace runs to a single space.
  text = text.replace(HORIZONTAL_WHITESPACE_RUN, " ");

  // 5. Trim line-edge whitespace left over after horizontal collapse. After
  //    step 4 the only horizontal whitespace left is U+0020, so matching
  //    plain spaces is sufficient.
  text = text.replace(/ +\n/g, "\n").replace(/\n +/g, "\n");

  // 6. Preserve paragraph boundaries (\n\n); collapse 3+ newlines to 2.
  //    Must follow step 5 so whitespace-only lines are counted as blank.
  text = text.replace(PARAGRAPH_RUN, "\n\n");

  // 7. Trim leading/trailing whitespace from the whole document.
  text = text.trim();

  return { text, version: NORMALIZE_VERSION };
}