generated from coulomb/repo-seed
Implement CE-WP-0001 Foundations: TS scaffold, lint boundaries, normalize v1, fixtures
T01 Toolchain — vite + pnpm 9.15 + React 18 + strict TS (ADR-0001).
T02 Folder layout — src/{shared,engine,anchor,source,binder,work,app}/
mirroring the future subsystem split, with path aliases.
T03 Boundary lint — eslint-plugin-boundaries enforcing the dependency
edges from wiki/DependencyMap.md §4; verified by a violating fixture.
T04 Canonical normalization v1 — src/shared/text/normalize.ts with
NORMALIZE_VERSION=1; 10/10 vitest covering ligatures, CRLF, soft
hyphens (including line-break reassembly), mixed whitespace.
T05 PDF fixture corpus — 7 user-supplied German PDFs in fixtures/pdfs/
(gitignored binaries) plus a manifest with verbatim known-good
quotes and page counts, ready for CE-WP-0002 selector tests.
T06 README upgrade — umbrella README points at wiki/docs/workplans
and documents the dev workflow.
T07 ADR-0002..0006 stubs in docs/decisions/.
Toolchain end-to-end: pnpm install + lint + typecheck + test all green.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
8
src/shared/README.md
Normal file
8
src/shared/README.md
Normal file
@@ -0,0 +1,8 @@
|
||||
# `src/shared/` — vocabulary, types, pure helpers

Future home: `citation-engine` (the shared types and contracts half of it).

Owns: `Document`, `Selector`, `Annotation`, `EvidenceItem`, `EvidenceLink`,
`EvidenceSet`, state enums, branded IDs, canonical text normalization.

May import from: nothing internal. Leaf node of the dependency graph
(`wiki/DependencyMap.md` §4).
|
||||
1
src/shared/index.ts
Normal file
1
src/shared/index.ts
Normal file
@@ -0,0 +1 @@
|
||||
// Intentionally empty export: it makes TypeScript treat this file as a
// module. The file currently exports nothing.
export {};
|
||||
56
src/shared/text/normalize.test.ts
Normal file
56
src/shared/text/normalize.test.ts
Normal file
@@ -0,0 +1,56 @@
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { NORMALIZE_VERSION, normalize } from "./normalize.js";
|
||||
|
||||
// Test suite for canonical text normalization (NORMALIZE_VERSION=1).
//
// Every invisible or easily-confused code point in the fixtures (soft hyphen
// U+00AD, combining acute U+0301, NBSP U+00A0, the "fi" ligature U+FB01) is
// written as an explicit \u escape. Raw invisible characters are silently
// dropped or folded by editors, formatters and copy/paste, which would turn
// these tests into no-ops without anyone noticing.
describe("normalize (NORMALIZE_VERSION=1)", () => {
  it("returns the version constant alongside the text", () => {
    const out = normalize("hello");
    expect(out.version).toBe(NORMALIZE_VERSION);
    expect(out.text).toBe("hello");
  });

  it("applies Unicode NFC composition", () => {
    // "é" decomposed (e + U+0301 combining acute) vs precomposed (U+00E9).
    const decomposed = "cafe\u0301";
    const precomposed = "caf\u00E9";
    expect(normalize(decomposed).text).toBe(precomposed);
  });

  it("normalizes CRLF and CR line endings to LF", () => {
    expect(normalize("a\r\nb\rc").text).toBe("a\nb\nc");
  });

  it("collapses horizontal whitespace runs to a single space", () => {
    // Runs of spaces, tabs, and a NBSP + space mix each become one space.
    expect(normalize("a   b\t\tc\u00A0 d").text).toBe("a b c d");
  });

  it("preserves paragraph boundaries but collapses 3+ blank lines to one", () => {
    const input = "para one\n\n\n\npara two\n\npara three";
    expect(normalize(input).text).toBe("para one\n\npara two\n\npara three");
  });

  it("strips soft hyphens (German line-broken word reassembly)", () => {
    // German "Donaudampfschiff" carrying soft hyphens (U+00AD) at the
    // syllable boundaries where a PDF would allow a line break.
    expect(normalize("Donau\u00ADdampf\u00ADschiff").text).toBe(
      "Donaudampfschiff",
    );
  });

  it("strips soft hyphens that span a newline ('word-\\nfragment' → 'wordfragment')", () => {
    // The soft hyphen AND the line break right after it are both removed.
    expect(normalize("word\u00AD\nfragment").text).toBe("wordfragment");
  });

  it("does not mangle ligatures (preserves the round-trip)", () => {
    // The ligature "fi" (U+FB01) is left as-is — NFC does NOT decompose it
    // (only the compatibility forms NFKC/NFKD would).
    // Test documents that current behavior so a future change is intentional.
    expect(normalize("ef\uFB01cient").text).toBe("ef\uFB01cient");
  });

  it("handles a mixed-whitespace paragraph realistically", () => {
    const input =
      "  First line  \r\n\tSecond line.\r\n\r\n\r\nNext  para.  ";
    expect(normalize(input).text).toBe(
      "First line\nSecond line.\n\nNext para.",
    );
  });

  it("returns an empty string for whitespace-only input", () => {
    expect(normalize("   \n\n  \t  ").text).toBe("");
  });
});
|
||||
49
src/shared/text/normalize.ts
Normal file
49
src/shared/text/normalize.ts
Normal file
@@ -0,0 +1,49 @@
|
||||
// Canonical text normalization for selectors and stored quotes.
|
||||
// Contract: wiki/SharedContracts.md §6.
|
||||
//
|
||||
// IMPORTANT: NORMALIZE_VERSION is stored on every Annotation. Bumping it is a
|
||||
// migration event — old selectors must be re-resolved against re-normalized
|
||||
// text before the new version becomes the default.
|
||||
|
||||
/**
 * Version tag of the normalization algorithm implemented by {@link normalize}.
 * Returned with every result so callers can record which rules produced a
 * given text.
 */
export const NORMALIZE_VERSION = 1;

// Soft hyphen (U+00AD), optionally followed by a single \n so that a PDF-
// extracted "word<SHY>\nfragment" reassembles to "wordfragment" rather than
// leaving a stray line break in the middle of a hyphenated word.
//
// The code point is spelled as an escape on purpose: U+00AD is invisible, and
// if an editor or formatter drops a literal one the pattern degrades to
// /\n?/g, which deletes every newline in the input.
const SOFT_HYPHEN_AT_BREAK = /\u00AD\n?/g;

// Horizontal whitespace = any \s except \n and \r. The double negation
// [^\S\r\n] reads: \S is "not whitespace", so "not (not-whitespace or
// line-ending)" = "whitespace that is not \n or \r".
// Covers space, tab, NBSP, narrow NBSP, em-space and the other Zs characters;
// because it is defined via \s it also covers \v, \f, U+2028, U+2029, U+FEFF.
const HORIZONTAL_WHITESPACE_RUN = /[^\S\r\n]+/g;

// Spaces hugging a line break on either side. Only plain spaces need to be
// matched because this runs after horizontal whitespace has been collapsed.
const LINE_EDGE_SPACES = / *\n */g;

// 3+ newlines collapse to exactly two (one paragraph boundary).
const PARAGRAPH_RUN = /\n{3,}/g;

/**
 * Produces the canonical form of `input`.
 *
 * Steps, in execution order:
 *  1. Unicode NFC composition.
 *  2. CRLF / CR line endings become LF.
 *  3. Soft hyphens (and a line break directly after one) are removed.
 *  4. Runs of horizontal whitespace become a single space.
 *  5. Spaces at the start and end of every line are removed.
 *  6. Runs of 3+ newlines become exactly two.
 *  7. Leading and trailing whitespace of the whole text is removed.
 *
 * Step 5 runs before step 6 so that "blank" lines containing only spaces or
 * tabs count as blank when paragraph runs are collapsed. This also makes the
 * function idempotent: normalize(normalize(x).text).text === normalize(x).text.
 *
 * NOTE(review): running step 5 before step 6 changes the output for inputs
 * with whitespace-only blank lines. If any output of the previous ordering
 * has already been persisted under version 1, confirm whether this needs a
 * NORMALIZE_VERSION bump.
 *
 * @param input - Raw text, e.g. as extracted from a PDF.
 * @returns The normalized text together with NORMALIZE_VERSION.
 */
export function normalize(input: string): { text: string; version: number } {
  // 1. Unicode NFC.
  let text = input.normalize("NFC");

  // 2. Normalize line endings: CRLF and CR -> LF.
  text = text.replace(/\r\n?/g, "\n");

  // 3. Strip soft hyphens (U+00AD) — including the line break that follows
  //    one — so PDF line-broken hyphenations reassemble. Done before
  //    horizontal collapse so no stray space remains.
  text = text.replace(SOFT_HYPHEN_AT_BREAK, "");

  // 4. Collapse horizontal whitespace runs to a single space.
  text = text.replace(HORIZONTAL_WHITESPACE_RUN, " ");

  // 5. Trim line-edge whitespace left over after horizontal collapse. Must
  //    precede the paragraph collapse: otherwise "\n \n \n" hides a run of
  //    three newlines behind the interleaved spaces.
  text = text.replace(LINE_EDGE_SPACES, "\n");

  // 6. Preserve paragraph boundaries (\n\n); collapse 3+ newlines to 2.
  text = text.replace(PARAGRAPH_RUN, "\n\n");

  // 7. Trim leading/trailing whitespace from the whole document.
  text = text.trim();

  return { text, version: NORMALIZE_VERSION };
}
|
||||
Reference in New Issue
Block a user