IB-WP-0016-T02: chapter-aware chunking and stable IDs

Resolve chapter labels from EPUB nav entries (when present) and from the first in-document h1/h2/h3 heading, parse roman-numeral and "Chapter N" labels into numeric chapter indices, and generate stable IDs of the form chapter-NN with -part-NNN suffix when a chapter exceeds max_words. The chunker now operates on cleaned body text, distributes id="Page_*" page anchors per part via inline markers extracted before splitting, and supports a configurable overlap_words evidence window between adjacent parts of the same chapter. Reclassify body sections whose chapter label matches contents/transcriber-notes/license/colophon tokens so they leave the body stream by default. Strip <head>...</head> from HTML body extraction to stop the <title> tag from duplicating heading text in the chunk markdown. Real Lefevre EPUB now detects all 24 roman-numeral chapters with stable chapter-NN IDs, distributes Page_N anchors across multi-part chapters, and reclassifies Contents and Transcriber's Notes out of body (role histogram body=67, cover=1, header=1, toc=1, notes=1, footer=2). 82 tests pass. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-17 15:52:47 +02:00
parent ef19aa6de7
commit b9173b6569
5 changed files with 449 additions and 36 deletions
--- a/tests/test_epub3_intake.py
+++ b/tests/test_epub3_intake.py
@@ -117,10 +117,12 @@ def test_epub3_intake_follows_spine_order_and_drops_non_body_by_default(tmp_path
    assert [chunk.spine_index for chunk in chunks] == [3, 4]
    assert all(chunk.section_role == SECTION_ROLE_BODY for chunk in chunks)
    assert all(chunk.source_type == "epub" for chunk in chunks)
-    # Chunk IDs must be stable, ordered, and not collapse to the book title.
-    assert chunks[0].chunk_id.startswith("reminiscences-of-a-stock-operator-004-")
-    assert chunks[1].chunk_id.startswith("reminiscences-of-a-stock-operator-005-")
-    assert chunks[0].chunk_id != chunks[1].chunk_id
+    # Stable chapter-NN IDs from in-document heading parsing, not collapsed
+    # to the Project Gutenberg page title.
+    assert [chunk.chunk_id for chunk in chunks] == ["chapter-01", "chapter-02"]
+    assert [chunk.chapter_label for chunk in chunks] == ["Chapter I", "Chapter II"]
+    assert [chunk.chapter_number for chunk in chunks] == [1, 2]
+    assert "Chapter I" not in chunks[0].markdown.split("\n", 1)[1]


 def test_epub3_intake_captures_book_metadata_provenance(tmp_path: Path) -> None:
@@ -171,3 +173,145 @@ def test_epub3_intake_falls_back_to_archive_order_for_malformed_epub(tmp_path: P
    assert all(chunk.section_role == SECTION_ROLE_BODY for chunk in chunks)
    assert all(chunk.spine_index is None for chunk in chunks)
    assert all(chunk.book_metadata == {} for chunk in chunks)
+    assert all(chunk.chapter_number is None for chunk in chunks)
+
+
+# OPF package document for the roman-numeral chapter fixture. The manifest
+# declares the nav document, a contents page and three chapter files; the
+# spine lists them in that order, with the nav itemref marked linear="no".
+ROMAN_PACKAGE_OPF = """<?xml version="1.0" encoding="utf-8"?>
+<package xmlns="http://www.idpf.org/2007/opf" version="3.0" unique-identifier="bookid">
+  <metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
+    <dc:identifier id="bookid">urn:test:roman</dc:identifier>
+    <dc:title>Roman Chapters Book</dc:title>
+    <dc:creator>Test Author</dc:creator>
+    <dc:language>en</dc:language>
+  </metadata>
+  <manifest>
+    <item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>
+    <item id="contents" href="contents.xhtml" media-type="application/xhtml+xml"/>
+    <item id="ch1" href="ch1.xhtml" media-type="application/xhtml+xml"/>
+    <item id="ch2" href="ch2.xhtml" media-type="application/xhtml+xml"/>
+    <item id="ch3" href="ch3.xhtml" media-type="application/xhtml+xml"/>
+  </manifest>
+  <spine>
+    <itemref idref="nav" linear="no"/>
+    <itemref idref="contents"/>
+    <itemref idref="ch1"/>
+    <itemref idref="ch2"/>
+    <itemref idref="ch3"/>
+  </spine>
+</package>
+"""
+
+
+def _write_roman_chapter_epub(path: Path, *, ch1_words: int = 50) -> None:
+    """Write the roman-numeral chapter EPUB fixture to *path*.
+
+    The archive holds the mimetype entry, the container file, the
+    ``ROMAN_PACKAGE_OPF`` package document, a nav document, a contents page
+    and three chapter files headed ``I``, ``II`` and ``III``. The nav labels
+    the first chapter "Foreword" while its in-document heading is "I".
+
+    Args:
+        path: Destination of the generated ``.epub`` archive.
+        ch1_words: Number of generated ``wordN`` tokens placed between the
+            ``Page_1`` and ``Page_2`` anchor spans of the first chapter.
+    """
+    filler = " ".join(f"word{i}" for i in range(ch1_words))
+
+    # Insertion order of this mapping is the order members are written in.
+    members = {
+        "mimetype": "application/epub+zip",
+        "META-INF/container.xml": CONTAINER_XML,
+        "OEBPS/content.opf": ROMAN_PACKAGE_OPF,
+        "OEBPS/nav.xhtml": (
+            "<html><head><title>TOC</title></head><body>"
+            "<nav epub:type='toc'><ol>"
+            "<li><a href='contents.xhtml'>Contents</a></li>"
+            "<li><a href='ch1.xhtml'>Foreword</a></li>"
+            "<li><a href='ch2.xhtml'>II</a></li>"
+            "<li><a href='ch3.xhtml'>III</a></li>"
+            "</ol></nav></body></html>"
+        ),
+        "OEBPS/contents.xhtml": (
+            "<html><head><title>Book</title></head>"
+            "<body><h2>Contents</h2><p>Listing.</p></body></html>"
+        ),
+        "OEBPS/ch1.xhtml": (
+            "<html><head><title>Book</title></head>"
+            "<body><h2>I</h2>"
+            "<p>"
+            f'<span id="Page_1">1</span> {filler} '
+            '<span id="Page_2">2</span> tail tail tail.'
+            "</p></body></html>"
+        ),
+        "OEBPS/ch2.xhtml": (
+            "<html><head><title>Book</title></head>"
+            "<body><h2>II</h2><p>Short chapter two.</p></body></html>"
+        ),
+        "OEBPS/ch3.xhtml": (
+            "<html><head><title>Book</title></head>"
+            "<body><h2>III</h2><p>Short chapter three.</p></body></html>"
+        ),
+    }
+
+    with zipfile.ZipFile(path, "w") as archive:
+        for member_name, payload in members.items():
+            archive.writestr(member_name, payload)
+
+
+def test_epub3_intake_assigns_stable_chapter_ids_from_roman_headings(tmp_path: Path) -> None:
+    """Check label, number and chunk ID of every default-emitted chunk."""
+    epub_path = tmp_path / "roman.epub"
+    _write_roman_chapter_epub(epub_path)
+
+    observed = [
+        (chunk.chapter_label, chunk.chapter_number, chunk.chunk_id)
+        for chunk in normalize_source(epub_path)
+    ]
+
+    # ch1: the nav label "Foreword" is non-numeric, so chapter_number is None.
+    # ch2/ch3: nav labels II/III match the in-document headings and parse to 2/3.
+    # contents.xhtml: its "Contents" heading reclassifies it as toc (non-body),
+    # so it is dropped by default and does not appear here.
+    assert observed == [
+        ("Foreword", None, "chapter-foreword"),
+        ("II", 2, "chapter-02"),
+        ("III", 3, "chapter-03"),
+    ]
+
+
+def test_epub3_intake_reclassifies_contents_body_section_when_included(tmp_path: Path) -> None:
+    """With ``include_non_body=True`` the contents page is kept, tagged as toc."""
+    book = tmp_path / "roman.epub"
+    _write_roman_chapter_epub(book)
+
+    chunks = normalize_source(book, include_non_body=True)
+
+    # contents.xhtml is the itemref selected by spine_index == 1 in the fixture.
+    # Collect matches explicitly so a missing chunk fails with a readable
+    # assertion instead of a bare StopIteration from next().
+    contents_chunks = [chunk for chunk in chunks if chunk.spine_index == 1]
+    assert contents_chunks, "no chunk emitted for the contents spine item"
+    contents = contents_chunks[0]
+    assert contents.section_role == "toc"
+    assert contents.chapter_label == "Contents"
+
+
+def test_epub3_intake_splits_long_chapter_into_parts_with_anchors(tmp_path: Path) -> None:
+    """A chapter longer than ``max_words`` splits into numbered parts.
+
+    The first chapter is generated with 300 filler words and chunked at
+    ``max_words=120``. Every part must carry the ``-part-NNN`` suffix and the
+    chapter heading, and the page anchors must be spread over the parts.
+    """
+    book = tmp_path / "roman.epub"
+    _write_roman_chapter_epub(book, ch1_words=300)
+
+    chunks = normalize_source(book, max_words=120)
+
+    foreword_parts = [chunk for chunk in chunks if chunk.chapter_label == "Foreword"]
+    assert len(foreword_parts) >= 2
+    assert [chunk.chunk_id for chunk in foreword_parts] == [
+        f"chapter-foreword-part-{i + 1:03d}" for i in range(len(foreword_parts))
+    ]
+    # Each part keeps the chapter heading and is named by chapter + part suffix.
+    assert all(chunk.markdown.startswith("# Foreword\n") for chunk in foreword_parts)
+    assert all(chunk.title.startswith("Foreword") for chunk in foreword_parts)
+
+    # Page anchors land on the parts whose word range contains them.
+    first_part_anchors = foreword_parts[0].page_anchors
+    last_part_anchors = foreword_parts[-1].page_anchors
+    assert "Page_1" in first_part_anchors
+    assert "Page_2" in last_part_anchors
+    # Anchor markers must not leak into the final markdown text of ANY part:
+    # the Page_2 marker sits at the end of the chapter, so checking only the
+    # first part would miss a leak in later parts.
+    assert all("⟦anchor:" not in chunk.markdown for chunk in foreword_parts)
+
+
+def test_epub3_intake_supports_word_overlap_between_chapter_parts(tmp_path: Path) -> None:
+    """Adjacent parts of one chapter share an ``overlap_words`` window.
+
+    The first chapter is generated with 200 filler words and chunked at
+    ``max_words=80`` with an overlap of 20 words; the last 20 body words of
+    the first part must open the second part.
+    """
+    overlap_words = 20
+    book = tmp_path / "roman.epub"
+    _write_roman_chapter_epub(book, ch1_words=200)
+
+    chunks = normalize_source(book, max_words=80, overlap_words=overlap_words)
+
+    foreword_parts = [chunk for chunk in chunks if chunk.chapter_label == "Foreword"]
+    assert len(foreword_parts) >= 2
+
+    def _body_words(markdown: str) -> list[str]:
+        # The body is everything after the second newline; markdown with
+        # fewer than two newlines yields no body words.
+        body = markdown.split("\n", 2)[2] if markdown.count("\n") >= 2 else ""
+        return body.split()
+
+    first = _body_words(foreword_parts[0].markdown)
+    second = _body_words(foreword_parts[1].markdown)
+    # Guard against a vacuous pass: if body extraction returned nothing for
+    # both parts, the slice comparison below would reduce to [] == [].
+    assert len(first) >= overlap_words
+    assert len(second) >= overlap_words
+    # The trailing overlap_words of the first part must reappear verbatim at
+    # the head of the next part.
+    assert first[-overlap_words:] == second[:overlap_words]