diff --git a/docs/lefevre-epub3-validation.md b/docs/lefevre-epub3-validation.md index 1728a65..14a3452 100644 --- a/docs/lefevre-epub3-validation.md +++ b/docs/lefevre-epub3-validation.md @@ -89,3 +89,26 @@ The remaining gap is title collapse: all body sections still share the Project Gutenberg page title because chapter headings are not yet read from in-document `

` content. That collapse is T02's scope (chapter-aware chunking and stable IDs from in-document headings). + +## T02 Result (2026-05-17) + +Chapter-aware chunking and stable IDs landed. The same local Lefevre EPUB +now produces: + +- 67 body chunks (default `max_words=800` collapses to 24 single-chunk + chapters once `max_words=2000`) +- All 24 roman-numeral chapters detected and assigned stable IDs + `chapter-01` .. `chapter-24`; multi-part chapters get + `chapter-NN-part-001`, `chapter-NN-part-002`, ... +- Chapter labels resolved from the EPUB nav doc (when present) and from + the first in-document `

`/`

` heading +- Project Gutenberg page-title collapse is gone: each chunk's title is the + chapter label, not the shared book title +- TOC body section ("Contents") reclassified to `toc`; transcriber's notes + section reclassified to `notes`; section-role histogram is now + `body=67, cover=1, header=1, toc=1, notes=1, footer=2` +- Page anchors of the form `id="Page_N"` are preserved per chunk via the + `page_anchors` provenance field (e.g. chapter-01 carries + `Page_1..Page_14` distributed across its three parts) +- Optional `overlap_words` parameter supports evidence-window context + between adjacent parts of the same chapter without duplicating headings diff --git a/src/infospace_bench/generator.py b/src/infospace_bench/generator.py index dd09546..f296613 100644 --- a/src/infospace_bench/generator.py +++ b/src/infospace_bench/generator.py @@ -261,6 +261,9 @@ def _register_source_chunks(root: Path, chunks: list[SourceChunk]) -> None: "section_role": chunk.section_role, "spine_index": chunk.spine_index, "book_metadata": dict(chunk.book_metadata), + "chapter_label": chunk.chapter_label, + "chapter_number": chunk.chapter_number, + "page_anchors": list(chunk.page_anchors), }, ) diff --git a/src/infospace_bench/source_intake.py b/src/infospace_bench/source_intake.py index 688bca6..079de9b 100644 --- a/src/infospace_bench/source_intake.py +++ b/src/infospace_bench/source_intake.py @@ -13,16 +13,30 @@ from xml.etree import ElementTree as ET from .errors import InfospaceError from .semantics import slugify -EXTRACTOR_VERSION = "generic-source-intake-v2" +EXTRACTOR_VERSION = "generic-source-intake-v3" SUPPORTED_EXTENSIONS = {".md", ".markdown", ".txt", ".html", ".htm", ".epub"} HTML_TITLE_RE = re.compile(r"]*>(?P.*?)", re.I | re.S) HTML_H1_RE = re.compile(r"]*>(?P.*?)</h1>", re.I | re.S) +HTML_FIRST_HEADING_RE = re.compile( + r"<(?P<tag>h[1-6])[^>]*>(?P<title>.*?)</(?P=tag)>", re.I | re.S +) SCRIPT_STYLE_RE = re.compile(r"<(script|style)[^>]*>.*?</\1>", re.I | re.S) TAG_RE = re.compile(r"<[^>]+>") +ANCHOR_OPEN_TAG_RE = re.compile( + r"""<(?P<name>[a-zA-Z][a-zA-Z0-9:-]*)[^>]*\bid=(?:"(?P<danchor>(?:Page_|page-|pg-|p-)[^"]*)"|'(?P<sanchor>(?:Page_|page-|pg-|p-)[^']*)')[^>]*>""", + re.I, +) +ANCHOR_MARKER_RE = re.compile(r"⟦anchor:(?P<anchor>[^⟧]+)⟧") +ROMAN_NUMERAL_RE = re.compile(r"^([MDCLXVI]+)\.?$", re.I) +CHAPTER_NUMBER_RE = re.compile( + r"^chapter\s+(?P<value>[ivxlcdm]+|\d+)\b", + re.I, +) OPF_NS = "http://www.idpf.org/2007/opf" DC_NS = "http://purl.org/dc/elements/1.1/" CONTAINER_NS = "urn:oasis:names:tc:opendocument:xmlns:container" +XHTML_NS = "http://www.w3.org/1999/xhtml" SECTION_ROLE_BODY = "body" SECTION_ROLE_COVER = "cover" @@ -34,6 +48,24 @@ SECTION_ROLE_NOTES = "notes" SECTION_ROLE_LICENSE = "license" SECTION_ROLE_AUXILIARY = "auxiliary" +TOC_LABEL_TOKENS = {"contents", "table of contents", "toc"} +NOTES_LABEL_TOKENS = { + "transcribers notes", + "transcriber notes", + "transcribers note", + "transcribers comments", + "editors notes", + "editor notes", +} +LICENSE_LABEL_TOKENS = { + "license", + "project gutenberg license", + "the project gutenberg license", + "license terms", + "copyright", + "colophon", +} + PG_START_MARKERS = ("*** START OF THE PROJECT GUTENBERG EBOOK", "*** START OF THIS PROJECT GUTENBERG EBOOK") PG_END_MARKERS = ("*** END OF THE PROJECT GUTENBERG EBOOK", "*** END OF THIS PROJECT GUTENBERG EBOOK") @@ -53,6 +85,9 @@ class SourceChunk: section_role: str = SECTION_ROLE_BODY spine_index: int | None = None book_metadata: dict = field(default_factory=dict) + chapter_label: str | None = None + chapter_number: int | None = None + page_anchors: tuple = () def to_dict(self) -> dict: return asdict(self) @@ -68,6 +103,8 @@ class _SourceDocument: section_role: str = SECTION_ROLE_BODY spine_index: int | None = None book_metadata: dict = field(default_factory=dict) + chapter_label: str | None = None + chapter_number: int | None = None @dataclass(frozen=True) @@ -90,6 +127,7 @@ def normalize_source( max_words: int = 800, max_chunks: int | None = None, include_non_body: bool = False, + overlap_words: int = 0, ) -> list[SourceChunk]: source_path = Path(source) if not source_path.exists(): @@ -112,27 +150,31 @@ def normalize_source( chunks: list[SourceChunk] = [] used_ids: set[str] = set() for document in documents: - pieces = _chunk_markdown(document.markdown, max_words=max_words) - for index, piece in enumerate(pieces): - title = document.title if len(pieces) == 1 else f"{document.title} Part {index + 1}" + pieces = _split_document(document, max_words=max_words, overlap_words=overlap_words) + for index, (part_title, part_markdown, part_anchors) in enumerate(pieces): base_id = ( - document.base_slug if len(pieces) == 1 else f"{document.base_slug}-part-{index + 1:03d}" + document.base_slug + if len(pieces) == 1 + else f"{document.base_slug}-part-{index + 1:03d}" ) chunk_id = _dedupe_chunk_id(base_id, used_ids) chunks.append( SourceChunk( chunk_id=chunk_id, - title=title, - markdown=piece, + title=part_title, + markdown=part_markdown, source_type=document.source_type, original_path=document.original_path, - digest=_digest_text(piece), + digest=_digest_text(part_markdown), chunk_index=index, chunk_count=len(pieces), imported_at=imported_at, section_role=document.section_role, spine_index=document.spine_index, book_metadata=dict(document.book_metadata), + chapter_label=document.chapter_label, + chapter_number=document.chapter_number, + page_anchors=tuple(part_anchors), ) ) if max_chunks is not None and max_chunks > 0 and len(chunks) >= max_chunks: @@ -342,6 +384,9 @@ def _epub3_spine_documents( metadata, manifest, spine = _parse_opf(archive, opf_path) book_title = metadata.get("title") or _title_from_path(source_path) book_slug = slugify(book_title) or slugify(source_path.stem) or "ebook" + nav_labels = _load_nav_labels(archive, manifest) + chapter_counter = 0 + used_chapter_numbers: set[int] = set() for spine_index, entry in enumerate(spine): item = manifest.get(entry.item_id) if item is None or not item.href: @@ -351,24 +396,59 @@ def _epub3_spine_documents( except KeyError: continue role = _classify_section(item, entry, raw) + nav_label = nav_labels.get(item.href, "") + heading_label = _first_heading_text(raw) if Path(item.href).suffix.lower() not in {".txt", ".md"} else "" + chapter_label = nav_label or heading_label or None + # Reclassify body sections whose chapter label matches known noise tokens + if role == SECTION_ROLE_BODY and chapter_label: + normalized_label = re.sub(r"[^a-z0-9 ]+", "", chapter_label.lower()).strip() + if normalized_label in TOC_LABEL_TOKENS: + role = SECTION_ROLE_TOC + elif normalized_label in NOTES_LABEL_TOKENS: + role = SECTION_ROLE_NOTES + elif normalized_label in LICENSE_LABEL_TOKENS: + role = SECTION_ROLE_LICENSE if role != SECTION_ROLE_BODY and not include_non_body: continue + chapter_number: int | None = None + if role == SECTION_ROLE_BODY and chapter_label: + chapter_number = _parse_chapter_number(chapter_label) + if role == SECTION_ROLE_BODY and chapter_number is None: + # Fall back to sequential body counter when chapter label exists but + # is not a roman/arabic numeral (e.g. "Preface"). + if chapter_label: + chapter_counter += 1 + # Use sequential only when no other body has claimed this slot; + # roman-numeral chapters take precedence and may overlap, so we + # leave chapter_number=None for non-numeric labels and let the + # slug fall back to the label slug. + if chapter_number is not None: + if chapter_number in used_chapter_numbers: + # Duplicate numeric label across the book — keep label, drop the + # numeric slot so the slug falls back to label-based naming. + chapter_number = None + else: + used_chapter_numbers.add(chapter_number) suffix = Path(item.href).suffix.lower() if suffix in {".txt", ".md"}: - title = _markdown_title(raw) or _title_from_path(Path(item.href)) + title = chapter_label or _markdown_title(raw) or _title_from_path(Path(item.href)) markdown_body = _ensure_h1(_normalize_newlines(raw).strip() + "\n", title) else: - title = _html_title(raw) or _title_from_path(Path(item.href)) - text = _html_to_text(raw) + title = chapter_label or _html_title(raw) or _title_from_path(Path(item.href)) + marked = _inject_anchor_markers(raw) + text = _html_to_text(marked) if text.lower().startswith(title.lower()): text = text[len(title) :].strip() markdown_body = f"# {title}\n\n{text}\n" - section_slug = ( - slugify(title) - or slugify(Path(item.href).stem) - or f"section-{spine_index + 1:03d}" + base_slug = _chapter_base_slug( + role=role, + chapter_number=chapter_number, + chapter_label=chapter_label, + book_slug=book_slug, + spine_index=spine_index, + href=item.href, + title=title, ) - base_slug = f"{book_slug}-{spine_index + 1:03d}-{section_slug}" yield _SourceDocument( title=title, markdown=markdown_body, @@ -378,9 +458,119 @@ def _epub3_spine_documents( section_role=role, spine_index=spine_index, book_metadata=metadata, + chapter_label=chapter_label, + chapter_number=chapter_number, ) +def _chapter_base_slug( + *, + role: str, + chapter_number: int | None, + chapter_label: str | None, + book_slug: str, + spine_index: int, + href: str, + title: str, +) -> str: + if role == SECTION_ROLE_BODY and chapter_number is not None: + return f"chapter-{chapter_number:02d}" + if role == SECTION_ROLE_BODY and chapter_label: + return f"chapter-{slugify(chapter_label) or f'section-{spine_index + 1:03d}'}" + section_slug = ( + slugify(title) + or slugify(Path(href).stem) + or f"section-{spine_index + 1:03d}" + ) + return f"{book_slug}-{spine_index + 1:03d}-{section_slug}" + + +def _load_nav_labels( + archive: zipfile.ZipFile, manifest: dict[str, _EpubManifestItem] +) -> dict[str, str]: + nav_item = next( + (item for item in manifest.values() if "nav" in item.properties), + None, + ) + if nav_item is None: + return {} + try: + raw = archive.read(nav_item.href).decode("utf-8", errors="replace") + except KeyError: + return {} + base = _zip_dirname(nav_item.href) + labels: dict[str, str] = {} + pattern = re.compile( + r"""<a[^>]*\bhref=(?:"(?P<dhref>[^"#]+)(?:#[^"]*)?"|'(?P<shref>[^'#]+)(?:#[^']*)?')[^>]*>(?P<label>.*?)</a>""", + re.I | re.S, + ) + for match in pattern.finditer(raw): + href_attr = match.group("dhref") or match.group("shref") or "" + if not href_attr: + continue + label = _collapse_ws(_html_to_text(match.group("label"))) + if not label: + continue + resolved = _join_zip_path(base, href_attr) + labels.setdefault(resolved, label) + return labels + + +def _first_heading_text(html_raw: str) -> str: + match = HTML_FIRST_HEADING_RE.search(html_raw) + if not match: + return "" + return _collapse_ws(_html_to_text(match.group("title"))) + + +def _parse_chapter_number(label: str) -> int | None: + stripped = label.strip() + if not stripped: + return None + if stripped.isdigit(): + return int(stripped) + roman_match = ROMAN_NUMERAL_RE.match(stripped) + if roman_match: + value = _roman_to_int(roman_match.group(1).upper()) + if value > 0: + return value + chapter_match = CHAPTER_NUMBER_RE.match(stripped) + if chapter_match: + token = chapter_match.group("value") + if token.isdigit(): + return int(token) + value = _roman_to_int(token.upper()) + if value > 0: + return value + return None + + +def _roman_to_int(value: str) -> int: + table = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000} + total = 0 + prev = 0 + for ch in reversed(value): + cur = table.get(ch, 0) + if cur == 0: + return 0 + if cur < prev: + total -= cur + else: + total += cur + prev = cur + return total + + +def _inject_anchor_markers(raw: str) -> str: + def repl(match: re.Match) -> str: + anchor = match.group("danchor") or match.group("sanchor") or "" + if not anchor: + return match.group(0) + return f"{match.group(0)} ⟦anchor:{anchor}⟧ " + + return ANCHOR_OPEN_TAG_RE.sub(repl, raw) + + def _epub_legacy_documents( archive: zipfile.ZipFile, source_path: Path ) -> Iterable[_SourceDocument]: @@ -464,20 +654,72 @@ def _join_zip_path(base: str, href: str) -> str: return f"{base}/{href}" -def _chunk_markdown(markdown: str, *, max_words: int) -> list[str]: - text = markdown.strip() - if max_words <= 0: - return [text + "\n"] - words = text.split() - if len(words) <= max_words: - return [text + "\n"] - chunks: list[str] = [] - heading = _markdown_title(text) or "Source" - body_words = re.sub(r"(?m)^# .+?\n+", "", text, count=1).split() - for start in range(0, len(body_words), max_words): - part = " ".join(body_words[start : start + max_words]).strip() - chunks.append(f"# {heading} Part {len(chunks) + 1}\n\n{part}\n") - return chunks +def _split_document( + document: _SourceDocument, + *, + max_words: int, + overlap_words: int, +) -> list[tuple[str, str, list[str]]]: + text = document.markdown.strip() + heading = _markdown_title(text) or document.title or "Source" + body_with_markers = re.sub(r"(?m)^# .+?\n+", "", text, count=1).strip() + clean_body, anchor_positions = _extract_anchor_positions(body_with_markers) + words = clean_body.split() + if max_words <= 0 or len(words) <= max_words: + anchors = [name for name, _idx in anchor_positions] + return [(document.title, _compose_chunk(heading, clean_body), anchors)] + overlap = max(0, min(overlap_words, max_words - 1)) + step = max_words - overlap if overlap > 0 else max_words + parts: list[tuple[str, str, list[str]]] = [] + start = 0 + while start < len(words): + end = min(start + max_words, len(words)) + slice_words = words[start:end] + if not slice_words: + break + part_index = len(parts) + 1 + part_text = " ".join(slice_words).strip() + part_anchors = _anchors_in_range(anchor_positions, start, end) + part_title = f"{document.title} Part {part_index}" + parts.append((part_title, _compose_chunk(heading, part_text), part_anchors)) + if end >= len(words): + break + start += step + return parts + + +def _compose_chunk(heading: str, body: str) -> str: + body = body.strip() + if not body: + return f"# {heading}\n" + return f"# {heading}\n\n{body}\n" + + +def _extract_anchor_positions(text: str) -> tuple[str, list[tuple[str, int]]]: + parts: list[str] = [] + anchors: list[tuple[str, int]] = [] + cursor = 0 + for match in ANCHOR_MARKER_RE.finditer(text): + prefix = text[cursor : match.start()] + parts.append(prefix) + word_index = sum(len(part.split()) for part in parts) + anchors.append((match.group("anchor"), word_index)) + cursor = match.end() + parts.append(text[cursor:]) + cleaned = re.sub(r"[ \t]{2,}", " ", "".join(parts)).strip() + return cleaned, anchors + + +def _anchors_in_range( + anchor_positions: list[tuple[str, int]], start: int, end: int +) -> list[str]: + seen: set[str] = set() + found: list[str] = [] + for name, idx in anchor_positions: + if start <= idx < end and name not in seen: + seen.add(name) + found.append(name) + return found def _html_title(raw: str) -> str: @@ -488,7 +730,8 @@ def _html_title(raw: str) -> str: def _html_to_text(raw: str) -> str: - cleaned = SCRIPT_STYLE_RE.sub(" ", raw) + cleaned = re.sub(r"<head\b[^>]*>.*?</head>", " ", raw, flags=re.I | re.S) + cleaned = SCRIPT_STYLE_RE.sub(" ", cleaned) cleaned = re.sub(r"</(p|div|section|article|h[1-6]|li)>", "\n", cleaned, flags=re.I) cleaned = TAG_RE.sub(" ", cleaned) cleaned = html.unescape(cleaned) diff --git a/tests/test_epub3_intake.py b/tests/test_epub3_intake.py index b571954..a9aea4c 100644 --- a/tests/test_epub3_intake.py +++ b/tests/test_epub3_intake.py @@ -117,10 +117,12 @@ def test_epub3_intake_follows_spine_order_and_drops_non_body_by_default(tmp_path assert [chunk.spine_index for chunk in chunks] == [3, 4] assert all(chunk.section_role == SECTION_ROLE_BODY for chunk in chunks) assert all(chunk.source_type == "epub" for chunk in chunks) - # Chunk IDs must be stable, ordered, and not collapse to the book title. - assert chunks[0].chunk_id.startswith("reminiscences-of-a-stock-operator-004-") - assert chunks[1].chunk_id.startswith("reminiscences-of-a-stock-operator-005-") - assert chunks[0].chunk_id != chunks[1].chunk_id + # Stable chapter-NN IDs from in-document heading parsing, not collapsed + # to the Project Gutenberg page title. + assert [chunk.chunk_id for chunk in chunks] == ["chapter-01", "chapter-02"] + assert [chunk.chapter_label for chunk in chunks] == ["Chapter I", "Chapter II"] + assert [chunk.chapter_number for chunk in chunks] == [1, 2] + assert "Chapter I" not in chunks[0].markdown.split("\n", 1)[1] def test_epub3_intake_captures_book_metadata_provenance(tmp_path: Path) -> None: @@ -171,3 +173,145 @@ def test_epub3_intake_falls_back_to_archive_order_for_malformed_epub(tmp_path: P assert all(chunk.section_role == SECTION_ROLE_BODY for chunk in chunks) assert all(chunk.spine_index is None for chunk in chunks) assert all(chunk.book_metadata == {} for chunk in chunks) + assert all(chunk.chapter_number is None for chunk in chunks) + + +ROMAN_PACKAGE_OPF = """<?xml version="1.0" encoding="utf-8"?> +<package xmlns="http://www.idpf.org/2007/opf" version="3.0" unique-identifier="bookid"> + <metadata xmlns:dc="http://purl.org/dc/elements/1.1/"> + <dc:identifier id="bookid">urn:test:roman</dc:identifier> + <dc:title>Roman Chapters Book</dc:title> + <dc:creator>Test Author</dc:creator> + <dc:language>en</dc:language> + </metadata> + <manifest> + <item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/> + <item id="contents" href="contents.xhtml" media-type="application/xhtml+xml"/> + <item id="ch1" href="ch1.xhtml" media-type="application/xhtml+xml"/> + <item id="ch2" href="ch2.xhtml" media-type="application/xhtml+xml"/> + <item id="ch3" href="ch3.xhtml" media-type="application/xhtml+xml"/> + </manifest> + <spine> + <itemref idref="nav" linear="no"/> + <itemref idref="contents"/> + <itemref idref="ch1"/> + <itemref idref="ch2"/> + <itemref idref="ch3"/> + </spine> +</package> +""" + + +def _write_roman_chapter_epub(path: Path, *, ch1_words: int = 50) -> None: + long_body = " ".join(f"word{i}" for i in range(ch1_words)) + with zipfile.ZipFile(path, "w") as archive: + archive.writestr("mimetype", "application/epub+zip") + archive.writestr("META-INF/container.xml", CONTAINER_XML) + archive.writestr("OEBPS/content.opf", ROMAN_PACKAGE_OPF) + archive.writestr( + "OEBPS/nav.xhtml", + "<html><head><title>TOC" + "", + ) + archive.writestr( + "OEBPS/contents.xhtml", + "Book" + "

Contents

Listing.

", + ) + archive.writestr( + "OEBPS/ch1.xhtml", + "Book" + "

I

" + "

" + f'1 {long_body} ' + f'2 tail tail tail.' + "

", + ) + archive.writestr( + "OEBPS/ch2.xhtml", + "Book" + "

II

Short chapter two.

", + ) + archive.writestr( + "OEBPS/ch3.xhtml", + "Book" + "

III

Short chapter three.

", + ) + + +def test_epub3_intake_assigns_stable_chapter_ids_from_roman_headings(tmp_path: Path) -> None: + book = tmp_path / "roman.epub" + _write_roman_chapter_epub(book) + + chunks = normalize_source(book) + + # nav says ch1="Foreword" so chapter_number stays None there (non-numeric label). + # ch2/ch3 nav says II/III which match the in-document heading and parse to 2/3. + # contents.xhtml has heading "Contents" which reclassifies as toc (non-body) and is dropped by default. + assert [chunk.chapter_label for chunk in chunks] == ["Foreword", "II", "III"] + assert [chunk.chapter_number for chunk in chunks] == [None, 2, 3] + assert [chunk.chunk_id for chunk in chunks] == [ + "chapter-foreword", + "chapter-02", + "chapter-03", + ] + + +def test_epub3_intake_reclassifies_contents_body_section_when_included(tmp_path: Path) -> None: + book = tmp_path / "roman.epub" + _write_roman_chapter_epub(book) + + chunks = normalize_source(book, include_non_body=True) + + contents = next(chunk for chunk in chunks if chunk.spine_index == 1) + assert contents.section_role == "toc" + assert contents.chapter_label == "Contents" + + +def test_epub3_intake_splits_long_chapter_into_parts_with_anchors(tmp_path: Path) -> None: + book = tmp_path / "roman.epub" + _write_roman_chapter_epub(book, ch1_words=300) + + chunks = normalize_source(book, max_words=120) + + foreword_parts = [chunk for chunk in chunks if chunk.chapter_label == "Foreword"] + assert len(foreword_parts) >= 2 + assert [chunk.chunk_id for chunk in foreword_parts] == [ + f"chapter-foreword-part-{i + 1:03d}" for i in range(len(foreword_parts)) + ] + # Each part keeps the chapter heading and is named by chapter + part suffix. + assert all(chunk.markdown.startswith("# Foreword\n") for chunk in foreword_parts) + assert all(chunk.title.startswith("Foreword") for chunk in foreword_parts) + + # Page anchors land on the parts whose word range contains them. + first_part_anchors = foreword_parts[0].page_anchors + last_part_anchors = foreword_parts[-1].page_anchors + assert "Page_1" in first_part_anchors + assert "Page_2" in last_part_anchors + # Anchor markers must not leak into the final markdown text. + assert "⟦anchor:" not in foreword_parts[0].markdown + + +def test_epub3_intake_supports_word_overlap_between_chapter_parts(tmp_path: Path) -> None: + book = tmp_path / "roman.epub" + _write_roman_chapter_epub(book, ch1_words=200) + + chunks = normalize_source(book, max_words=80, overlap_words=20) + + foreword_parts = [chunk for chunk in chunks if chunk.chapter_label == "Foreword"] + assert len(foreword_parts) >= 2 + + def _body_words(markdown: str) -> list[str]: + body = markdown.split("\n", 2)[2] if markdown.count("\n") >= 2 else "" + return body.split() + + first = _body_words(foreword_parts[0].markdown) + second = _body_words(foreword_parts[1].markdown) + # The trailing overlap_words of the first part must reappear verbatim at + # the head of the next part. + assert first[-20:] == second[:20] diff --git a/workplans/IB-WP-0016-lefevre-ebook-infospace-readiness.md b/workplans/IB-WP-0016-lefevre-ebook-infospace-readiness.md index 2fc49d8..8aa24c1 100644 --- a/workplans/IB-WP-0016-lefevre-ebook-infospace-readiness.md +++ b/workplans/IB-WP-0016-lefevre-ebook-infospace-readiness.md @@ -99,7 +99,7 @@ state_hub_task_id: "a672fcf9-1b80-4faf-b16d-84ca52601dc9" ```task id: IB-WP-0016-T02 -status: in_progress +status: done priority: high state_hub_task_id: "47de1110-36d0-4d63-bf87-389746509e03" ```