generated from coulomb/repo-seed
IB-WP-0016-T02: chapter-aware chunking and stable IDs
Resolve chapter labels from EPUB nav entries (when present) and from the first in-document h1/h2/h3 heading, parse roman-numeral and "Chapter N" labels into numeric chapter indices, and generate stable IDs of the form chapter-NN with -part-NNN suffix when a chapter exceeds max_words. The chunker now operates on cleaned body text, distributes id="Page_*" page anchors per part via inline markers extracted before splitting, and supports a configurable overlap_words evidence window between adjacent parts of the same chapter. Reclassify body sections whose chapter label matches contents/transcriber-notes/license/colophon tokens so they leave the body stream by default. Strip <head>...</head> from HTML body extraction to stop the <title> tag from duplicating heading text in the chunk markdown. Real Lefevre EPUB now detects all 24 roman-numeral chapters with stable chapter-NN IDs, distributes Page_N anchors across multi-part chapters, and reclassifies Contents and Transcriber's Notes out of body (role histogram body=67, cover=1, header=1, toc=1, notes=1, footer=2). 82 tests pass. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -89,3 +89,26 @@ The remaining gap is title collapse: all body sections still share the
|
||||
Project Gutenberg page title because chapter headings are not yet read from
|
||||
in-document `<h1>` content. That collapse is T02's scope (chapter-aware
|
||||
chunking and stable IDs from in-document headings).
|
||||
|
||||
## T02 Result (2026-05-17)
|
||||
|
||||
Chapter-aware chunking and stable IDs landed. The same local Lefevre EPUB
|
||||
now produces:
|
||||
|
||||
- 67 body chunks at the default `max_words=800`; raising the limit to
|
||||
`max_words=2000` collapses them to 24 single-chunk chapters
|
||||
- All 24 roman-numeral chapters detected and assigned stable IDs
|
||||
`chapter-01` .. `chapter-24`; multi-part chapters get
|
||||
`chapter-NN-part-001`, `chapter-NN-part-002`, ...
|
||||
- Chapter labels resolved from the EPUB nav doc when it has an entry for the
|
||||
section, otherwise from the first in-document heading (`<h1>`–`<h6>`)
|
||||
- Project Gutenberg page-title collapse is gone: each chunk's title is the
|
||||
chapter label, not the shared book title
|
||||
- TOC body section ("Contents") reclassified to `toc`; transcriber's notes
|
||||
section reclassified to `notes`; section-role histogram is now
|
||||
`body=67, cover=1, header=1, toc=1, notes=1, footer=2`
|
||||
- Page anchors of the form `id="Page_N"` are preserved per chunk via the
|
||||
`page_anchors` provenance field (e.g. chapter-01 carries
|
||||
`Page_1..Page_14` distributed across its three parts)
|
||||
- Optional `overlap_words` parameter supports evidence-window context
|
||||
between adjacent parts of the same chapter without duplicating headings
|
||||
|
||||
@@ -261,6 +261,9 @@ def _register_source_chunks(root: Path, chunks: list[SourceChunk]) -> None:
|
||||
"section_role": chunk.section_role,
|
||||
"spine_index": chunk.spine_index,
|
||||
"book_metadata": dict(chunk.book_metadata),
|
||||
"chapter_label": chunk.chapter_label,
|
||||
"chapter_number": chunk.chapter_number,
|
||||
"page_anchors": list(chunk.page_anchors),
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
@@ -13,16 +13,30 @@ from xml.etree import ElementTree as ET
|
||||
from .errors import InfospaceError
|
||||
from .semantics import slugify
|
||||
|
||||
EXTRACTOR_VERSION = "generic-source-intake-v2"
|
||||
EXTRACTOR_VERSION = "generic-source-intake-v3"
|
||||
SUPPORTED_EXTENSIONS = {".md", ".markdown", ".txt", ".html", ".htm", ".epub"}
|
||||
HTML_TITLE_RE = re.compile(r"<title[^>]*>(?P<title>.*?)</title>", re.I | re.S)
|
||||
HTML_H1_RE = re.compile(r"<h1[^>]*>(?P<title>.*?)</h1>", re.I | re.S)
|
||||
HTML_FIRST_HEADING_RE = re.compile(
|
||||
r"<(?P<tag>h[1-6])[^>]*>(?P<title>.*?)</(?P=tag)>", re.I | re.S
|
||||
)
|
||||
SCRIPT_STYLE_RE = re.compile(r"<(script|style)[^>]*>.*?</\1>", re.I | re.S)
|
||||
TAG_RE = re.compile(r"<[^>]+>")
|
||||
ANCHOR_OPEN_TAG_RE = re.compile(
|
||||
r"""<(?P<name>[a-zA-Z][a-zA-Z0-9:-]*)[^>]*\bid=(?:"(?P<danchor>(?:Page_|page-|pg-|p-)[^"]*)"|'(?P<sanchor>(?:Page_|page-|pg-|p-)[^']*)')[^>]*>""",
|
||||
re.I,
|
||||
)
|
||||
ANCHOR_MARKER_RE = re.compile(r"⟦anchor:(?P<anchor>[^⟧]+)⟧")
|
||||
ROMAN_NUMERAL_RE = re.compile(r"^([MDCLXVI]+)\.?$", re.I)
|
||||
CHAPTER_NUMBER_RE = re.compile(
|
||||
r"^chapter\s+(?P<value>[ivxlcdm]+|\d+)\b",
|
||||
re.I,
|
||||
)
|
||||
|
||||
OPF_NS = "http://www.idpf.org/2007/opf"
|
||||
DC_NS = "http://purl.org/dc/elements/1.1/"
|
||||
CONTAINER_NS = "urn:oasis:names:tc:opendocument:xmlns:container"
|
||||
XHTML_NS = "http://www.w3.org/1999/xhtml"
|
||||
|
||||
SECTION_ROLE_BODY = "body"
|
||||
SECTION_ROLE_COVER = "cover"
|
||||
@@ -34,6 +48,24 @@ SECTION_ROLE_NOTES = "notes"
|
||||
SECTION_ROLE_LICENSE = "license"
|
||||
SECTION_ROLE_AUXILIARY = "auxiliary"
|
||||
|
||||
TOC_LABEL_TOKENS = {"contents", "table of contents", "toc"}
|
||||
NOTES_LABEL_TOKENS = {
|
||||
"transcribers notes",
|
||||
"transcriber notes",
|
||||
"transcribers note",
|
||||
"transcribers comments",
|
||||
"editors notes",
|
||||
"editor notes",
|
||||
}
|
||||
LICENSE_LABEL_TOKENS = {
|
||||
"license",
|
||||
"project gutenberg license",
|
||||
"the project gutenberg license",
|
||||
"license terms",
|
||||
"copyright",
|
||||
"colophon",
|
||||
}
|
||||
|
||||
PG_START_MARKERS = ("*** START OF THE PROJECT GUTENBERG EBOOK", "*** START OF THIS PROJECT GUTENBERG EBOOK")
|
||||
PG_END_MARKERS = ("*** END OF THE PROJECT GUTENBERG EBOOK", "*** END OF THIS PROJECT GUTENBERG EBOOK")
|
||||
|
||||
@@ -53,6 +85,9 @@ class SourceChunk:
|
||||
section_role: str = SECTION_ROLE_BODY
|
||||
spine_index: int | None = None
|
||||
book_metadata: dict = field(default_factory=dict)
|
||||
chapter_label: str | None = None
|
||||
chapter_number: int | None = None
|
||||
page_anchors: tuple = ()
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return asdict(self)
|
||||
@@ -68,6 +103,8 @@ class _SourceDocument:
|
||||
section_role: str = SECTION_ROLE_BODY
|
||||
spine_index: int | None = None
|
||||
book_metadata: dict = field(default_factory=dict)
|
||||
chapter_label: str | None = None
|
||||
chapter_number: int | None = None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
@@ -90,6 +127,7 @@ def normalize_source(
|
||||
max_words: int = 800,
|
||||
max_chunks: int | None = None,
|
||||
include_non_body: bool = False,
|
||||
overlap_words: int = 0,
|
||||
) -> list[SourceChunk]:
|
||||
source_path = Path(source)
|
||||
if not source_path.exists():
|
||||
@@ -112,27 +150,31 @@ def normalize_source(
|
||||
chunks: list[SourceChunk] = []
|
||||
used_ids: set[str] = set()
|
||||
for document in documents:
|
||||
pieces = _chunk_markdown(document.markdown, max_words=max_words)
|
||||
for index, piece in enumerate(pieces):
|
||||
title = document.title if len(pieces) == 1 else f"{document.title} Part {index + 1}"
|
||||
pieces = _split_document(document, max_words=max_words, overlap_words=overlap_words)
|
||||
for index, (part_title, part_markdown, part_anchors) in enumerate(pieces):
|
||||
base_id = (
|
||||
document.base_slug if len(pieces) == 1 else f"{document.base_slug}-part-{index + 1:03d}"
|
||||
document.base_slug
|
||||
if len(pieces) == 1
|
||||
else f"{document.base_slug}-part-{index + 1:03d}"
|
||||
)
|
||||
chunk_id = _dedupe_chunk_id(base_id, used_ids)
|
||||
chunks.append(
|
||||
SourceChunk(
|
||||
chunk_id=chunk_id,
|
||||
title=title,
|
||||
markdown=piece,
|
||||
title=part_title,
|
||||
markdown=part_markdown,
|
||||
source_type=document.source_type,
|
||||
original_path=document.original_path,
|
||||
digest=_digest_text(piece),
|
||||
digest=_digest_text(part_markdown),
|
||||
chunk_index=index,
|
||||
chunk_count=len(pieces),
|
||||
imported_at=imported_at,
|
||||
section_role=document.section_role,
|
||||
spine_index=document.spine_index,
|
||||
book_metadata=dict(document.book_metadata),
|
||||
chapter_label=document.chapter_label,
|
||||
chapter_number=document.chapter_number,
|
||||
page_anchors=tuple(part_anchors),
|
||||
)
|
||||
)
|
||||
if max_chunks is not None and max_chunks > 0 and len(chunks) >= max_chunks:
|
||||
@@ -342,6 +384,9 @@ def _epub3_spine_documents(
|
||||
metadata, manifest, spine = _parse_opf(archive, opf_path)
|
||||
book_title = metadata.get("title") or _title_from_path(source_path)
|
||||
book_slug = slugify(book_title) or slugify(source_path.stem) or "ebook"
|
||||
nav_labels = _load_nav_labels(archive, manifest)
|
||||
chapter_counter = 0
|
||||
used_chapter_numbers: set[int] = set()
|
||||
for spine_index, entry in enumerate(spine):
|
||||
item = manifest.get(entry.item_id)
|
||||
if item is None or not item.href:
|
||||
@@ -351,24 +396,59 @@ def _epub3_spine_documents(
|
||||
except KeyError:
|
||||
continue
|
||||
role = _classify_section(item, entry, raw)
|
||||
nav_label = nav_labels.get(item.href, "")
|
||||
heading_label = _first_heading_text(raw) if Path(item.href).suffix.lower() not in {".txt", ".md"} else ""
|
||||
chapter_label = nav_label or heading_label or None
|
||||
# Reclassify body sections whose chapter label matches known noise tokens
|
||||
if role == SECTION_ROLE_BODY and chapter_label:
|
||||
normalized_label = re.sub(r"[^a-z0-9 ]+", "", chapter_label.lower()).strip()
|
||||
if normalized_label in TOC_LABEL_TOKENS:
|
||||
role = SECTION_ROLE_TOC
|
||||
elif normalized_label in NOTES_LABEL_TOKENS:
|
||||
role = SECTION_ROLE_NOTES
|
||||
elif normalized_label in LICENSE_LABEL_TOKENS:
|
||||
role = SECTION_ROLE_LICENSE
|
||||
if role != SECTION_ROLE_BODY and not include_non_body:
|
||||
continue
|
||||
chapter_number: int | None = None
|
||||
if role == SECTION_ROLE_BODY and chapter_label:
|
||||
chapter_number = _parse_chapter_number(chapter_label)
|
||||
if role == SECTION_ROLE_BODY and chapter_number is None:
|
||||
# Fall back to sequential body counter when chapter label exists but
|
||||
# is not a roman/arabic numeral (e.g. "Preface").
|
||||
if chapter_label:
|
||||
chapter_counter += 1
|
||||
# Use sequential only when no other body has claimed this slot;
|
||||
# roman-numeral chapters take precedence and may overlap, so we
|
||||
# leave chapter_number=None for non-numeric labels and let the
|
||||
# slug fall back to the label slug.
|
||||
if chapter_number is not None:
|
||||
if chapter_number in used_chapter_numbers:
|
||||
# Duplicate numeric label across the book — keep label, drop the
|
||||
# numeric slot so the slug falls back to label-based naming.
|
||||
chapter_number = None
|
||||
else:
|
||||
used_chapter_numbers.add(chapter_number)
|
||||
suffix = Path(item.href).suffix.lower()
|
||||
if suffix in {".txt", ".md"}:
|
||||
title = _markdown_title(raw) or _title_from_path(Path(item.href))
|
||||
title = chapter_label or _markdown_title(raw) or _title_from_path(Path(item.href))
|
||||
markdown_body = _ensure_h1(_normalize_newlines(raw).strip() + "\n", title)
|
||||
else:
|
||||
title = _html_title(raw) or _title_from_path(Path(item.href))
|
||||
text = _html_to_text(raw)
|
||||
title = chapter_label or _html_title(raw) or _title_from_path(Path(item.href))
|
||||
marked = _inject_anchor_markers(raw)
|
||||
text = _html_to_text(marked)
|
||||
if text.lower().startswith(title.lower()):
|
||||
text = text[len(title) :].strip()
|
||||
markdown_body = f"# {title}\n\n{text}\n"
|
||||
section_slug = (
|
||||
slugify(title)
|
||||
or slugify(Path(item.href).stem)
|
||||
or f"section-{spine_index + 1:03d}"
|
||||
base_slug = _chapter_base_slug(
|
||||
role=role,
|
||||
chapter_number=chapter_number,
|
||||
chapter_label=chapter_label,
|
||||
book_slug=book_slug,
|
||||
spine_index=spine_index,
|
||||
href=item.href,
|
||||
title=title,
|
||||
)
|
||||
base_slug = f"{book_slug}-{spine_index + 1:03d}-{section_slug}"
|
||||
yield _SourceDocument(
|
||||
title=title,
|
||||
markdown=markdown_body,
|
||||
@@ -378,9 +458,119 @@ def _epub3_spine_documents(
|
||||
section_role=role,
|
||||
spine_index=spine_index,
|
||||
book_metadata=metadata,
|
||||
chapter_label=chapter_label,
|
||||
chapter_number=chapter_number,
|
||||
)
|
||||
|
||||
|
||||
def _chapter_base_slug(
    *,
    role: str,
    chapter_number: int | None,
    chapter_label: str | None,
    book_slug: str,
    spine_index: int,
    href: str,
    title: str,
) -> str:
    """Return the base chunk ID for one spine section.

    Body sections with a numeric chapter get ``chapter-NN``; body sections
    with only a label get ``chapter-<label slug>``. Everything else keeps the
    ``<book>-<spine ordinal>-<section slug>`` form.
    """
    # Last-resort slug used whenever slugify() yields an empty string.
    ordinal_slug = f"section-{spine_index + 1:03d}"

    if role == SECTION_ROLE_BODY:
        if chapter_number is not None:
            return f"chapter-{chapter_number:02d}"
        if chapter_label:
            label_slug = slugify(chapter_label) or ordinal_slug
            return f"chapter-{label_slug}"

    section_slug = slugify(title) or slugify(Path(href).stem) or ordinal_slug
    return f"{book_slug}-{spine_index + 1:03d}-{section_slug}"
|
||||
|
||||
|
||||
def _load_nav_labels(
    archive: zipfile.ZipFile, manifest: dict[str, _EpubManifestItem]
) -> dict[str, str]:
    """Map resolved archive paths to the link text found in the nav document.

    The nav document is the first manifest item whose ``properties`` contain
    ``"nav"``. Returns an empty mapping when there is no such item or the
    archive has no entry for it. When several links resolve to the same
    path, the first label wins.
    """
    nav_item = None
    for candidate in manifest.values():
        if "nav" in candidate.properties:
            nav_item = candidate
            break
    if nav_item is None:
        return {}

    try:
        payload = archive.read(nav_item.href)
    except KeyError:
        return {}
    nav_html = payload.decode("utf-8", errors="replace")

    # Captures the href (fragment excluded) and inner markup of every <a>.
    link_re = re.compile(
        r"""<a[^>]*\bhref=(?:"(?P<dhref>[^"#]+)(?:#[^"]*)?"|'(?P<shref>[^'#]+)(?:#[^']*)?')[^>]*>(?P<label>.*?)</a>""",
        re.I | re.S,
    )
    nav_dir = _zip_dirname(nav_item.href)
    labels: dict[str, str] = {}
    for link in link_re.finditer(nav_html):
        target = link.group("dhref") or link.group("shref") or ""
        if target:
            text = _collapse_ws(_html_to_text(link.group("label")))
            if text:
                labels.setdefault(_join_zip_path(nav_dir, target), text)
    return labels
|
||||
|
||||
|
||||
def _first_heading_text(html_raw: str) -> str:
    """Return the whitespace-collapsed text of the first heading, or ""."""
    found = HTML_FIRST_HEADING_RE.search(html_raw)
    if found is None:
        return ""
    inner_markup = found.group("title")
    return _collapse_ws(_html_to_text(inner_markup))
|
||||
|
||||
|
||||
def _parse_chapter_number(label: str) -> int | None:
|
||||
stripped = label.strip()
|
||||
if not stripped:
|
||||
return None
|
||||
if stripped.isdigit():
|
||||
return int(stripped)
|
||||
roman_match = ROMAN_NUMERAL_RE.match(stripped)
|
||||
if roman_match:
|
||||
value = _roman_to_int(roman_match.group(1).upper())
|
||||
if value > 0:
|
||||
return value
|
||||
chapter_match = CHAPTER_NUMBER_RE.match(stripped)
|
||||
if chapter_match:
|
||||
token = chapter_match.group("value")
|
||||
if token.isdigit():
|
||||
return int(token)
|
||||
value = _roman_to_int(token.upper())
|
||||
if value > 0:
|
||||
return value
|
||||
return None
|
||||
|
||||
|
||||
def _roman_to_int(value: str) -> int:
    """Convert an upper-case roman numeral to an integer.

    Returns 0 when ``value`` is empty, contains a character that is not an
    upper-case roman digit, or is not a well-formed numeral. Well-formed
    means thousands, hundreds, tens and units appear in that order, each
    written subtractively ("IV", "XC") or additively with at most four
    repeats ("IIII" is accepted because older books use it).
    """
    # Without this check, strings like "IC", "IIX" or the word "CIVIL" would
    # be summed into a bogus positive value and treated as a chapter number.
    well_formed = re.fullmatch(
        r"M{0,4}(?:CM|CD|D?C{0,4})(?:XC|XL|L?X{0,4})(?:IX|IV|V?I{0,4})",
        value,
    )
    if well_formed is None:
        return 0

    table = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000}
    total = 0
    prev = 0
    # Walk right to left: a digit smaller than its right neighbour subtracts.
    for ch in reversed(value):
        cur = table[ch]
        if cur < prev:
            total -= cur
        else:
            total += cur
        prev = cur
    return total
|
||||
|
||||
|
||||
def _inject_anchor_markers(raw: str) -> str:
    """Append an inline ``⟦anchor:ID⟧`` marker after each page-anchor tag.

    Tags are the ones matched by ``ANCHOR_OPEN_TAG_RE``. The tag itself is
    kept, and the marker is padded with spaces on both sides.
    """

    def _with_marker(hit: re.Match) -> str:
        tag = hit.group(0)
        anchor_id = hit.group("danchor") or hit.group("sanchor")
        return f"{tag} ⟦anchor:{anchor_id}⟧ " if anchor_id else tag

    return ANCHOR_OPEN_TAG_RE.sub(_with_marker, raw)
|
||||
|
||||
|
||||
def _epub_legacy_documents(
|
||||
archive: zipfile.ZipFile, source_path: Path
|
||||
) -> Iterable[_SourceDocument]:
|
||||
@@ -464,20 +654,72 @@ def _join_zip_path(base: str, href: str) -> str:
|
||||
return f"{base}/{href}"
|
||||
|
||||
|
||||
def _chunk_markdown(markdown: str, *, max_words: int) -> list[str]:
    """Split markdown into pieces of at most ``max_words`` body words.

    Text that already fits, or a non-positive ``max_words``, yields one piece
    (the stripped text plus a newline). Otherwise the first ``# `` heading
    line is removed from the body and each piece is headed
    ``# <heading> Part N``.
    """
    text = markdown.strip()
    if max_words <= 0 or len(text.split()) <= max_words:
        return [text + "\n"]

    heading = _markdown_title(text) or "Source"
    body_words = re.sub(r"(?m)^# .+?\n+", "", text, count=1).split()
    offsets = range(0, len(body_words), max_words)
    pieces = (
        " ".join(body_words[offset : offset + max_words]).strip()
        for offset in offsets
    )
    return [
        f"# {heading} Part {number}\n\n{piece}\n"
        for number, piece in enumerate(pieces, start=1)
    ]
|
||||
def _split_document(
    document: _SourceDocument,
    *,
    max_words: int,
    overlap_words: int,
) -> list[tuple[str, str, list[str]]]:
    """Split one document into ``(title, markdown, page_anchors)`` parts.

    The first ``# `` heading line is removed from the body, anchor markers
    are extracted, and the cleaned body is cut into windows of at most
    ``max_words`` words. Every part is re-headed with the document heading.

    Args:
        document: Source document; only ``markdown`` and ``title`` are read.
        max_words: Window size in words. A non-positive value, or a body
            that already fits, yields a single part that keeps the original
            body whitespace.
        overlap_words: Words repeated from the end of one part at the start
            of the next; clamped to ``0 .. max_words - 1``.

    Returns:
        One tuple per part. Multi-part titles get a `` Part N`` suffix, and
        each part lists the anchors whose word position falls inside it.
    """
    text = document.markdown.strip()
    heading = _markdown_title(text) or document.title or "Source"
    body_with_markers = re.sub(r"(?m)^# .+?\n+", "", text, count=1).strip()
    clean_body, anchor_positions = _extract_anchor_positions(body_with_markers)
    words = clean_body.split()
    if max_words <= 0 or len(words) <= max_words:
        anchors = [name for name, _idx in anchor_positions]
        return [(document.title, _compose_chunk(heading, clean_body), anchors)]

    # Clamping keeps the step at least 1, so the loop always advances.
    overlap = max(0, min(overlap_words, max_words - 1))
    step = max_words - overlap
    total = len(words)
    parts: list[tuple[str, str, list[str]]] = []
    start = 0
    while start < total:
        end = min(start + max_words, total)
        is_last = end >= total
        # An anchor sitting after the final word has position == total. The
        # half-open range would drop it from every part, so the last part
        # also claims that position.
        anchor_end = end + 1 if is_last else end
        part_anchors = _anchors_in_range(anchor_positions, start, anchor_end)
        part_text = " ".join(words[start:end]).strip()
        part_title = f"{document.title} Part {len(parts) + 1}"
        parts.append((part_title, _compose_chunk(heading, part_text), part_anchors))
        if is_last:
            break
        start += step
    return parts
|
||||
|
||||
|
||||
def _compose_chunk(heading: str, body: str) -> str:
    """Render a chunk as an H1 heading followed by the stripped body.

    A blank body produces the heading line alone.
    """
    trimmed = body.strip()
    lines = [f"# {heading}"]
    if trimmed:
        lines += ["", trimmed]
    return "\n".join(lines) + "\n"
|
||||
|
||||
|
||||
def _extract_anchor_positions(text: str) -> tuple[str, list[tuple[str, int]]]:
    """Strip anchor markers from ``text`` and record where each one sat.

    Returns:
        A pair ``(cleaned, anchors)``. ``cleaned`` is the text without
        markers, with runs of spaces/tabs collapsed and the ends stripped.
        ``anchors`` lists ``(anchor_id, word_index)`` in document order,
        where ``word_index`` is the number of whitespace-separated words
        that precede the marker.
    """
    segments: list[str] = []
    anchors: list[tuple[str, int]] = []
    cursor = 0
    # Running count of words seen so far; avoids re-splitting every earlier
    # segment for each marker.
    words_seen = 0
    for match in ANCHOR_MARKER_RE.finditer(text):
        segment = text[cursor : match.start()]
        segments.append(segment)
        words_seen += len(segment.split())
        anchors.append((match.group("anchor"), words_seen))
        cursor = match.end()
    segments.append(text[cursor:])
    cleaned = re.sub(r"[ \t]{2,}", " ", "".join(segments)).strip()
    return cleaned, anchors
|
||||
|
||||
|
||||
def _anchors_in_range(
    anchor_positions: list[tuple[str, int]], start: int, end: int
) -> list[str]:
    """Return the distinct anchor names positioned in ``[start, end)``.

    Names keep the order of their first in-range occurrence.
    """
    in_window = (
        name for name, idx in anchor_positions if start <= idx < end
    )
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(in_window))
|
||||
|
||||
|
||||
def _html_title(raw: str) -> str:
|
||||
@@ -488,7 +730,8 @@ def _html_title(raw: str) -> str:
|
||||
|
||||
|
||||
def _html_to_text(raw: str) -> str:
|
||||
cleaned = SCRIPT_STYLE_RE.sub(" ", raw)
|
||||
cleaned = re.sub(r"<head\b[^>]*>.*?</head>", " ", raw, flags=re.I | re.S)
|
||||
cleaned = SCRIPT_STYLE_RE.sub(" ", cleaned)
|
||||
cleaned = re.sub(r"</(p|div|section|article|h[1-6]|li)>", "\n", cleaned, flags=re.I)
|
||||
cleaned = TAG_RE.sub(" ", cleaned)
|
||||
cleaned = html.unescape(cleaned)
|
||||
|
||||
@@ -117,10 +117,12 @@ def test_epub3_intake_follows_spine_order_and_drops_non_body_by_default(tmp_path
|
||||
assert [chunk.spine_index for chunk in chunks] == [3, 4]
|
||||
assert all(chunk.section_role == SECTION_ROLE_BODY for chunk in chunks)
|
||||
assert all(chunk.source_type == "epub" for chunk in chunks)
|
||||
# Chunk IDs must be stable, ordered, and not collapse to the book title.
|
||||
assert chunks[0].chunk_id.startswith("reminiscences-of-a-stock-operator-004-")
|
||||
assert chunks[1].chunk_id.startswith("reminiscences-of-a-stock-operator-005-")
|
||||
assert chunks[0].chunk_id != chunks[1].chunk_id
|
||||
# Stable chapter-NN IDs from in-document heading parsing, not collapsed
|
||||
# to the Project Gutenberg page title.
|
||||
assert [chunk.chunk_id for chunk in chunks] == ["chapter-01", "chapter-02"]
|
||||
assert [chunk.chapter_label for chunk in chunks] == ["Chapter I", "Chapter II"]
|
||||
assert [chunk.chapter_number for chunk in chunks] == [1, 2]
|
||||
assert "Chapter I" not in chunks[0].markdown.split("\n", 1)[1]
|
||||
|
||||
|
||||
def test_epub3_intake_captures_book_metadata_provenance(tmp_path: Path) -> None:
|
||||
@@ -171,3 +173,145 @@ def test_epub3_intake_falls_back_to_archive_order_for_malformed_epub(tmp_path: P
|
||||
assert all(chunk.section_role == SECTION_ROLE_BODY for chunk in chunks)
|
||||
assert all(chunk.spine_index is None for chunk in chunks)
|
||||
assert all(chunk.book_metadata == {} for chunk in chunks)
|
||||
assert all(chunk.chapter_number is None for chunk in chunks)
|
||||
|
||||
|
||||
ROMAN_PACKAGE_OPF = """<?xml version="1.0" encoding="utf-8"?>
|
||||
<package xmlns="http://www.idpf.org/2007/opf" version="3.0" unique-identifier="bookid">
|
||||
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
|
||||
<dc:identifier id="bookid">urn:test:roman</dc:identifier>
|
||||
<dc:title>Roman Chapters Book</dc:title>
|
||||
<dc:creator>Test Author</dc:creator>
|
||||
<dc:language>en</dc:language>
|
||||
</metadata>
|
||||
<manifest>
|
||||
<item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>
|
||||
<item id="contents" href="contents.xhtml" media-type="application/xhtml+xml"/>
|
||||
<item id="ch1" href="ch1.xhtml" media-type="application/xhtml+xml"/>
|
||||
<item id="ch2" href="ch2.xhtml" media-type="application/xhtml+xml"/>
|
||||
<item id="ch3" href="ch3.xhtml" media-type="application/xhtml+xml"/>
|
||||
</manifest>
|
||||
<spine>
|
||||
<itemref idref="nav" linear="no"/>
|
||||
<itemref idref="contents"/>
|
||||
<itemref idref="ch1"/>
|
||||
<itemref idref="ch2"/>
|
||||
<itemref idref="ch3"/>
|
||||
</spine>
|
||||
</package>
|
||||
"""
|
||||
|
||||
|
||||
def _write_roman_chapter_epub(path: Path, *, ch1_words: int = 50) -> None:
    """Write a small EPUB fixture with a contents page and three chapters.

    Chapter one is headed "I" in the document but labelled "Foreword" in the
    nav. Its body is ``ch1_words`` filler words placed between two ``Page_N``
    anchors.
    """
    filler = " ".join(f"word{i}" for i in range(ch1_words))

    def _page(body: str) -> str:
        # Every content page shares the same <head> and wrapper markup.
        return f"<html><head><title>Book</title></head><body>{body}</body></html>"

    nav_document = (
        "<html><head><title>TOC</title></head><body>"
        "<nav epub:type='toc'><ol>"
        "<li><a href='contents.xhtml'>Contents</a></li>"
        "<li><a href='ch1.xhtml'>Foreword</a></li>"
        "<li><a href='ch2.xhtml'>II</a></li>"
        "<li><a href='ch3.xhtml'>III</a></li>"
        "</ol></nav></body></html>"
    )
    chapter_one = (
        "<h2>I</h2><p>"
        f'<span id="Page_1">1</span> {filler} '
        '<span id="Page_2">2</span> tail tail tail.'
        "</p>"
    )
    # Dict order is the write order inside the archive.
    entries = {
        "mimetype": "application/epub+zip",
        "META-INF/container.xml": CONTAINER_XML,
        "OEBPS/content.opf": ROMAN_PACKAGE_OPF,
        "OEBPS/nav.xhtml": nav_document,
        "OEBPS/contents.xhtml": _page("<h2>Contents</h2><p>Listing.</p>"),
        "OEBPS/ch1.xhtml": _page(chapter_one),
        "OEBPS/ch2.xhtml": _page("<h2>II</h2><p>Short chapter two.</p>"),
        "OEBPS/ch3.xhtml": _page("<h2>III</h2><p>Short chapter three.</p>"),
    }
    with zipfile.ZipFile(path, "w") as archive:
        for member_name, content in entries.items():
            archive.writestr(member_name, content)
|
||||
|
||||
|
||||
def test_epub3_intake_assigns_stable_chapter_ids_from_roman_headings(tmp_path: Path) -> None:
    """Nav labels win over headings; numeric labels yield chapter-NN IDs."""
    epub_path = tmp_path / "roman.epub"
    _write_roman_chapter_epub(epub_path)

    chunks = normalize_source(epub_path)

    # The nav labels ch1 "Foreword", so it has no chapter number and its ID
    # comes from the label slug. ch2/ch3 are labelled II/III and parse to 2/3.
    # contents.xhtml is reclassified as toc and dropped by default.
    labels = [chunk.chapter_label for chunk in chunks]
    numbers = [chunk.chapter_number for chunk in chunks]
    chunk_ids = [chunk.chunk_id for chunk in chunks]
    assert labels == ["Foreword", "II", "III"]
    assert numbers == [None, 2, 3]
    assert chunk_ids == ["chapter-foreword", "chapter-02", "chapter-03"]
|
||||
|
||||
|
||||
def test_epub3_intake_reclassifies_contents_body_section_when_included(tmp_path: Path) -> None:
    """With non-body sections included, the Contents page is tagged toc."""
    epub_path = tmp_path / "roman.epub"
    _write_roman_chapter_epub(epub_path)

    all_chunks = normalize_source(epub_path, include_non_body=True)

    # Spine index 1 is contents.xhtml (index 0 is the nav document).
    contents_chunk = next(chunk for chunk in all_chunks if chunk.spine_index == 1)
    assert contents_chunk.section_role == "toc"
    assert contents_chunk.chapter_label == "Contents"
|
||||
|
||||
|
||||
def test_epub3_intake_splits_long_chapter_into_parts_with_anchors(tmp_path: Path) -> None:
    """A chapter over max_words splits into -part-NNN chunks with anchors."""
    epub_path = tmp_path / "roman.epub"
    _write_roman_chapter_epub(epub_path, ch1_words=300)

    parts = [
        chunk
        for chunk in normalize_source(epub_path, max_words=120)
        if chunk.chapter_label == "Foreword"
    ]

    assert len(parts) >= 2
    expected_ids = [
        f"chapter-foreword-part-{number:03d}" for number in range(1, len(parts) + 1)
    ]
    assert [chunk.chunk_id for chunk in parts] == expected_ids

    # Each part keeps the chapter heading and is named by chapter + part suffix.
    for chunk in parts:
        assert chunk.markdown.startswith("# Foreword\n")
        assert chunk.title.startswith("Foreword")

    # Page anchors land on the parts whose word range contains them.
    assert "Page_1" in parts[0].page_anchors
    assert "Page_2" in parts[-1].page_anchors
    # Anchor markers must not leak into the final markdown text.
    assert "⟦anchor:" not in parts[0].markdown
|
||||
|
||||
|
||||
def test_epub3_intake_supports_word_overlap_between_chapter_parts(tmp_path: Path) -> None:
    """Adjacent parts of a chapter share ``overlap_words`` words verbatim."""
    epub_path = tmp_path / "roman.epub"
    _write_roman_chapter_epub(epub_path, ch1_words=200)

    parts = [
        chunk
        for chunk in normalize_source(epub_path, max_words=80, overlap_words=20)
        if chunk.chapter_label == "Foreword"
    ]
    assert len(parts) >= 2

    def _body_words(markdown: str) -> list[str]:
        # Drop the heading line and the blank line after it; the rest is body.
        pieces = markdown.split("\n", 2)
        return pieces[2].split() if len(pieces) == 3 else []

    head_part = _body_words(parts[0].markdown)
    next_part = _body_words(parts[1].markdown)
    # The trailing overlap_words of the first part must reappear verbatim at
    # the head of the next part.
    assert head_part[-20:] == next_part[:20]
|
||||
|
||||
@@ -99,7 +99,7 @@ state_hub_task_id: "a672fcf9-1b80-4faf-b16d-84ca52601dc9"
|
||||
|
||||
```task
|
||||
id: IB-WP-0016-T02
|
||||
status: in_progress
|
||||
status: done
|
||||
priority: high
|
||||
state_hub_task_id: "47de1110-36d0-4d63-bf87-389746509e03"
|
||||
```
|
||||
|
||||
Reference in New Issue
Block a user