diff --git a/docs/lefevre-epub3-validation.md b/docs/lefevre-epub3-validation.md index 056dcf9..1728a65 100644 --- a/docs/lefevre-epub3-validation.md +++ b/docs/lefevre-epub3-validation.md @@ -68,3 +68,24 @@ able to show: - selected chapter or chunk filters for smoke runs - deterministic fixture acceptance on a small Lefevre-like subset - optional live one-chapter smoke run with explicit provider/model/cost caps + +## T01 Result (2026-05-17) + +Spine-aware EPUB3 intake landed. Re-running the local Lefevre EPUB through +`normalize_source(...)` now yields: + +- 148 body chunks (default), down from the original 155 mixed chunks +- Spine reading order: indices 0..27 in declared order, not archive-name sort +- Full OPF metadata on every chunk's `book_metadata`: + title, creator, language, subjects, rights, identifier, source_url, modified +- Section roles classified across the 154 spine items: + `body=148`, `footer=4`, `cover=1`, `header=1` +- The four Gutenberg footer/license/notes sections and the `*** START OF…` + header section are now excluded from generation input by default and + available via `include_non_body=True` for inspection +- The legacy zip-without-OPF fallback path is preserved for malformed EPUBs + +The remaining gap is title collapse: all body sections still share the +Project Gutenberg page title because chapter headings are not yet read from +in-document `

` content. That collapse is T02's scope (chapter-aware +chunking and stable IDs from in-document headings). diff --git a/src/infospace_bench/generator.py b/src/infospace_bench/generator.py index bb948cb..dd09546 100644 --- a/src/infospace_bench/generator.py +++ b/src/infospace_bench/generator.py @@ -258,6 +258,9 @@ def _register_source_chunks(root: Path, chunks: list[SourceChunk]) -> None: "chunk_count": chunk.chunk_count, "imported_at": chunk.imported_at, "extractor_version": chunk.extractor_version, + "section_role": chunk.section_role, + "spine_index": chunk.spine_index, + "book_metadata": dict(chunk.book_metadata), }, ) diff --git a/src/infospace_bench/source_intake.py b/src/infospace_bench/source_intake.py index bb98266..688bca6 100644 --- a/src/infospace_bench/source_intake.py +++ b/src/infospace_bench/source_intake.py @@ -4,21 +4,39 @@ import hashlib import html import re import zipfile -from dataclasses import asdict, dataclass +from dataclasses import asdict, dataclass, field from datetime import datetime, timezone from pathlib import Path from typing import Iterable +from xml.etree import ElementTree as ET from .errors import InfospaceError from .semantics import slugify -EXTRACTOR_VERSION = "generic-source-intake-v1" +EXTRACTOR_VERSION = "generic-source-intake-v2" SUPPORTED_EXTENSIONS = {".md", ".markdown", ".txt", ".html", ".htm", ".epub"} HTML_TITLE_RE = re.compile(r"]*>(?P.*?)", re.I | re.S) HTML_H1_RE = re.compile(r"]*>(?P.*?)</h1>", re.I | re.S) SCRIPT_STYLE_RE = re.compile(r"<(script|style)[^>]*>.*?</\1>", re.I | re.S) TAG_RE = re.compile(r"<[^>]+>") +OPF_NS = "http://www.idpf.org/2007/opf" +DC_NS = "http://purl.org/dc/elements/1.1/" +CONTAINER_NS = "urn:oasis:names:tc:opendocument:xmlns:container" + +SECTION_ROLE_BODY = "body" +SECTION_ROLE_COVER = "cover" +SECTION_ROLE_NAV = "nav" +SECTION_ROLE_TOC = "toc" +SECTION_ROLE_HEADER = "header" +SECTION_ROLE_FOOTER = "footer" +SECTION_ROLE_NOTES = "notes" +SECTION_ROLE_LICENSE = "license" +SECTION_ROLE_AUXILIARY = "auxiliary" + +PG_START_MARKERS = ("*** START OF THE PROJECT GUTENBERG EBOOK", "*** START OF THIS PROJECT GUTENBERG EBOOK") +PG_END_MARKERS = ("*** END OF THE PROJECT GUTENBERG EBOOK", "*** END OF THIS PROJECT GUTENBERG EBOOK") + @dataclass(frozen=True) class SourceChunk: @@ -32,6 +50,9 @@ class SourceChunk: chunk_count: int imported_at: str extractor_version: str = EXTRACTOR_VERSION + section_role: str = SECTION_ROLE_BODY + spine_index: int | None = None + book_metadata: dict = field(default_factory=dict) def to_dict(self) -> dict: return asdict(self) @@ -44,6 +65,23 @@ class _SourceDocument: source_type: str original_path: str base_slug: str + section_role: str = SECTION_ROLE_BODY + spine_index: int | None = None + book_metadata: dict = field(default_factory=dict) + + +@dataclass(frozen=True) +class _EpubManifestItem: + item_id: str + href: str + media_type: str + properties: frozenset + + +@dataclass(frozen=True) +class _EpubSpineEntry: + item_id: str + linear: bool def normalize_source( @@ -51,6 +89,7 @@ def normalize_source( *, max_words: int = 800, max_chunks: int | None = None, + include_non_body: bool = False, ) -> list[SourceChunk]: source_path = Path(source) if not source_path.exists(): @@ -59,7 +98,7 @@ def normalize_source( f"Source path does not exist: {source_path}", {"source": str(source_path)}, ) - documents = list(_iter_documents(source_path)) + documents = list(_iter_documents(source_path, include_non_body=include_non_body)) if not documents: raise InfospaceError( "unsupported_source", @@ -91,6 +130,9 @@ def normalize_source( chunk_index=index, chunk_count=len(pieces), imported_at=imported_at, + section_role=document.section_role, + spine_index=document.spine_index, + book_metadata=dict(document.book_metadata), ) ) if max_chunks is not None and max_chunks > 0 and len(chunks) >= max_chunks: @@ -98,11 +140,13 @@ def normalize_source( return chunks -def _iter_documents(source_path: Path) -> Iterable[_SourceDocument]: +def _iter_documents( + source_path: Path, *, include_non_body: bool +) -> Iterable[_SourceDocument]: if source_path.is_dir(): for path in sorted(source_path.rglob("*")): if path.is_file() and path.suffix.lower() in SUPPORTED_EXTENSIONS: - yield from _iter_documents(path) + yield from _iter_documents(path, include_non_body=include_non_body) return suffix = source_path.suffix.lower() @@ -113,7 +157,7 @@ def _iter_documents(source_path: Path) -> Iterable[_SourceDocument]: elif suffix in (".html", ".htm"): yield _html_document(source_path, source_type="html") elif suffix == ".epub": - yield from _epub_documents(source_path) + yield from _epub_documents(source_path, include_non_body=include_non_body) def _markdown_document(path: Path) -> _SourceDocument: @@ -163,35 +207,18 @@ def _html_document( ) -def _epub_documents(path: Path) -> Iterable[_SourceDocument]: +def _epub_documents( + path: Path, *, include_non_body: bool +) -> Iterable[_SourceDocument]: try: with zipfile.ZipFile(path) as archive: - names = [ - name - for name in sorted(archive.namelist()) - if Path(name).suffix.lower() in {".html", ".htm", ".xhtml", ".txt", ".md"} - and not name.endswith("/") - ] - for name in names: - raw = archive.read(name).decode("utf-8", errors="replace") - pseudo_path = Path(name) - if pseudo_path.suffix.lower() in {".txt", ".md"}: - title = _markdown_title(raw) or _title_from_path(pseudo_path) - markdown = _ensure_h1(_normalize_newlines(raw).strip() + "\n", title) - yield _SourceDocument( - title=title, - markdown=markdown, - source_type="epub", - original_path=f"{path}!{name}", - base_slug=slugify(title) or slugify(pseudo_path.stem) or "source", - ) - else: - yield _html_document( - pseudo_path, - source_type="epub", - original_path=f"{path}!{name}", - text=raw, - ) + opf_path = _resolve_opf_path(archive) + if opf_path is not None: + yield from _epub3_spine_documents( + archive, path, opf_path, include_non_body=include_non_body + ) + else: + yield from _epub_legacy_documents(archive, path) except zipfile.BadZipFile as exc: raise InfospaceError( "invalid_epub_source", @@ -200,6 +227,243 @@ def _epub_documents(path: Path) -> Iterable[_SourceDocument]: ) from exc +def _resolve_opf_path(archive: zipfile.ZipFile) -> str | None: + try: + raw = archive.read("META-INF/container.xml") + except KeyError: + return None + try: + root = ET.fromstring(raw) + except ET.ParseError: + return None + rootfile = root.find(f"{{{CONTAINER_NS}}}rootfiles/{{{CONTAINER_NS}}}rootfile") + if rootfile is None: + return None + full_path = rootfile.attrib.get("full-path") + if not full_path: + return None + if full_path not in archive.namelist(): + return None + return full_path + + +def _parse_opf( + archive: zipfile.ZipFile, opf_path: str +) -> tuple[dict, dict[str, _EpubManifestItem], list[_EpubSpineEntry]]: + raw = archive.read(opf_path).decode("utf-8", errors="replace") + root = ET.fromstring(raw) + metadata = _parse_opf_metadata(root) + base = _zip_dirname(opf_path) + manifest: dict[str, _EpubManifestItem] = {} + for item in root.findall(f"{{{OPF_NS}}}manifest/{{{OPF_NS}}}item"): + href = item.attrib.get("href", "") + item_id = item.attrib.get("id", "") + if not href or not item_id: + continue + manifest[item_id] = _EpubManifestItem( + item_id=item_id, + href=_join_zip_path(base, href), + media_type=item.attrib.get("media-type", ""), + properties=frozenset((item.attrib.get("properties") or "").split()), + ) + spine: list[_EpubSpineEntry] = [] + for entry in root.findall(f"{{{OPF_NS}}}spine/{{{OPF_NS}}}itemref"): + idref = entry.attrib.get("idref") + if not idref: + continue + spine.append( + _EpubSpineEntry( + item_id=idref, + linear=entry.attrib.get("linear", "yes") != "no", + ) + ) + return metadata, manifest, spine + + +def _parse_opf_metadata(opf_root: ET.Element) -> dict: + md = opf_root.find(f"{{{OPF_NS}}}metadata") + if md is None: + return {} + + def _first_text(tag: str) -> str: + el = md.find(f"{{{DC_NS}}}{tag}") + return _collapse_ws(el.text) if el is not None and el.text else "" + + def _all_text(tag: str) -> list[str]: + return [ + _collapse_ws(el.text) + for el in md.findall(f"{{{DC_NS}}}{tag}") + if el is not None and el.text + ] + + out: dict = {} + title = _first_text("title") + if title: + out["title"] = title + creators = _all_text("creator") + if creators: + out["creator"] = creators[0] + if len(creators) > 1: + out["creators"] = creators + language = _first_text("language") + if language: + out["language"] = language + rights = _first_text("rights") + if rights: + out["rights"] = rights + subjects = _all_text("subject") + if subjects: + out["subjects"] = subjects + identifier = _first_text("identifier") + if identifier: + out["identifier"] = identifier + source_url = _first_text("source") + if source_url: + out["source_url"] = source_url + for meta in md.findall(f"{{{OPF_NS}}}meta"): + prop = meta.attrib.get("property", "") + text = _collapse_ws(meta.text) if meta.text else "" + if not text: + continue + if prop == "dcterms:modified": + out["modified"] = text + elif prop == "dcterms:source" and "source_url" not in out: + out["source_url"] = text + return out + + +def _epub3_spine_documents( + archive: zipfile.ZipFile, + source_path: Path, + opf_path: str, + *, + include_non_body: bool, +) -> Iterable[_SourceDocument]: + metadata, manifest, spine = _parse_opf(archive, opf_path) + book_title = metadata.get("title") or _title_from_path(source_path) + book_slug = slugify(book_title) or slugify(source_path.stem) or "ebook" + for spine_index, entry in enumerate(spine): + item = manifest.get(entry.item_id) + if item is None or not item.href: + continue + try: + raw = archive.read(item.href).decode("utf-8", errors="replace") + except KeyError: + continue + role = _classify_section(item, entry, raw) + if role != SECTION_ROLE_BODY and not include_non_body: + continue + suffix = Path(item.href).suffix.lower() + if suffix in {".txt", ".md"}: + title = _markdown_title(raw) or _title_from_path(Path(item.href)) + markdown_body = _ensure_h1(_normalize_newlines(raw).strip() + "\n", title) + else: + title = _html_title(raw) or _title_from_path(Path(item.href)) + text = _html_to_text(raw) + if text.lower().startswith(title.lower()): + text = text[len(title) :].strip() + markdown_body = f"# {title}\n\n{text}\n" + section_slug = ( + slugify(title) + or slugify(Path(item.href).stem) + or f"section-{spine_index + 1:03d}" + ) + base_slug = f"{book_slug}-{spine_index + 1:03d}-{section_slug}" + yield _SourceDocument( + title=title, + markdown=markdown_body, + source_type="epub", + original_path=f"{source_path}!{item.href}", + base_slug=base_slug, + section_role=role, + spine_index=spine_index, + book_metadata=metadata, + ) + + +def _epub_legacy_documents( + archive: zipfile.ZipFile, source_path: Path +) -> Iterable[_SourceDocument]: + names = [ + name + for name in sorted(archive.namelist()) + if Path(name).suffix.lower() in {".html", ".htm", ".xhtml", ".txt", ".md"} + and not name.endswith("/") + ] + for name in names: + raw = archive.read(name).decode("utf-8", errors="replace") + pseudo_path = Path(name) + if pseudo_path.suffix.lower() in {".txt", ".md"}: + title = _markdown_title(raw) or _title_from_path(pseudo_path) + markdown = _ensure_h1(_normalize_newlines(raw).strip() + "\n", title) + yield _SourceDocument( + title=title, + markdown=markdown, + source_type="epub", + original_path=f"{source_path}!{name}", + base_slug=slugify(title) or slugify(pseudo_path.stem) or "source", + ) + else: + yield _html_document( + pseudo_path, + source_type="epub", + original_path=f"{source_path}!{name}", + text=raw, + ) + + +def _classify_section( + item: _EpubManifestItem, + spine_entry: _EpubSpineEntry, + content: str, +) -> str: + name = Path(item.href).name.lower() + if "nav" in item.properties: + return SECTION_ROLE_NAV + if "cover-image" in item.properties: + return SECTION_ROLE_COVER + if name.startswith("cover") or "titlepage" in name: + return SECTION_ROLE_COVER + doc_title = re.sub(r"[^a-z0-9 ]+", "", _html_title(content).lower()).strip() + if doc_title in {"cover", "cover page", "title page", "titlepage"}: + return SECTION_ROLE_COVER + if name.startswith("nav"): + return SECTION_ROLE_NAV + if "toc" in name or "contents" in name: + return SECTION_ROLE_TOC + if "license" in name or "copyright" in name or "rights" in name: + return SECTION_ROLE_LICENSE + if "transcriber" in name or "notes" in name: + return SECTION_ROLE_NOTES + upper = content.upper() + if any(marker in upper for marker in PG_START_MARKERS): + return SECTION_ROLE_HEADER + if any(marker in upper for marker in PG_END_MARKERS): + return SECTION_ROLE_FOOTER + if "pgheader" in name or "pg-header" in name or "gutenberg-header" in name: + return SECTION_ROLE_HEADER + if "pgfooter" in name or "pg-footer" in name or "gutenberg-footer" in name: + return SECTION_ROLE_FOOTER + if not spine_entry.linear: + return SECTION_ROLE_AUXILIARY + return SECTION_ROLE_BODY + + +def _zip_dirname(zip_path: str) -> str: + normalized = zip_path.replace("\\", "/") + if "/" not in normalized: + return "" + return normalized.rsplit("/", 1)[0] + + +def _join_zip_path(base: str, href: str) -> str: + base = base.replace("\\", "/").strip("/") + href = href.replace("\\", "/").lstrip("/") + if not base or base == ".": + return href + return f"{base}/{href}" + + def _chunk_markdown(markdown: str, *, max_words: int) -> list[str]: text = markdown.strip() if max_words <= 0: diff --git a/tests/test_epub3_intake.py b/tests/test_epub3_intake.py new file mode 100644 index 0000000..b571954 --- /dev/null +++ b/tests/test_epub3_intake.py @@ -0,0 +1,173 @@ +import zipfile +from pathlib import Path + +from infospace_bench.source_intake import ( + SECTION_ROLE_BODY, + SECTION_ROLE_COVER, + SECTION_ROLE_FOOTER, + SECTION_ROLE_HEADER, + SECTION_ROLE_LICENSE, + SECTION_ROLE_NAV, + SECTION_ROLE_NOTES, + normalize_source, +) + + +CONTAINER_XML = """<?xml version="1.0"?> +<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container"> + <rootfiles> + <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/> + </rootfiles> +</container> +""" + +PACKAGE_OPF = """<?xml version="1.0" encoding="utf-8"?> +<package xmlns="http://www.idpf.org/2007/opf" version="3.0" unique-identifier="bookid"> + <metadata xmlns:dc="http://purl.org/dc/elements/1.1/"> + <dc:identifier id="bookid">urn:gutenberg:60979</dc:identifier> + <dc:title>Reminiscences of a Stock Operator</dc:title> + <dc:creator>Edwin Lefevre</dc:creator> + <dc:language>en</dc:language> + <dc:rights>Public domain in the USA.</dc:rights> + <dc:subject>Speculation</dc:subject> + <dc:subject>New York Stock Exchange</dc:subject> + <dc:source>https://www.gutenberg.org/ebooks/60979</dc:source> + <meta property="dcterms:modified">2026-05-01T00:00:00Z</meta> + </metadata> + <manifest> + <item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/> + <item id="cover" href="cover.xhtml" media-type="application/xhtml+xml"/> + <item id="pgheader" href="pgheader.xhtml" media-type="application/xhtml+xml"/> + <item id="ch1" href="chapter1.xhtml" media-type="application/xhtml+xml"/> + <item id="ch2" href="chapter2.xhtml" media-type="application/xhtml+xml"/> + <item id="notes" href="transcriber-notes.xhtml" media-type="application/xhtml+xml"/> + <item id="license" href="license.xhtml" media-type="application/xhtml+xml"/> + <item id="pgfooter" href="pgfooter.xhtml" media-type="application/xhtml+xml"/> + </manifest> + <spine> + <itemref idref="cover"/> + <itemref idref="nav" linear="no"/> + <itemref idref="pgheader"/> + <itemref idref="ch1"/> + <itemref idref="ch2"/> + <itemref idref="notes"/> + <itemref idref="license"/> + <itemref idref="pgfooter"/> + </spine> +</package> +""" + + +def _write_lefevre_epub3_fixture(path: Path) -> None: + with zipfile.ZipFile(path, "w") as archive: + archive.writestr("mimetype", "application/epub+zip") + archive.writestr("META-INF/container.xml", CONTAINER_XML) + archive.writestr("OEBPS/content.opf", PACKAGE_OPF) + archive.writestr( + "OEBPS/nav.xhtml", + "<html><head><title>Contents" + "", + ) + archive.writestr( + "OEBPS/cover.xhtml", + "Cover

Cover

", + ) + archive.writestr( + "OEBPS/pgheader.xhtml", + "Reminiscences of a Stock Operator" + "

*** START OF THE PROJECT GUTENBERG EBOOK REMINISCENCES ***

" + "

Produced by transcribers.

", + ) + archive.writestr( + "OEBPS/chapter1.xhtml", + "Chapter I" + "

Chapter I

" + "

I went to work when I was just out of grammar school.

", + ) + archive.writestr( + "OEBPS/chapter2.xhtml", + "Chapter II" + "

Chapter II

" + "

I was only fifteen when I made my first thousand dollars.

", + ) + archive.writestr( + "OEBPS/transcriber-notes.xhtml", + "Transcriber's Notes" + "

Transcriber's Notes

Spelling normalised.

", + ) + archive.writestr( + "OEBPS/license.xhtml", + "License" + "

License

Project Gutenberg License terms.

", + ) + archive.writestr( + "OEBPS/pgfooter.xhtml", + "End" + "

*** END OF THE PROJECT GUTENBERG EBOOK REMINISCENCES ***

", + ) + + +def test_epub3_intake_follows_spine_order_and_drops_non_body_by_default(tmp_path: Path) -> None: + book = tmp_path / "lefevre.epub" + _write_lefevre_epub3_fixture(book) + + chunks = normalize_source(book) + + assert [chunk.title for chunk in chunks] == ["Chapter I", "Chapter II"] + assert [chunk.spine_index for chunk in chunks] == [3, 4] + assert all(chunk.section_role == SECTION_ROLE_BODY for chunk in chunks) + assert all(chunk.source_type == "epub" for chunk in chunks) + # Chunk IDs must be stable, ordered, and not collapse to the book title. + assert chunks[0].chunk_id.startswith("reminiscences-of-a-stock-operator-004-") + assert chunks[1].chunk_id.startswith("reminiscences-of-a-stock-operator-005-") + assert chunks[0].chunk_id != chunks[1].chunk_id + + +def test_epub3_intake_captures_book_metadata_provenance(tmp_path: Path) -> None: + book = tmp_path / "lefevre.epub" + _write_lefevre_epub3_fixture(book) + + chunks = normalize_source(book) + + metadata = chunks[0].book_metadata + assert metadata["title"] == "Reminiscences of a Stock Operator" + assert metadata["creator"] == "Edwin Lefevre" + assert metadata["language"] == "en" + assert "Speculation" in metadata["subjects"] + assert "New York Stock Exchange" in metadata["subjects"] + assert metadata["rights"].startswith("Public domain") + assert metadata["identifier"] == "urn:gutenberg:60979" + assert metadata["source_url"] == "https://www.gutenberg.org/ebooks/60979" + assert metadata["modified"] == "2026-05-01T00:00:00Z" + + +def test_epub3_intake_tags_non_body_sections_when_included(tmp_path: Path) -> None: + book = tmp_path / "lefevre.epub" + _write_lefevre_epub3_fixture(book) + + chunks = normalize_source(book, include_non_body=True) + + by_index = {chunk.spine_index: chunk.section_role for chunk in chunks} + assert by_index[0] == SECTION_ROLE_COVER + assert by_index[1] == SECTION_ROLE_NAV + assert by_index[2] == SECTION_ROLE_HEADER + assert by_index[3] == SECTION_ROLE_BODY + assert by_index[4] == SECTION_ROLE_BODY + assert by_index[5] == SECTION_ROLE_NOTES + assert by_index[6] == SECTION_ROLE_LICENSE + assert by_index[7] == SECTION_ROLE_FOOTER + assert [chunk.spine_index for chunk in chunks] == list(range(8)) + + +def test_epub3_intake_falls_back_to_archive_order_for_malformed_epub(tmp_path: Path) -> None: + legacy = tmp_path / "legacy.epub" + with zipfile.ZipFile(legacy, "w") as archive: + archive.writestr("OEBPS/chapter1.xhtml", "

Chapter One

Alpha beta.

") + archive.writestr("OEBPS/chapter2.xhtml", "

Chapter Two

Gamma delta.

") + + chunks = normalize_source(legacy) + + assert [chunk.title for chunk in chunks] == ["Chapter One", "Chapter Two"] + assert all(chunk.section_role == SECTION_ROLE_BODY for chunk in chunks) + assert all(chunk.spine_index is None for chunk in chunks) + assert all(chunk.book_metadata == {} for chunk in chunks) diff --git a/workplans/IB-WP-0016-lefevre-ebook-infospace-readiness.md b/workplans/IB-WP-0016-lefevre-ebook-infospace-readiness.md index 1f6f43c..395eb0a 100644 --- a/workplans/IB-WP-0016-lefevre-ebook-infospace-readiness.md +++ b/workplans/IB-WP-0016-lefevre-ebook-infospace-readiness.md @@ -8,7 +8,7 @@ status: active owner: markitect topic_slug: markitect created: "2026-05-14" -updated: "2026-05-14" +updated: "2026-05-17" state_hub_workstream_slug: "ib-wp-0016-lefevre-ebook-infospace-readiness" state_hub_workstream_id: "23be7d20-b01f-4b17-9851-4d540e4c0984" depends_on_workplans: @@ -81,7 +81,7 @@ run should wait: ```task id: IB-WP-0016-T01 -status: in_progress +status: done priority: high state_hub_task_id: "a672fcf9-1b80-4faf-b16d-84ca52601dc9" ```