Files
infospace-bench/tests/test_epub3_intake.py
tegwick 5b6a63fb7a IB-WP-0016-T01: spine-aware EPUB3 intake
Parse META-INF/container.xml and the OPF package document, then iterate
documents in spine reading order instead of archive-name sort. Classify
each spine item (body, cover, nav, toc, header, footer, notes, license,
auxiliary) and exclude non-body sections by default; include_non_body=True
opts them back in for inspection. Capture OPF book metadata (title,
creator, language, subjects, rights, identifier, source_url, modified)
onto every chunk and propagate it through source artifact provenance.
Preserve the legacy zip-without-OPF fallback for malformed EPUBs.

Real Lefevre EPUB now yields 148 body chunks in spine order (was 155
mixed, archive-sorted) with cover=1, header=1, footer=4 detected and
dropped. 78 tests pass.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-17 13:52:24 +02:00

174 lines
7.2 KiB
Python

import zipfile
from pathlib import Path
from infospace_bench.source_intake import (
SECTION_ROLE_BODY,
SECTION_ROLE_COVER,
SECTION_ROLE_FOOTER,
SECTION_ROLE_HEADER,
SECTION_ROLE_LICENSE,
SECTION_ROLE_NAV,
SECTION_ROLE_NOTES,
normalize_source,
)
# Contents of META-INF/container.xml for the fixture archive. Its single
# <rootfile> entry points at OEBPS/content.opf, which is where the fixture
# writer stores the package document.
CONTAINER_XML = """<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>
"""
# OPF (version="3.0") package document for the fixture book.
#
# * <metadata> carries the Dublin Core fields the metadata test reads back:
#   identifier, title, creator, language, rights, two subjects, source, and
#   a dcterms:modified timestamp.
# * <manifest> declares eight XHTML items; "nav" is listed first and is the
#   only item with properties="nav".
# * <spine> gives the reading order: cover, nav (linear="no"), pgheader, ch1,
#   ch2, notes, license, pgfooter. This deliberately differs from the manifest
#   order, so spine positions 3 and 4 are the two body chapters.
PACKAGE_OPF = """<?xml version="1.0" encoding="utf-8"?>
<package xmlns="http://www.idpf.org/2007/opf" version="3.0" unique-identifier="bookid">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:identifier id="bookid">urn:gutenberg:60979</dc:identifier>
<dc:title>Reminiscences of a Stock Operator</dc:title>
<dc:creator>Edwin Lefevre</dc:creator>
<dc:language>en</dc:language>
<dc:rights>Public domain in the USA.</dc:rights>
<dc:subject>Speculation</dc:subject>
<dc:subject>New York Stock Exchange</dc:subject>
<dc:source>https://www.gutenberg.org/ebooks/60979</dc:source>
<meta property="dcterms:modified">2026-05-01T00:00:00Z</meta>
</metadata>
<manifest>
<item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>
<item id="cover" href="cover.xhtml" media-type="application/xhtml+xml"/>
<item id="pgheader" href="pgheader.xhtml" media-type="application/xhtml+xml"/>
<item id="ch1" href="chapter1.xhtml" media-type="application/xhtml+xml"/>
<item id="ch2" href="chapter2.xhtml" media-type="application/xhtml+xml"/>
<item id="notes" href="transcriber-notes.xhtml" media-type="application/xhtml+xml"/>
<item id="license" href="license.xhtml" media-type="application/xhtml+xml"/>
<item id="pgfooter" href="pgfooter.xhtml" media-type="application/xhtml+xml"/>
</manifest>
<spine>
<itemref idref="cover"/>
<itemref idref="nav" linear="no"/>
<itemref idref="pgheader"/>
<itemref idref="ch1"/>
<itemref idref="ch2"/>
<itemref idref="notes"/>
<itemref idref="license"/>
<itemref idref="pgfooter"/>
</spine>
</package>
"""
def _write_lefevre_epub3_fixture(path: Path) -> None:
    """Write a small EPUB3 zip archive to *path* for the intake tests.

    The archive holds the ``mimetype`` entry, ``META-INF/container.xml``
    (``CONTAINER_XML``), the package document ``OEBPS/content.opf``
    (``PACKAGE_OPF``) and eight XHTML documents under ``OEBPS/``: nav, cover,
    Project Gutenberg header, two chapters, transcriber's notes, license and
    Project Gutenberg footer.

    Members are written in the order listed below; note that ``nav.xhtml``
    precedes ``cover.xhtml`` in the archive.
    """
    # (archive member name, text payload) pairs, in write order.
    members = (
        ("mimetype", "application/epub+zip"),
        ("META-INF/container.xml", CONTAINER_XML),
        ("OEBPS/content.opf", PACKAGE_OPF),
        (
            "OEBPS/nav.xhtml",
            "<html><head><title>Contents</title></head>"
            "<body><nav><ol><li>Chapter I</li><li>Chapter II</li></ol></nav></body></html>",
        ),
        (
            "OEBPS/cover.xhtml",
            "<html><head><title>Cover</title></head><body><h1>Cover</h1></body></html>",
        ),
        (
            "OEBPS/pgheader.xhtml",
            "<html><head><title>Reminiscences of a Stock Operator</title></head>"
            "<body><p>*** START OF THE PROJECT GUTENBERG EBOOK REMINISCENCES ***</p>"
            "<p>Produced by transcribers.</p></body></html>",
        ),
        (
            "OEBPS/chapter1.xhtml",
            "<html><head><title>Chapter I</title></head>"
            "<body><h1>Chapter I</h1>"
            "<p>I went to work when I was just out of grammar school.</p></body></html>",
        ),
        (
            "OEBPS/chapter2.xhtml",
            "<html><head><title>Chapter II</title></head>"
            "<body><h1>Chapter II</h1>"
            "<p>I was only fifteen when I made my first thousand dollars.</p></body></html>",
        ),
        (
            "OEBPS/transcriber-notes.xhtml",
            "<html><head><title>Transcriber's Notes</title></head>"
            "<body><h1>Transcriber's Notes</h1><p>Spelling normalised.</p></body></html>",
        ),
        (
            "OEBPS/license.xhtml",
            "<html><head><title>License</title></head>"
            "<body><h1>License</h1><p>Project Gutenberg License terms.</p></body></html>",
        ),
        (
            "OEBPS/pgfooter.xhtml",
            "<html><head><title>End</title></head>"
            "<body><p>*** END OF THE PROJECT GUTENBERG EBOOK REMINISCENCES ***</p></body></html>",
        ),
    )
    with zipfile.ZipFile(path, "w") as epub:
        for member_name, payload in members:
            epub.writestr(member_name, payload)
def test_epub3_intake_follows_spine_order_and_drops_non_body_by_default(tmp_path: Path) -> None:
    """Default intake returns only the two chapters, at spine positions 3 and 4."""
    epub_path = tmp_path / "lefevre.epub"
    _write_lefevre_epub3_fixture(epub_path)

    chunks = normalize_source(epub_path)

    titles = [chunk.title for chunk in chunks]
    assert titles == ["Chapter I", "Chapter II"]
    spine_positions = [chunk.spine_index for chunk in chunks]
    assert spine_positions == [3, 4]
    for chunk in chunks:
        assert chunk.section_role == SECTION_ROLE_BODY
    for chunk in chunks:
        assert chunk.source_type == "epub"

    # Chunk IDs must be stable, ordered, and distinct per chapter rather than
    # collapsing to the bare book title.
    first_id = chunks[0].chunk_id
    second_id = chunks[1].chunk_id
    assert first_id.startswith("reminiscences-of-a-stock-operator-004-")
    assert second_id.startswith("reminiscences-of-a-stock-operator-005-")
    assert first_id != second_id
def test_epub3_intake_captures_book_metadata_provenance(tmp_path: Path) -> None:
    """The first chunk's ``book_metadata`` mirrors the OPF metadata in the fixture."""
    epub_path = tmp_path / "lefevre.epub"
    _write_lefevre_epub3_fixture(epub_path)

    chunks = normalize_source(epub_path)
    metadata = chunks[0].book_metadata

    # Fields that must match the package document exactly.
    expected_exact = {
        "title": "Reminiscences of a Stock Operator",
        "creator": "Edwin Lefevre",
        "language": "en",
        "identifier": "urn:gutenberg:60979",
        "source_url": "https://www.gutenberg.org/ebooks/60979",
        "modified": "2026-05-01T00:00:00Z",
    }
    for field_name, expected_value in expected_exact.items():
        assert metadata[field_name] == expected_value

    # Both dc:subject entries must be present.
    for subject in ("Speculation", "New York Stock Exchange"):
        assert subject in metadata["subjects"]

    # Only the leading text of the rights statement is pinned.
    assert metadata["rights"].startswith("Public domain")
def test_epub3_intake_tags_non_body_sections_when_included(tmp_path: Path) -> None:
    """With ``include_non_body=True`` every spine item is returned and role-tagged."""
    epub_path = tmp_path / "lefevre.epub"
    _write_lefevre_epub3_fixture(epub_path)

    chunks = normalize_source(epub_path, include_non_body=True)

    # Expected role at each spine position, in the fixture's spine order.
    expected_roles = (
        SECTION_ROLE_COVER,
        SECTION_ROLE_NAV,
        SECTION_ROLE_HEADER,
        SECTION_ROLE_BODY,
        SECTION_ROLE_BODY,
        SECTION_ROLE_NOTES,
        SECTION_ROLE_LICENSE,
        SECTION_ROLE_FOOTER,
    )
    role_at = {}
    for chunk in chunks:
        role_at[chunk.spine_index] = chunk.section_role
    for position, expected_role in enumerate(expected_roles):
        assert role_at[position] == expected_role

    # Chunks must come back in spine order, one per spine position.
    spine_positions = [chunk.spine_index for chunk in chunks]
    assert spine_positions == list(range(8))
def test_epub3_intake_falls_back_to_archive_order_for_malformed_epub(tmp_path: Path) -> None:
    """A zip with no container.xml or OPF still yields body chunks, without spine data."""
    legacy_path = tmp_path / "legacy.epub"
    # Only two bare XHTML members: no mimetype, container.xml, or package document.
    documents = (
        ("OEBPS/chapter1.xhtml", "<h1>Chapter One</h1><p>Alpha beta.</p>"),
        ("OEBPS/chapter2.xhtml", "<h1>Chapter Two</h1><p>Gamma delta.</p>"),
    )
    with zipfile.ZipFile(legacy_path, "w") as bundle:
        for member_name, markup in documents:
            bundle.writestr(member_name, markup)

    chunks = normalize_source(legacy_path)

    titles = [chunk.title for chunk in chunks]
    assert titles == ["Chapter One", "Chapter Two"]
    for chunk in chunks:
        assert chunk.section_role == SECTION_ROLE_BODY
    for chunk in chunks:
        assert chunk.spine_index is None
    for chunk in chunks:
        assert chunk.book_metadata == {}