import zipfile
from pathlib import Path
from infospace_bench.source_intake import (
SECTION_ROLE_BODY,
SECTION_ROLE_COVER,
SECTION_ROLE_FOOTER,
SECTION_ROLE_HEADER,
SECTION_ROLE_LICENSE,
SECTION_ROLE_NAV,
SECTION_ROLE_NOTES,
normalize_source,
)
CONTAINER_XML = """
*** START OF THE PROJECT GUTENBERG EBOOK REMINISCENCES ***
" "Produced by transcribers.
", ) archive.writestr( "OEBPS/chapter1.xhtml", "I went to work when I was just out of grammar school.
", ) archive.writestr( "OEBPS/chapter2.xhtml", "I was only fifteen when I made my first thousand dollars.
", ) archive.writestr( "OEBPS/transcriber-notes.xhtml", "Spelling normalised.
", ) archive.writestr( "OEBPS/license.xhtml", "Project Gutenberg License terms.
", ) archive.writestr( "OEBPS/pgfooter.xhtml", "*** END OF THE PROJECT GUTENBERG EBOOK REMINISCENCES ***
", ) def test_epub3_intake_follows_spine_order_and_drops_non_body_by_default(tmp_path: Path) -> None: book = tmp_path / "lefevre.epub" _write_lefevre_epub3_fixture(book) chunks = normalize_source(book) assert [chunk.title for chunk in chunks] == ["Chapter I", "Chapter II"] assert [chunk.spine_index for chunk in chunks] == [3, 4] assert all(chunk.section_role == SECTION_ROLE_BODY for chunk in chunks) assert all(chunk.source_type == "epub" for chunk in chunks) # Chunk IDs must be stable, ordered, and not collapse to the book title. assert chunks[0].chunk_id.startswith("reminiscences-of-a-stock-operator-004-") assert chunks[1].chunk_id.startswith("reminiscences-of-a-stock-operator-005-") assert chunks[0].chunk_id != chunks[1].chunk_id def test_epub3_intake_captures_book_metadata_provenance(tmp_path: Path) -> None: book = tmp_path / "lefevre.epub" _write_lefevre_epub3_fixture(book) chunks = normalize_source(book) metadata = chunks[0].book_metadata assert metadata["title"] == "Reminiscences of a Stock Operator" assert metadata["creator"] == "Edwin Lefevre" assert metadata["language"] == "en" assert "Speculation" in metadata["subjects"] assert "New York Stock Exchange" in metadata["subjects"] assert metadata["rights"].startswith("Public domain") assert metadata["identifier"] == "urn:gutenberg:60979" assert metadata["source_url"] == "https://www.gutenberg.org/ebooks/60979" assert metadata["modified"] == "2026-05-01T00:00:00Z" def test_epub3_intake_tags_non_body_sections_when_included(tmp_path: Path) -> None: book = tmp_path / "lefevre.epub" _write_lefevre_epub3_fixture(book) chunks = normalize_source(book, include_non_body=True) by_index = {chunk.spine_index: chunk.section_role for chunk in chunks} assert by_index[0] == SECTION_ROLE_COVER assert by_index[1] == SECTION_ROLE_NAV assert by_index[2] == SECTION_ROLE_HEADER assert by_index[3] == SECTION_ROLE_BODY assert by_index[4] == SECTION_ROLE_BODY assert by_index[5] == SECTION_ROLE_NOTES assert by_index[6] == SECTION_ROLE_LICENSE assert by_index[7] == SECTION_ROLE_FOOTER assert [chunk.spine_index for chunk in chunks] == list(range(8)) def test_epub3_intake_falls_back_to_archive_order_for_malformed_epub(tmp_path: Path) -> None: legacy = tmp_path / "legacy.epub" with zipfile.ZipFile(legacy, "w") as archive: archive.writestr("OEBPS/chapter1.xhtml", "Alpha beta.
") archive.writestr("OEBPS/chapter2.xhtml", "Gamma delta.
") chunks = normalize_source(legacy) assert [chunk.title for chunk in chunks] == ["Chapter One", "Chapter Two"] assert all(chunk.section_role == SECTION_ROLE_BODY for chunk in chunks) assert all(chunk.spine_index is None for chunk in chunks) assert all(chunk.book_metadata == {} for chunk in chunks)