"""Tests for EPUB intake through ``normalize_source``.

NOTE(review): the string fixtures in this module contain no XML/XHTML tags --
``CONTAINER_XML`` is a single space and ``PACKAGE_OPF`` holds only text. It
looks like the markup was stripped somewhere upstream. The text is reproduced
here exactly as found (embedded line breaks written as ``\\n``); confirm
against the tagged originals before relying on these tests.
"""

import zipfile
from pathlib import Path

from infospace_bench.source_intake import (
    SECTION_ROLE_BODY,
    SECTION_ROLE_COVER,
    SECTION_ROLE_FOOTER,
    SECTION_ROLE_HEADER,
    SECTION_ROLE_LICENSE,
    SECTION_ROLE_NAV,
    SECTION_ROLE_NOTES,
    normalize_source,
)

# Payload written to META-INF/container.xml by the fixture writer.
CONTAINER_XML = """ """

# Payload written to OEBPS/content.opf by the fixture writer. The values the
# metadata test asserts on (identifier, title, creator, language, rights,
# subjects, source URL, modified timestamp) all appear in this text.
PACKAGE_OPF = """ urn:gutenberg:60979 Reminiscences of a Stock Operator Edwin Lefevre en Public domain in the USA. Speculation New York Stock Exchange https://www.gutenberg.org/ebooks/60979 2026-05-01T00:00:00Z """


def _write_lefevre_epub3_fixture(path: Path) -> None:
    """Write the Lefevre EPUB fixture archive to ``path``.

    The archive holds ``mimetype``, ``META-INF/container.xml``,
    ``OEBPS/content.opf`` and eight ``OEBPS/*.xhtml`` members (nav, cover,
    Gutenberg header, two chapters, transcriber's notes, license, Gutenberg
    footer), written in that order.

    Args:
        path: Destination file; created or overwritten (zip mode ``"w"``).
    """
    with zipfile.ZipFile(path, "w") as archive:
        archive.writestr("mimetype", "application/epub+zip")
        archive.writestr("META-INF/container.xml", CONTAINER_XML)
        archive.writestr("OEBPS/content.opf", PACKAGE_OPF)
        archive.writestr(
            "OEBPS/nav.xhtml",
            "Contents"
            "",
        )
        archive.writestr(
            "OEBPS/cover.xhtml",
            "Cover\n\nCover\n\n",
        )
        archive.writestr(
            "OEBPS/pgheader.xhtml",
            "Reminiscences of a Stock Operator"
            "\n\n*** START OF THE PROJECT GUTENBERG EBOOK REMINISCENCES ***\n\n"
            "\n\nProduced by transcribers.\n\n",
        )
        archive.writestr(
            "OEBPS/chapter1.xhtml",
            "Chapter I"
            "\n\nChapter I\n\n"
            "\n\nI went to work when I was just out of grammar school.\n\n",
        )
        archive.writestr(
            "OEBPS/chapter2.xhtml",
            "Chapter II"
            "\n\nChapter II\n\n"
            "\n\nI was only fifteen when I made my first thousand dollars.\n\n",
        )
        archive.writestr(
            "OEBPS/transcriber-notes.xhtml",
            "Transcriber's Notes"
            "\n\nTranscriber's Notes\n\nSpelling normalised.\n\n",
        )
        archive.writestr(
            "OEBPS/license.xhtml",
            "License"
            "\n\nLicense\n\nProject Gutenberg License terms.\n\n",
        )
        archive.writestr(
            "OEBPS/pgfooter.xhtml",
            "End"
            "\n\n*** END OF THE PROJECT GUTENBERG EBOOK REMINISCENCES ***\n\n",
        )


def test_epub3_intake_follows_spine_order_and_drops_non_body_by_default(tmp_path: Path) -> None:
    """Default intake yields only the two body chapters, at spine indices 3 and 4."""
    book = tmp_path / "lefevre.epub"
    _write_lefevre_epub3_fixture(book)

    chunks = normalize_source(book)

    assert [chunk.title for chunk in chunks] == ["Chapter I", "Chapter II"]
    assert [chunk.spine_index for chunk in chunks] == [3, 4]
    assert all(chunk.section_role == SECTION_ROLE_BODY for chunk in chunks)
    assert all(chunk.source_type == "epub" for chunk in chunks)
    # Chunk IDs must be stable, ordered, and not collapse to the book title.
    assert chunks[0].chunk_id.startswith("reminiscences-of-a-stock-operator-004-")
    assert chunks[1].chunk_id.startswith("reminiscences-of-a-stock-operator-005-")
    assert chunks[0].chunk_id != chunks[1].chunk_id


def test_epub3_intake_captures_book_metadata_provenance(tmp_path: Path) -> None:
    """The first chunk's ``book_metadata`` carries the package-level fields."""
    book = tmp_path / "lefevre.epub"
    _write_lefevre_epub3_fixture(book)

    chunks = normalize_source(book)

    metadata = chunks[0].book_metadata
    assert metadata["title"] == "Reminiscences of a Stock Operator"
    assert metadata["creator"] == "Edwin Lefevre"
    assert metadata["language"] == "en"
    assert "Speculation" in metadata["subjects"]
    assert "New York Stock Exchange" in metadata["subjects"]
    assert metadata["rights"].startswith("Public domain")
    assert metadata["identifier"] == "urn:gutenberg:60979"
    assert metadata["source_url"] == "https://www.gutenberg.org/ebooks/60979"
    assert metadata["modified"] == "2026-05-01T00:00:00Z"


def test_epub3_intake_tags_non_body_sections_when_included(tmp_path: Path) -> None:
    """With ``include_non_body=True`` all eight sections come back, each tagged with its role."""
    book = tmp_path / "lefevre.epub"
    _write_lefevre_epub3_fixture(book)

    chunks = normalize_source(book, include_non_body=True)

    by_index = {chunk.spine_index: chunk.section_role for chunk in chunks}
    assert by_index[0] == SECTION_ROLE_COVER
    assert by_index[1] == SECTION_ROLE_NAV
    assert by_index[2] == SECTION_ROLE_HEADER
    assert by_index[3] == SECTION_ROLE_BODY
    assert by_index[4] == SECTION_ROLE_BODY
    assert by_index[5] == SECTION_ROLE_NOTES
    assert by_index[6] == SECTION_ROLE_LICENSE
    assert by_index[7] == SECTION_ROLE_FOOTER
    assert [chunk.spine_index for chunk in chunks] == list(range(8))


def test_epub3_intake_falls_back_to_archive_order_for_malformed_epub(tmp_path: Path) -> None:
    """An archive with no container or package file still yields chunks in archive order.

    Such chunks are expected to be body sections with no spine index and empty
    book metadata.
    """
    legacy = tmp_path / "legacy.epub"
    with zipfile.ZipFile(legacy, "w") as archive:
        archive.writestr("OEBPS/chapter1.xhtml", "\n\nChapter One\n\nAlpha beta.\n\n")
        archive.writestr("OEBPS/chapter2.xhtml", "\n\nChapter Two\n\nGamma delta.\n\n")

    chunks = normalize_source(legacy)

    assert [chunk.title for chunk in chunks] == ["Chapter One", "Chapter Two"]
    assert all(chunk.section_role == SECTION_ROLE_BODY for chunk in chunks)
    assert all(chunk.spine_index is None for chunk in chunks)
    assert all(chunk.book_metadata == {} for chunk in chunks)