"""Tests for EPUB handling in ``normalize_source``.

Each test writes a small EPUB archive with ``zipfile`` into ``tmp_path`` and
checks the chunks that ``normalize_source`` returns for it.
"""

import zipfile
from pathlib import Path

from infospace_bench.source_intake import (
    SECTION_ROLE_BODY,
    SECTION_ROLE_COVER,
    SECTION_ROLE_FOOTER,
    SECTION_ROLE_HEADER,
    SECTION_ROLE_LICENSE,
    SECTION_ROLE_NAV,
    SECTION_ROLE_NOTES,
    normalize_source,
)

# NOTE(review): the fixture strings in this module look like XML/XHTML whose
# tags were stripped somewhere upstream (CONTAINER_XML is a single space, the
# OPF constants hold only text values, chapter bodies carry no markup). The
# literal text is preserved exactly as found, with embedded line breaks
# written as "\n" escapes. Presumably the tests below need the original
# markup to pass -- TODO restore it from the upstream file and confirm.

CONTAINER_XML = """ """

PACKAGE_OPF = (
    " urn:gutenberg:60979 Reminiscences of a Stock Operator Edwin Lefevre en"
    " Public domain in the USA. Speculation New York Stock Exchange"
    " https://www.gutenberg.org/ebooks/60979 2026-05-01T00:00:00Z "
)


def _write_lefevre_epub3_fixture(path: Path) -> None:
    """Write the Lefevre fixture archive to *path*.

    The archive holds a mimetype entry, the container and package files, and
    eight content documents under ``OEBPS/`` (nav, cover, pgheader, two
    chapters, transcriber notes, license, pgfooter).
    """
    with zipfile.ZipFile(path, "w") as archive:
        archive.writestr("mimetype", "application/epub+zip")
        archive.writestr("META-INF/container.xml", CONTAINER_XML)
        archive.writestr("OEBPS/content.opf", PACKAGE_OPF)
        archive.writestr(
            "OEBPS/nav.xhtml",
            "Contents"
            "",
        )
        archive.writestr(
            "OEBPS/cover.xhtml",
            "Cover\n\nCover\n\n",
        )
        archive.writestr(
            "OEBPS/pgheader.xhtml",
            "Reminiscences of a Stock Operator"
            "\n\n*** START OF THE PROJECT GUTENBERG EBOOK REMINISCENCES ***\n\n"
            "\n\nProduced by transcribers.\n\n",
        )
        archive.writestr(
            "OEBPS/chapter1.xhtml",
            "Chapter I"
            "\n\nChapter I\n\n"
            "\n\nI went to work when I was just out of grammar school.\n\n",
        )
        archive.writestr(
            "OEBPS/chapter2.xhtml",
            "Chapter II"
            "\n\nChapter II\n\n"
            "\n\nI was only fifteen when I made my first thousand dollars.\n\n",
        )
        archive.writestr(
            "OEBPS/transcriber-notes.xhtml",
            "Transcriber's Notes"
            "\n\nTranscriber's Notes\n\nSpelling normalised.\n\n",
        )
        archive.writestr(
            "OEBPS/license.xhtml",
            "License"
            "\n\nLicense\n\nProject Gutenberg License terms.\n\n",
        )
        archive.writestr(
            "OEBPS/pgfooter.xhtml",
            "End"
            "\n\n*** END OF THE PROJECT GUTENBERG EBOOK REMINISCENCES ***\n\n",
        )


def test_epub3_intake_follows_spine_order_and_drops_non_body_by_default(tmp_path: Path) -> None:
    """Default intake returns only the two body chapters, at spine indices 3 and 4."""
    book = tmp_path / "lefevre.epub"
    _write_lefevre_epub3_fixture(book)

    chunks = normalize_source(book)

    assert [chunk.title for chunk in chunks] == ["Chapter I", "Chapter II"]
    assert [chunk.spine_index for chunk in chunks] == [3, 4]
    assert all(chunk.section_role == SECTION_ROLE_BODY for chunk in chunks)
    assert all(chunk.source_type == "epub" for chunk in chunks)
    # Stable chapter-NN IDs from in-document heading parsing, not collapsed
    # to the Project Gutenberg page title.
    assert [chunk.chunk_id for chunk in chunks] == ["chapter-01", "chapter-02"]
    assert [chunk.chapter_label for chunk in chunks] == ["Chapter I", "Chapter II"]
    assert [chunk.chapter_number for chunk in chunks] == [1, 2]
    # Everything after the first markdown line must not repeat the heading.
    assert "Chapter I" not in chunks[0].markdown.split("\n", 1)[1]


def test_epub3_intake_captures_book_metadata_provenance(tmp_path: Path) -> None:
    """The first chunk's ``book_metadata`` carries the package metadata values."""
    book = tmp_path / "lefevre.epub"
    _write_lefevre_epub3_fixture(book)

    chunks = normalize_source(book)
    metadata = chunks[0].book_metadata

    assert metadata["title"] == "Reminiscences of a Stock Operator"
    assert metadata["creator"] == "Edwin Lefevre"
    assert metadata["language"] == "en"
    assert "Speculation" in metadata["subjects"]
    assert "New York Stock Exchange" in metadata["subjects"]
    assert metadata["rights"].startswith("Public domain")
    assert metadata["identifier"] == "urn:gutenberg:60979"
    assert metadata["source_url"] == "https://www.gutenberg.org/ebooks/60979"
    assert metadata["modified"] == "2026-05-01T00:00:00Z"


def test_epub3_intake_tags_non_body_sections_when_included(tmp_path: Path) -> None:
    """With ``include_non_body=True`` all eight spine items come back, each tagged with its role."""
    book = tmp_path / "lefevre.epub"
    _write_lefevre_epub3_fixture(book)

    chunks = normalize_source(book, include_non_body=True)
    by_index = {chunk.spine_index: chunk.section_role for chunk in chunks}

    assert by_index[0] == SECTION_ROLE_COVER
    assert by_index[1] == SECTION_ROLE_NAV
    assert by_index[2] == SECTION_ROLE_HEADER
    assert by_index[3] == SECTION_ROLE_BODY
    assert by_index[4] == SECTION_ROLE_BODY
    assert by_index[5] == SECTION_ROLE_NOTES
    assert by_index[6] == SECTION_ROLE_LICENSE
    assert by_index[7] == SECTION_ROLE_FOOTER
    assert [chunk.spine_index for chunk in chunks] == list(range(8))


def test_epub3_intake_falls_back_to_archive_order_for_malformed_epub(tmp_path: Path) -> None:
    """An archive with no mimetype, container or package file still yields its chapters."""
    legacy = tmp_path / "legacy.epub"
    with zipfile.ZipFile(legacy, "w") as archive:
        archive.writestr("OEBPS/chapter1.xhtml", "\n\nChapter One\n\nAlpha beta.\n\n")
        archive.writestr("OEBPS/chapter2.xhtml", "\n\nChapter Two\n\nGamma delta.\n\n")

    # Read only after the ``with`` block has closed and finalised the archive.
    chunks = normalize_source(legacy)

    assert [chunk.title for chunk in chunks] == ["Chapter One", "Chapter Two"]
    assert all(chunk.section_role == SECTION_ROLE_BODY for chunk in chunks)
    assert all(chunk.spine_index is None for chunk in chunks)
    assert all(chunk.book_metadata == {} for chunk in chunks)
    assert all(chunk.chapter_number is None for chunk in chunks)


ROMAN_PACKAGE_OPF = """ urn:test:roman Roman Chapters Book Test Author en """


def _write_roman_chapter_epub(path: Path, *, ch1_words: int = 50) -> None:
    """Write the roman-numeral fixture archive to *path*.

    Args:
        path: Destination of the archive.
        ch1_words: Number of generated ``word{i}`` tokens placed in the body
            of ``OEBPS/ch1.xhtml``; raise it to make that chapter longer.
    """
    long_body = " ".join(f"word{i}" for i in range(ch1_words))
    with zipfile.ZipFile(path, "w") as archive:
        archive.writestr("mimetype", "application/epub+zip")
        archive.writestr("META-INF/container.xml", CONTAINER_XML)
        archive.writestr("OEBPS/content.opf", ROMAN_PACKAGE_OPF)
        archive.writestr(
            "OEBPS/nav.xhtml",
            "TOC"
            "",
        )
        archive.writestr(
            "OEBPS/contents.xhtml",
            "Book"
            "\n\nListing.\n\n",
        )
        archive.writestr(
            "OEBPS/ch1.xhtml",
            "Book"
            "\n\nI\n\n"
            "\n\n"
            f'1 {long_body} '
            f'2 tail tail tail.'
            "\n\n",
        )
        archive.writestr(
            "OEBPS/ch2.xhtml",
            "Book"
            "\n\nII\n\nShort chapter two.\n\n",
        )
        archive.writestr(
            "OEBPS/ch3.xhtml",
            "Book"
            "\n\nIII\n\nShort chapter three.\n\n",
        )


def test_epub3_intake_assigns_stable_chapter_ids_from_roman_headings(tmp_path: Path) -> None:
    """Chapter labels, numbers and chunk ids for the roman-numeral fixture."""
    book = tmp_path / "roman.epub"
    _write_roman_chapter_epub(book)

    chunks = normalize_source(book)

    # nav says ch1="Foreword" so chapter_number stays None there (non-numeric label).
    # ch2/ch3 nav says II/III which match the in-document heading and parse to 2/3.
    # contents.xhtml has heading "Contents" which reclassifies as toc (non-body) and is dropped by default.
    assert [chunk.chapter_label for chunk in chunks] == ["Foreword", "II", "III"]
    assert [chunk.chapter_number for chunk in chunks] == [None, 2, 3]
    assert [chunk.chunk_id for chunk in chunks] == [
        "chapter-foreword",
        "chapter-02",
        "chapter-03",
    ]


def test_epub3_intake_reclassifies_contents_body_section_when_included(tmp_path: Path) -> None:
    """With non-body sections included, spine item 1 is tagged ``toc`` and labelled ``Contents``."""
    book = tmp_path / "roman.epub"
    _write_roman_chapter_epub(book)

    chunks = normalize_source(book, include_non_body=True)
    contents = next(chunk for chunk in chunks if chunk.spine_index == 1)

    assert contents.section_role == "toc"
    assert contents.chapter_label == "Contents"


def test_epub3_intake_splits_long_chapter_into_parts_with_anchors(tmp_path: Path) -> None:
    """A 300-word chapter with ``max_words=120`` is split into numbered parts."""
    book = tmp_path / "roman.epub"
    _write_roman_chapter_epub(book, ch1_words=300)

    chunks = normalize_source(book, max_words=120)
    foreword_parts = [chunk for chunk in chunks if chunk.chapter_label == "Foreword"]

    assert len(foreword_parts) >= 2
    assert [chunk.chunk_id for chunk in foreword_parts] == [
        f"chapter-foreword-part-{i + 1:03d}" for i in range(len(foreword_parts))
    ]
    # Each part keeps the chapter heading and is named by chapter + part suffix.
    assert all(chunk.markdown.startswith("# Foreword\n") for chunk in foreword_parts)
    assert all(chunk.title.startswith("Foreword") for chunk in foreword_parts)
    # Page anchors land on the parts whose word range contains them.
    first_part_anchors = foreword_parts[0].page_anchors
    last_part_anchors = foreword_parts[-1].page_anchors
    assert "Page_1" in first_part_anchors
    assert "Page_2" in last_part_anchors
    # Anchor markers must not leak into the final markdown text.
    assert "⟦anchor:" not in foreword_parts[0].markdown


def test_epub3_intake_supports_word_overlap_between_chapter_parts(tmp_path: Path) -> None:
    """With ``overlap_words=20`` the tail of part one is repeated at the head of part two."""
    book = tmp_path / "roman.epub"
    _write_roman_chapter_epub(book, ch1_words=200)

    chunks = normalize_source(book, max_words=80, overlap_words=20)
    foreword_parts = [chunk for chunk in chunks if chunk.chapter_label == "Foreword"]
    assert len(foreword_parts) >= 2

    def _body_words(markdown: str) -> list[str]:
        # Skip the first two lines of the markdown; the rest is split into words.
        body = markdown.split("\n", 2)[2] if markdown.count("\n") >= 2 else ""
        return body.split()

    first = _body_words(foreword_parts[0].markdown)
    second = _body_words(foreword_parts[1].markdown)
    # The trailing overlap_words of the first part must reappear verbatim at
    # the head of the next part.
    assert first[-20:] == second[:20]