import zipfile
from pathlib import Path
from infospace_bench.source_intake import (
SECTION_ROLE_BODY,
SECTION_ROLE_COVER,
SECTION_ROLE_FOOTER,
SECTION_ROLE_HEADER,
SECTION_ROLE_LICENSE,
SECTION_ROLE_NAV,
SECTION_ROLE_NOTES,
normalize_source,
)
CONTAINER_XML = """
*** START OF THE PROJECT GUTENBERG EBOOK REMINISCENCES ***
" "Produced by transcribers.
", ) archive.writestr( "OEBPS/chapter1.xhtml", "I went to work when I was just out of grammar school.
", ) archive.writestr( "OEBPS/chapter2.xhtml", "I was only fifteen when I made my first thousand dollars.
", ) archive.writestr( "OEBPS/transcriber-notes.xhtml", "Spelling normalised.
", ) archive.writestr( "OEBPS/license.xhtml", "Project Gutenberg License terms.
", ) archive.writestr( "OEBPS/pgfooter.xhtml", "*** END OF THE PROJECT GUTENBERG EBOOK REMINISCENCES ***
", ) def test_epub3_intake_follows_spine_order_and_drops_non_body_by_default(tmp_path: Path) -> None: book = tmp_path / "lefevre.epub" _write_lefevre_epub3_fixture(book) chunks = normalize_source(book) assert [chunk.title for chunk in chunks] == ["Chapter I", "Chapter II"] assert [chunk.spine_index for chunk in chunks] == [3, 4] assert all(chunk.section_role == SECTION_ROLE_BODY for chunk in chunks) assert all(chunk.source_type == "epub" for chunk in chunks) # Stable chapter-NN IDs from in-document heading parsing, not collapsed # to the Project Gutenberg page title. assert [chunk.chunk_id for chunk in chunks] == ["chapter-01", "chapter-02"] assert [chunk.chapter_label for chunk in chunks] == ["Chapter I", "Chapter II"] assert [chunk.chapter_number for chunk in chunks] == [1, 2] assert "Chapter I" not in chunks[0].markdown.split("\n", 1)[1] def test_epub3_intake_captures_book_metadata_provenance(tmp_path: Path) -> None: book = tmp_path / "lefevre.epub" _write_lefevre_epub3_fixture(book) chunks = normalize_source(book) metadata = chunks[0].book_metadata assert metadata["title"] == "Reminiscences of a Stock Operator" assert metadata["creator"] == "Edwin Lefevre" assert metadata["language"] == "en" assert "Speculation" in metadata["subjects"] assert "New York Stock Exchange" in metadata["subjects"] assert metadata["rights"].startswith("Public domain") assert metadata["identifier"] == "urn:gutenberg:60979" assert metadata["source_url"] == "https://www.gutenberg.org/ebooks/60979" assert metadata["modified"] == "2026-05-01T00:00:00Z" def test_epub3_intake_tags_non_body_sections_when_included(tmp_path: Path) -> None: book = tmp_path / "lefevre.epub" _write_lefevre_epub3_fixture(book) chunks = normalize_source(book, include_non_body=True) by_index = {chunk.spine_index: chunk.section_role for chunk in chunks} assert by_index[0] == SECTION_ROLE_COVER assert by_index[1] == SECTION_ROLE_NAV assert by_index[2] == SECTION_ROLE_HEADER assert by_index[3] == SECTION_ROLE_BODY assert by_index[4] == SECTION_ROLE_BODY assert by_index[5] == SECTION_ROLE_NOTES assert by_index[6] == SECTION_ROLE_LICENSE assert by_index[7] == SECTION_ROLE_FOOTER assert [chunk.spine_index for chunk in chunks] == list(range(8)) def test_epub3_intake_falls_back_to_archive_order_for_malformed_epub(tmp_path: Path) -> None: legacy = tmp_path / "legacy.epub" with zipfile.ZipFile(legacy, "w") as archive: archive.writestr("OEBPS/chapter1.xhtml", "Alpha beta.
") archive.writestr("OEBPS/chapter2.xhtml", "Gamma delta.
") chunks = normalize_source(legacy) assert [chunk.title for chunk in chunks] == ["Chapter One", "Chapter Two"] assert all(chunk.section_role == SECTION_ROLE_BODY for chunk in chunks) assert all(chunk.spine_index is None for chunk in chunks) assert all(chunk.book_metadata == {} for chunk in chunks) assert all(chunk.chapter_number is None for chunk in chunks) ROMAN_PACKAGE_OPF = """Listing.
", ) archive.writestr( "OEBPS/ch1.xhtml", "" f'1 {long_body} ' f'2 tail tail tail.' "
", ) archive.writestr( "OEBPS/ch2.xhtml", "Short chapter two.
", ) archive.writestr( "OEBPS/ch3.xhtml", "Short chapter three.
", ) def test_epub3_intake_assigns_stable_chapter_ids_from_roman_headings(tmp_path: Path) -> None: book = tmp_path / "roman.epub" _write_roman_chapter_epub(book) chunks = normalize_source(book) # nav says ch1="Foreword" so chapter_number stays None there (non-numeric label). # ch2/ch3 nav says II/III which match the in-document heading and parse to 2/3. # contents.xhtml has heading "Contents" which reclassifies as toc (non-body) and is dropped by default. assert [chunk.chapter_label for chunk in chunks] == ["Foreword", "II", "III"] assert [chunk.chapter_number for chunk in chunks] == [None, 2, 3] assert [chunk.chunk_id for chunk in chunks] == [ "chapter-foreword", "chapter-02", "chapter-03", ] def test_epub3_intake_reclassifies_contents_body_section_when_included(tmp_path: Path) -> None: book = tmp_path / "roman.epub" _write_roman_chapter_epub(book) chunks = normalize_source(book, include_non_body=True) contents = next(chunk for chunk in chunks if chunk.spine_index == 1) assert contents.section_role == "toc" assert contents.chapter_label == "Contents" def test_epub3_intake_splits_long_chapter_into_parts_with_anchors(tmp_path: Path) -> None: book = tmp_path / "roman.epub" _write_roman_chapter_epub(book, ch1_words=300) chunks = normalize_source(book, max_words=120) foreword_parts = [chunk for chunk in chunks if chunk.chapter_label == "Foreword"] assert len(foreword_parts) >= 2 assert [chunk.chunk_id for chunk in foreword_parts] == [ f"chapter-foreword-part-{i + 1:03d}" for i in range(len(foreword_parts)) ] # Each part keeps the chapter heading and is named by chapter + part suffix. assert all(chunk.markdown.startswith("# Foreword\n") for chunk in foreword_parts) assert all(chunk.title.startswith("Foreword") for chunk in foreword_parts) # Page anchors land on the parts whose word range contains them. first_part_anchors = foreword_parts[0].page_anchors last_part_anchors = foreword_parts[-1].page_anchors assert "Page_1" in first_part_anchors assert "Page_2" in last_part_anchors # Anchor markers must not leak into the final markdown text. assert "⟦anchor:" not in foreword_parts[0].markdown def test_epub3_intake_supports_word_overlap_between_chapter_parts(tmp_path: Path) -> None: book = tmp_path / "roman.epub" _write_roman_chapter_epub(book, ch1_words=200) chunks = normalize_source(book, max_words=80, overlap_words=20) foreword_parts = [chunk for chunk in chunks if chunk.chapter_label == "Foreword"] assert len(foreword_parts) >= 2 def _body_words(markdown: str) -> list[str]: body = markdown.split("\n", 2)[2] if markdown.count("\n") >= 2 else "" return body.split() first = _body_words(foreword_parts[0].markdown) second = _body_words(foreword_parts[1].markdown) # The trailing overlap_words of the first part must reappear verbatim at # the head of the next part. assert first[-20:] == second[:20]