# infospace-bench/tests/test_epub3_intake.py

import zipfile
from pathlib import Path

from infospace_bench.source_intake import (
    SECTION_ROLE_BODY,
    SECTION_ROLE_COVER,
    SECTION_ROLE_FOOTER,
    SECTION_ROLE_HEADER,
    SECTION_ROLE_LICENSE,
    SECTION_ROLE_NAV,
    SECTION_ROLE_NOTES,
    normalize_source,
)


# OCF container descriptor shared by the EPUB fixtures below: its single
# rootfile entry points at OEBPS/content.opf as the package document.
CONTAINER_XML = """<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
  <rootfiles>
    <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
  </rootfiles>
</container>
"""

# EPUB3 package document for the Lefevre fixture. The metadata block carries
# the Dublin Core fields the provenance test reads back. The spine lists eight
# itemrefs in this order: cover, nav (linear="no"), pgheader, ch1, ch2, notes,
# license, pgfooter -- so the two chapters sit at spine positions 3 and 4.
PACKAGE_OPF = """<?xml version="1.0" encoding="utf-8"?>
<package xmlns="http://www.idpf.org/2007/opf" version="3.0" unique-identifier="bookid">
  <metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
    <dc:identifier id="bookid">urn:gutenberg:60979</dc:identifier>
    <dc:title>Reminiscences of a Stock Operator</dc:title>
    <dc:creator>Edwin Lefevre</dc:creator>
    <dc:language>en</dc:language>
    <dc:rights>Public domain in the USA.</dc:rights>
    <dc:subject>Speculation</dc:subject>
    <dc:subject>New York Stock Exchange</dc:subject>
    <dc:source>https://www.gutenberg.org/ebooks/60979</dc:source>
    <meta property="dcterms:modified">2026-05-01T00:00:00Z</meta>
  </metadata>
  <manifest>
    <item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>
    <item id="cover" href="cover.xhtml" media-type="application/xhtml+xml"/>
    <item id="pgheader" href="pgheader.xhtml" media-type="application/xhtml+xml"/>
    <item id="ch1" href="chapter1.xhtml" media-type="application/xhtml+xml"/>
    <item id="ch2" href="chapter2.xhtml" media-type="application/xhtml+xml"/>
    <item id="notes" href="transcriber-notes.xhtml" media-type="application/xhtml+xml"/>
    <item id="license" href="license.xhtml" media-type="application/xhtml+xml"/>
    <item id="pgfooter" href="pgfooter.xhtml" media-type="application/xhtml+xml"/>
  </manifest>
  <spine>
    <itemref idref="cover"/>
    <itemref idref="nav" linear="no"/>
    <itemref idref="pgheader"/>
    <itemref idref="ch1"/>
    <itemref idref="ch2"/>
    <itemref idref="notes"/>
    <itemref idref="license"/>
    <itemref idref="pgfooter"/>
  </spine>
</package>
"""


def _write_lefevre_epub3_fixture(path: Path) -> None:
    """Write the Lefevre EPUB3 test archive to *path*.

    The archive holds the mimetype entry, the container descriptor, the
    package document and one XHTML file per manifest item (nav, cover,
    Gutenberg header, two chapters, transcriber notes, license, footer).
    Members are written in the order listed below.
    """
    # (archive member name, text payload) pairs, in write order.
    members = (
        ("mimetype", "application/epub+zip"),
        ("META-INF/container.xml", CONTAINER_XML),
        ("OEBPS/content.opf", PACKAGE_OPF),
        (
            "OEBPS/nav.xhtml",
            "<html><head><title>Contents</title></head>"
            "<body><nav><ol><li>Chapter I</li><li>Chapter II</li></ol></nav></body></html>",
        ),
        (
            "OEBPS/cover.xhtml",
            "<html><head><title>Cover</title></head><body><h1>Cover</h1></body></html>",
        ),
        (
            "OEBPS/pgheader.xhtml",
            "<html><head><title>Reminiscences of a Stock Operator</title></head>"
            "<body><p>*** START OF THE PROJECT GUTENBERG EBOOK REMINISCENCES ***</p>"
            "<p>Produced by transcribers.</p></body></html>",
        ),
        (
            "OEBPS/chapter1.xhtml",
            "<html><head><title>Chapter I</title></head>"
            "<body><h1>Chapter I</h1>"
            "<p>I went to work when I was just out of grammar school.</p></body></html>",
        ),
        (
            "OEBPS/chapter2.xhtml",
            "<html><head><title>Chapter II</title></head>"
            "<body><h1>Chapter II</h1>"
            "<p>I was only fifteen when I made my first thousand dollars.</p></body></html>",
        ),
        (
            "OEBPS/transcriber-notes.xhtml",
            "<html><head><title>Transcriber's Notes</title></head>"
            "<body><h1>Transcriber's Notes</h1><p>Spelling normalised.</p></body></html>",
        ),
        (
            "OEBPS/license.xhtml",
            "<html><head><title>License</title></head>"
            "<body><h1>License</h1><p>Project Gutenberg License terms.</p></body></html>",
        ),
        (
            "OEBPS/pgfooter.xhtml",
            "<html><head><title>End</title></head>"
            "<body><p>*** END OF THE PROJECT GUTENBERG EBOOK REMINISCENCES ***</p></body></html>",
        ),
    )
    with zipfile.ZipFile(path, "w") as epub:
        for member_name, payload in members:
            epub.writestr(member_name, payload)


def test_epub3_intake_follows_spine_order_and_drops_non_body_by_default(tmp_path: Path) -> None:
    """Default intake of the Lefevre fixture yields only the two chapters, in spine order."""
    epub_path = tmp_path / "lefevre.epub"
    _write_lefevre_epub3_fixture(epub_path)

    body_chunks = normalize_source(epub_path)

    def column(attr: str) -> list:
        # Collect one attribute across all returned chunks, preserving order.
        return [getattr(item, attr) for item in body_chunks]

    assert column("title") == ["Chapter I", "Chapter II"]
    assert column("spine_index") == [3, 4]
    for item in body_chunks:
        assert item.section_role == SECTION_ROLE_BODY
    for item in body_chunks:
        assert item.source_type == "epub"
    # Stable chapter-NN IDs from in-document heading parsing, not collapsed
    # to the Project Gutenberg page title.
    assert column("chunk_id") == ["chapter-01", "chapter-02"]
    assert column("chapter_label") == ["Chapter I", "Chapter II"]
    assert column("chapter_number") == [1, 2]
    # Everything after the first markdown line must not repeat the heading text.
    after_first_line = body_chunks[0].markdown.split("\n", 1)[1]
    assert "Chapter I" not in after_first_line


def test_epub3_intake_captures_book_metadata_provenance(tmp_path: Path) -> None:
    """The first chunk's ``book_metadata`` mirrors the package document's metadata block."""
    epub_path = tmp_path / "lefevre.epub"
    _write_lefevre_epub3_fixture(epub_path)

    first_chunk = normalize_source(epub_path)[0]
    provenance = first_chunk.book_metadata

    # Fields that must match the OPF values exactly.
    exact_fields = {
        "title": "Reminiscences of a Stock Operator",
        "creator": "Edwin Lefevre",
        "language": "en",
        "identifier": "urn:gutenberg:60979",
        "source_url": "https://www.gutenberg.org/ebooks/60979",
        "modified": "2026-05-01T00:00:00Z",
    }
    for key, wanted in exact_fields.items():
        assert provenance[key] == wanted

    for subject in ("Speculation", "New York Stock Exchange"):
        assert subject in provenance["subjects"]
    assert provenance["rights"].startswith("Public domain")


def test_epub3_intake_tags_non_body_sections_when_included(tmp_path: Path) -> None:
    """With ``include_non_body=True`` every spine item is returned and tagged with its role."""
    epub_path = tmp_path / "lefevre.epub"
    _write_lefevre_epub3_fixture(epub_path)

    all_chunks = normalize_source(epub_path, include_non_body=True)

    # Expected role per spine position, in spine order.
    expected_roles = (
        SECTION_ROLE_COVER,
        SECTION_ROLE_NAV,
        SECTION_ROLE_HEADER,
        SECTION_ROLE_BODY,
        SECTION_ROLE_BODY,
        SECTION_ROLE_NOTES,
        SECTION_ROLE_LICENSE,
        SECTION_ROLE_FOOTER,
    )
    role_at = {item.spine_index: item.section_role for item in all_chunks}
    for position, wanted_role in enumerate(expected_roles):
        assert role_at[position] == wanted_role
    assert [item.spine_index for item in all_chunks] == [*range(8)]


def test_epub3_intake_falls_back_to_archive_order_for_malformed_epub(tmp_path: Path) -> None:
    """An archive with no container or package document is still chunked, in archive order."""
    broken_epub = tmp_path / "legacy.epub"
    # Only two bare XHTML members: no mimetype, container.xml or OPF.
    bare_members = {
        "OEBPS/chapter1.xhtml": "<h1>Chapter One</h1><p>Alpha beta.</p>",
        "OEBPS/chapter2.xhtml": "<h1>Chapter Two</h1><p>Gamma delta.</p>",
    }
    with zipfile.ZipFile(broken_epub, "w") as epub:
        for member_name, payload in bare_members.items():
            epub.writestr(member_name, payload)

    fallback_chunks = normalize_source(broken_epub)

    assert [item.title for item in fallback_chunks] == ["Chapter One", "Chapter Two"]
    for item in fallback_chunks:
        assert item.section_role == SECTION_ROLE_BODY
    for item in fallback_chunks:
        assert item.spine_index is None
    for item in fallback_chunks:
        assert item.book_metadata == {}
    for item in fallback_chunks:
        assert item.chapter_number is None


# Package document for the roman-numeral chapter fixture. The spine lists
# nav (linear="no"), contents, ch1, ch2, ch3 -- so contents.xhtml is at spine
# position 1 and the three chapters follow it.
ROMAN_PACKAGE_OPF = """<?xml version="1.0" encoding="utf-8"?>
<package xmlns="http://www.idpf.org/2007/opf" version="3.0" unique-identifier="bookid">
  <metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
    <dc:identifier id="bookid">urn:test:roman</dc:identifier>
    <dc:title>Roman Chapters Book</dc:title>
    <dc:creator>Test Author</dc:creator>
    <dc:language>en</dc:language>
  </metadata>
  <manifest>
    <item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>
    <item id="contents" href="contents.xhtml" media-type="application/xhtml+xml"/>
    <item id="ch1" href="ch1.xhtml" media-type="application/xhtml+xml"/>
    <item id="ch2" href="ch2.xhtml" media-type="application/xhtml+xml"/>
    <item id="ch3" href="ch3.xhtml" media-type="application/xhtml+xml"/>
  </manifest>
  <spine>
    <itemref idref="nav" linear="no"/>
    <itemref idref="contents"/>
    <itemref idref="ch1"/>
    <itemref idref="ch2"/>
    <itemref idref="ch3"/>
  </spine>
</package>
"""


def _write_roman_chapter_epub(path: Path, *, ch1_words: int = 50) -> None:
    """Write the roman-numeral chapter EPUB3 test archive to *path*.

    Chapter headings in the documents are ``I``, ``II`` and ``III``, while the
    nav document labels ch1 as ``Foreword``. The first chapter's paragraph is
    ``ch1_words`` generated words placed between two page-anchor spans
    (``Page_1`` and ``Page_2``), followed by a short tail.
    """
    filler = " ".join([f"word{n}" for n in range(ch1_words)])
    # (archive member name, text payload) pairs, in write order.
    members = (
        ("mimetype", "application/epub+zip"),
        ("META-INF/container.xml", CONTAINER_XML),
        ("OEBPS/content.opf", ROMAN_PACKAGE_OPF),
        (
            "OEBPS/nav.xhtml",
            "<html><head><title>TOC</title></head><body>"
            "<nav epub:type='toc'><ol>"
            "<li><a href='contents.xhtml'>Contents</a></li>"
            "<li><a href='ch1.xhtml'>Foreword</a></li>"
            "<li><a href='ch2.xhtml'>II</a></li>"
            "<li><a href='ch3.xhtml'>III</a></li>"
            "</ol></nav></body></html>",
        ),
        (
            "OEBPS/contents.xhtml",
            "<html><head><title>Book</title></head>"
            "<body><h2>Contents</h2><p>Listing.</p></body></html>",
        ),
        (
            "OEBPS/ch1.xhtml",
            "<html><head><title>Book</title></head>"
            "<body><h2>I</h2>"
            "<p>"
            f'<span id="Page_1">1</span> {filler} '
            f'<span id="Page_2">2</span> tail tail tail.'
            "</p></body></html>",
        ),
        (
            "OEBPS/ch2.xhtml",
            "<html><head><title>Book</title></head>"
            "<body><h2>II</h2><p>Short chapter two.</p></body></html>",
        ),
        (
            "OEBPS/ch3.xhtml",
            "<html><head><title>Book</title></head>"
            "<body><h2>III</h2><p>Short chapter three.</p></body></html>",
        ),
    )
    with zipfile.ZipFile(path, "w") as epub:
        for member_name, payload in members:
            epub.writestr(member_name, payload)


def test_epub3_intake_assigns_stable_chapter_ids_from_roman_headings(tmp_path: Path) -> None:
    """Chapter label, number and chunk ID are derived per chapter for the roman fixture."""
    epub_path = tmp_path / "roman.epub"
    _write_roman_chapter_epub(epub_path)

    body_chunks = normalize_source(epub_path)

    # nav says ch1="Foreword" so chapter_number stays None there (non-numeric label).
    # ch2/ch3 nav says II/III which match the in-document heading and parse to 2/3.
    # contents.xhtml has heading "Contents" which reclassifies as toc (non-body) and is dropped by default.
    labels = [item.chapter_label for item in body_chunks]
    assert labels == ["Foreword", "II", "III"]

    numbers = [item.chapter_number for item in body_chunks]
    assert numbers == [None, 2, 3]

    chunk_ids = [item.chunk_id for item in body_chunks]
    assert chunk_ids == ["chapter-foreword", "chapter-02", "chapter-03"]


def test_epub3_intake_reclassifies_contents_body_section_when_included(tmp_path: Path) -> None:
    """The spine item headed "Contents" is tagged ``toc`` when non-body sections are kept."""
    epub_path = tmp_path / "roman.epub"
    _write_roman_chapter_epub(epub_path)

    all_chunks = normalize_source(epub_path, include_non_body=True)

    # contents.xhtml is the second spine itemref, i.e. spine index 1.
    contents_chunk = next(filter(lambda item: item.spine_index == 1, all_chunks))
    assert contents_chunk.section_role == "toc"
    assert contents_chunk.chapter_label == "Contents"


def test_epub3_intake_splits_long_chapter_into_parts_with_anchors(tmp_path: Path) -> None:
    """A chapter longer than ``max_words`` is split into numbered parts that keep page anchors.

    The first chapter is generated with 300 filler words and chunked with
    ``max_words=120``, so it must come back as at least two parts.
    """
    book = tmp_path / "roman.epub"
    _write_roman_chapter_epub(book, ch1_words=300)

    chunks = normalize_source(book, max_words=120)

    foreword_parts = [chunk for chunk in chunks if chunk.chapter_label == "Foreword"]
    assert len(foreword_parts) >= 2
    assert [chunk.chunk_id for chunk in foreword_parts] == [
        f"chapter-foreword-part-{i + 1:03d}" for i in range(len(foreword_parts))
    ]
    # Each part keeps the chapter heading and is named by chapter + part suffix.
    assert all(chunk.markdown.startswith("# Foreword\n") for chunk in foreword_parts)
    assert all(chunk.title.startswith("Foreword") for chunk in foreword_parts)

    # Page anchors land on the parts whose word range contains them.
    first_part_anchors = foreword_parts[0].page_anchors
    last_part_anchors = foreword_parts[-1].page_anchors
    assert "Page_1" in first_part_anchors
    assert "Page_2" in last_part_anchors
    # Anchor markers must not leak into the final markdown text of ANY part;
    # checking only the first part would miss a leak near the Page_2 anchor,
    # which sits at the end of the chapter.
    for part in foreword_parts:
        assert "⟦anchor:" not in part.markdown


def test_epub3_intake_supports_word_overlap_between_chapter_parts(tmp_path: Path) -> None:
    """Consecutive parts of a split chapter share ``overlap_words`` words at the seam.

    The first chapter is generated with 200 filler words and chunked with
    ``max_words=80`` and ``overlap_words=20``.
    """
    max_words = 80
    overlap_words = 20

    book = tmp_path / "roman.epub"
    _write_roman_chapter_epub(book, ch1_words=200)

    chunks = normalize_source(book, max_words=max_words, overlap_words=overlap_words)

    foreword_parts = [chunk for chunk in chunks if chunk.chapter_label == "Foreword"]
    assert len(foreword_parts) >= 2

    def _body_words(markdown: str) -> list[str]:
        # Skip the first two lines of the markdown and tokenise the rest;
        # yields [] when the markdown has fewer than two newlines.
        body = markdown.split("\n", 2)[2] if markdown.count("\n") >= 2 else ""
        return body.split()

    first = _body_words(foreword_parts[0].markdown)
    second = _body_words(foreword_parts[1].markdown)
    # Guard against a vacuous pass: if body extraction produced nothing (or
    # fewer words than the overlap window), the slice comparison below would
    # compare empty/short lists and prove nothing about overlap.
    assert len(first) >= overlap_words
    assert len(second) >= overlap_words
    # The trailing overlap_words of the first part must reappear verbatim at
    # the head of the next part.
    assert first[-overlap_words:] == second[:overlap_words]