epub3 inbound filter

This commit is contained in:
2026-05-14 22:46:51 +02:00
parent 8d62b2d241
commit 925b36521d
7 changed files with 971 additions and 2 deletions

206
tests/test_epub3_adapter.py Normal file
View File

@@ -0,0 +1,206 @@
from pathlib import Path
import zipfile
from markitect_tool.source import (
SourceAdapterMatchRequest,
SourceAdapterRegistry,
SourceAsset,
SourceInspectRequest,
SourceReadRequest,
discover_source_adapters,
inspect_source,
normalize_source,
)
from markitect_filter.adapters import epub3_adapter_descriptor
class FakeEntryPoint:
    """Stand-in entry point object handed to ``discover_source_adapters``.

    It exposes a ``name`` and a ``load()`` method; ``load()`` hands back the
    EPUB3 descriptor factory itself, not a descriptor instance.
    """

    name = "epub3"

    def load(self):
        """Return the ``epub3_adapter_descriptor`` callable (uncalled)."""
        return epub3_adapter_descriptor
def test_epub3_descriptor_matches_contract():
    """Check the static fields advertised by the EPUB3 adapter descriptor."""
    desc = epub3_adapter_descriptor()

    # Identity and declared capabilities.
    assert desc.id == "source.epub3"
    assert desc.operations == ["read"]
    assert desc.media_types == ["application/epub+zip"]
    assert desc.extensions == [".epub"]

    # Safety declaration: the adapter states it does not use the network.
    assert desc.safety["network"] is False

    # The ``skip_boilerplate`` option defaults to True in the option schema.
    skip_boilerplate_schema = desc.option_schema["properties"]["skip_boilerplate"]
    assert skip_boilerplate_schema["default"] is True
def test_epub3_adapter_matches_epub_assets(tmp_path: Path):
    """A fixture EPUB declared as ``application/epub+zip`` is matched with confidence 100."""
    book_path = _write_epub(tmp_path)
    epub_asset = SourceAsset.from_path(book_path, media_type="application/epub+zip")
    reader = epub3_adapter_descriptor().instantiate()

    request = SourceAdapterMatchRequest(asset=epub_asset)
    outcome = reader.can_read(request)

    assert outcome.matched
    assert outcome.confidence == 100
def test_epub3_adapter_inspects_metadata(tmp_path: Path):
    """Inspecting the fixture EPUB surfaces its package metadata and quality rating."""
    book_path = _write_epub(tmp_path)
    epub_asset = SourceAsset.from_path(book_path, media_type="application/epub+zip")
    reader = epub3_adapter_descriptor().instantiate()

    request = SourceInspectRequest(asset=epub_asset)
    report = reader.inspect(request)

    assert report.is_valid

    # Values below mirror the <metadata> element written by _write_epub.
    meta = report.metadata
    assert meta.title == "Test Book"
    assert meta.creators == ["Ada Lovelace"]
    assert meta.language == "en"
    assert meta.identifiers["bookid"] == "urn:test-book"

    assert report.quality.lossiness == "low"
def test_epub3_adapter_normalizes_spine_to_markdown(tmp_path: Path):
    """Reading the fixture EPUB yields one Markdown document built from both spine chapters."""
    book_path = _write_epub(tmp_path)
    epub_asset = SourceAsset.from_path(book_path, media_type="application/epub+zip")
    reader = epub3_adapter_descriptor().instantiate()

    request = SourceReadRequest(asset=epub_asset, options={"skip_boilerplate": True})
    outcome = reader.read(request)

    assert outcome.is_valid
    assert outcome.document is not None

    doc = outcome.document
    assert doc.document_id == "source.epub3:urn:test-book"
    assert doc.metadata.title == "Test Book"

    # Chapter 1 followed by chapter 2, in spine order.
    expected_markdown = (
        "# Opening\n\n"
        "First paragraph with emphasis.\n\n"
        "- First point\n\n"
        "- Second point\n\n"
        "## Continuation\n\n"
        "Second chapter text."
    )
    assert doc.markdown == expected_markdown

    # Segment ids match the <section id="..."> values in the fixture chapters.
    segment_ids = [part.segment_id for part in doc.segments]
    assert segment_ids == [
        "opening",
        "continuation",
    ]

    first_provenance = doc.segments[0].provenance[0]
    assert first_provenance.package_path == "EPUB/chapter1.xhtml"
    assert doc.quality.lossiness == "none"
def test_markitect_api_can_use_epub3_registry(tmp_path: Path):
    """``inspect_source`` and ``normalize_source`` work with a registry holding only the EPUB3 descriptor."""
    book_path = _write_epub(tmp_path)
    epub_only_registry = SourceAdapterRegistry([epub3_adapter_descriptor()])

    # Run both top-level calls before asserting on either result.
    inspection = inspect_source(book_path, registry=epub_only_registry)
    normalization = normalize_source(book_path, registry=epub_only_registry)

    assert inspection.is_valid
    assert inspection.metadata.title == "Test Book"

    assert normalization.is_valid
    assert normalization.document is not None
    second_segment = normalization.document.segments[1]
    assert second_segment.heading == "Continuation"
def test_epub3_adapter_reports_malformed_missing_container(tmp_path: Path):
    """An archive with only a ``mimetype`` entry is reported as malformed, naming container.xml."""
    broken_path = tmp_path / "broken.epub"
    # Deliberately omit META-INF/container.xml: only the mimetype entry is written.
    with zipfile.ZipFile(broken_path, "w") as bundle:
        bundle.writestr("mimetype", "application/epub+zip")

    epub_asset = SourceAsset.from_path(broken_path, media_type="application/epub+zip")
    reader = epub3_adapter_descriptor().instantiate()
    outcome = reader.read(SourceReadRequest(asset=epub_asset))

    assert not outcome.is_valid
    first_diagnostic = outcome.diagnostics[0]
    assert first_diagnostic.code == "source.malformed"
    assert "container.xml" in first_diagnostic.message
def test_epub3_entry_point_discovery_shape():
    """Discovery from a fake entry point registers the descriptor under ``source.epub3``."""
    entry_points = [FakeEntryPoint()]
    discovered = discover_source_adapters(entry_points)
    descriptor = discovered.get("source.epub3")
    assert descriptor.name == "EPUB3"
def _write_epub(tmp_path: Path) -> Path:
    """Write a small EPUB fixture into *tmp_path* and return its path.

    The archive is created at ``tmp_path / "test-book.epub"`` and contains,
    in this order:

    * ``mimetype`` -- the literal ``application/epub+zip``
    * ``META-INF/container.xml`` -- points at ``EPUB/content.opf``
    * ``EPUB/content.opf`` -- package document (metadata, manifest, spine)
    * ``EPUB/nav.xhtml`` -- navigation document listed in the manifest only
    * ``EPUB/chapter1.xhtml`` and ``EPUB/chapter2.xhtml`` -- the two spine items

    Args:
        tmp_path: Existing directory in which the archive is created.

    Returns:
        Path of the written ``test-book.epub`` file.
    """
    epub_path = tmp_path / "test-book.epub"
    with zipfile.ZipFile(epub_path, "w") as archive:
        # The EPUB container format requires the mimetype entry to be the first
        # entry and stored uncompressed. State that explicitly instead of
        # relying on ZipFile's default compression setting.
        archive.writestr(
            "mimetype",
            "application/epub+zip",
            compress_type=zipfile.ZIP_STORED,
        )
        # Container file: tells a reader where the package document lives.
        archive.writestr(
            "META-INF/container.xml",
            """<?xml version="1.0" encoding="utf-8"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="EPUB/content.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>
""",
        )
        # Package document: the spine lists chapter1 then chapter2; nav is
        # in the manifest but not in the spine.
        archive.writestr(
            "EPUB/content.opf",
            """<?xml version="1.0" encoding="utf-8"?>
<package version="3.0" unique-identifier="bookid" xmlns="http://www.idpf.org/2007/opf">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:identifier id="bookid">urn:test-book</dc:identifier>
<dc:title>Test Book</dc:title>
<dc:creator>Ada Lovelace</dc:creator>
<dc:language>en</dc:language>
<dc:publisher>Markitect Fixtures</dc:publisher>
<dc:date>2026-05-14</dc:date>
</metadata>
<manifest>
<item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>
<item id="chapter1" href="chapter1.xhtml" media-type="application/xhtml+xml"/>
<item id="chapter2" href="chapter2.xhtml" media-type="application/xhtml+xml"/>
</manifest>
<spine>
<itemref idref="chapter1"/>
<itemref idref="chapter2"/>
</spine>
</package>
""",
        )
        # Navigation document with a table of contents linking both chapters.
        archive.writestr(
            "EPUB/nav.xhtml",
            """<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
<nav epub:type="toc" xmlns:epub="http://www.idpf.org/2007/ops">
<ol>
<li><a href="chapter1.xhtml#opening">Opening</a></li>
<li><a href="chapter2.xhtml#continuation">Continuation</a></li>
</ol>
</nav>
</body>
</html>
""",
        )
        # Chapter 1: heading, a paragraph with inline emphasis, and a two-item list.
        archive.writestr(
            "EPUB/chapter1.xhtml",
            """<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
<section id="opening">
<h1>Opening</h1>
<p>First paragraph with <em>emphasis</em>.</p>
<ul>
<li>First point</li>
<li>Second point</li>
</ul>
</section>
</body>
</html>
""",
        )
        # Chapter 2: second-level heading and a single paragraph.
        archive.writestr(
            "EPUB/chapter2.xhtml",
            """<?xml version="1.0" encoding="utf-8"?>
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
<section id="continuation">
<h2>Continuation</h2>
<p>Second chapter text.</p>
</section>
</body>
</html>
""",
        )
    return epub_path