generated from coulomb/repo-seed
Add source attachment metadata compatibility
This commit is contained in:
@@ -31,6 +31,8 @@ def test_epub3_descriptor_matches_contract():
|
||||
assert descriptor.extensions == [".epub"]
|
||||
assert descriptor.safety["network"] is False
|
||||
assert descriptor.option_schema["properties"]["skip_boilerplate"]["default"] is True
|
||||
assert descriptor.quality_profile["attachments"] == "read-side-source-assets"
|
||||
assert descriptor.metadata["render_asset_manifest_compatible"] is True
|
||||
|
||||
|
||||
def test_epub3_adapter_matches_epub_assets(tmp_path: Path):
|
||||
@@ -57,6 +59,7 @@ def test_epub3_adapter_inspects_metadata(tmp_path: Path):
|
||||
assert result.metadata.language == "en"
|
||||
assert result.metadata.identifiers["bookid"] == "urn:test-book"
|
||||
assert result.quality.lossiness == "low"
|
||||
assert result.quality.metadata["attachment_candidates"] == 2
|
||||
|
||||
|
||||
def test_epub3_adapter_normalizes_spine_to_markdown(tmp_path: Path):
|
||||
@@ -83,6 +86,16 @@ def test_epub3_adapter_normalizes_spine_to_markdown(tmp_path: Path):
|
||||
"continuation",
|
||||
]
|
||||
assert result.document.segments[0].provenance[0].package_path == "EPUB/chapter1.xhtml"
|
||||
assert [attachment.metadata["source_role"] for attachment in result.document.attachments] == [
|
||||
"image",
|
||||
"stylesheet",
|
||||
]
|
||||
assert result.document.attachments[0].media_type == "image/png"
|
||||
assert result.document.attachments[1].media_type == "text/css"
|
||||
assert result.document.attachments[0].digest.startswith("sha256:")
|
||||
assert result.document.attachments[0].metadata["package_path"] == "EPUB/images/chart.png"
|
||||
assert result.document.attachments[0].metadata["render_manifest_compatible"] is True
|
||||
assert result.document.quality.metadata["attachment_count"] == 2
|
||||
assert result.document.quality.lossiness == "none"
|
||||
|
||||
|
||||
@@ -114,13 +127,28 @@ def test_epub3_adapter_reports_malformed_missing_container(tmp_path: Path):
|
||||
assert "container.xml" in result.diagnostics[0].message
|
||||
|
||||
|
||||
def test_epub3_adapter_reports_unsupported_package_resources(tmp_path: Path):
    """An unsupported manifest resource yields a skip diagnostic while the read stays valid."""
    fixture = _write_epub(tmp_path, include_unsupported_resource=True)
    source = SourceAsset.from_path(fixture, media_type="application/epub+zip")
    reader = epub3_adapter_descriptor().instantiate()

    outcome = reader.read(SourceReadRequest(asset=source))

    assert outcome.is_valid
    assert outcome.document is not None
    # The custom-binary manifest item must be reported on the document itself.
    assert any(
        entry.code == "source.epub3.skipped_resource"
        for entry in outcome.document.diagnostics
    )
|
||||
|
||||
|
||||
def test_epub3_entry_point_discovery_shape():
    """Discovery over a fake entry point exposes the adapter as ``source.epub3``."""
    entry_points = [FakeEntryPoint()]
    discovered = discover_source_adapters(entry_points)

    epub3_entry = discovered.get("source.epub3")
    assert epub3_entry.name == "EPUB3"
|
||||
|
||||
|
||||
def _write_epub(tmp_path: Path) -> Path:
|
||||
def _write_epub(tmp_path: Path, *, include_unsupported_resource: bool = False) -> Path:
|
||||
epub_path = tmp_path / "test-book.epub"
|
||||
with zipfile.ZipFile(epub_path, "w") as archive:
|
||||
archive.writestr("mimetype", "application/epub+zip")
|
||||
@@ -150,13 +178,22 @@ def _write_epub(tmp_path: Path) -> Path:
|
||||
<item id="nav" href="nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>
|
||||
<item id="chapter1" href="chapter1.xhtml" media-type="application/xhtml+xml"/>
|
||||
<item id="chapter2" href="chapter2.xhtml" media-type="application/xhtml+xml"/>
|
||||
<item id="style" href="styles/book.css" media-type="text/css"/>
|
||||
<item id="chart" href="images/chart.png" media-type="image/png"/>
|
||||
{unsupported}
|
||||
</manifest>
|
||||
<spine>
|
||||
<itemref idref="chapter1"/>
|
||||
<itemref idref="chapter2"/>
|
||||
</spine>
|
||||
</package>
|
||||
""",
|
||||
""".format(
|
||||
unsupported=(
|
||||
'<item id="payload" href="data/payload.bin" media-type="application/x-custom-binary"/>'
|
||||
if include_unsupported_resource
|
||||
else ""
|
||||
)
|
||||
),
|
||||
)
|
||||
archive.writestr(
|
||||
"EPUB/nav.xhtml",
|
||||
@@ -203,4 +240,8 @@ def _write_epub(tmp_path: Path) -> Path:
|
||||
</html>
|
||||
""",
|
||||
)
|
||||
archive.writestr("EPUB/styles/book.css", "body { color: #111; }\n")
|
||||
archive.writestr("EPUB/images/chart.png", b"\x89PNG\r\n\x1a\nfixture")
|
||||
if include_unsupported_resource:
|
||||
archive.writestr("EPUB/data/payload.bin", b"custom")
|
||||
return epub_path
|
||||
|
||||
@@ -32,6 +32,9 @@ def test_pdf_descriptor_matches_contract():
|
||||
assert descriptor.safety["external_process"] is False
|
||||
assert descriptor.option_schema["properties"]["include_page_breaks"]["default"] is False
|
||||
assert descriptor.metadata["dependency_profile"] == "stdlib"
|
||||
assert descriptor.metadata["render_asset_manifest_compatible"] is True
|
||||
assert descriptor.quality_profile["attachments"] == "metadata-with-digest"
|
||||
assert descriptor.quality_profile["images"] == "signal-only"
|
||||
|
||||
|
||||
def test_pdf_adapter_matches_pdf_assets(tmp_path: Path):
|
||||
@@ -60,6 +63,7 @@ def test_pdf_adapter_inspects_metadata(tmp_path: Path):
|
||||
assert result.quality.lossiness == "medium"
|
||||
assert result.quality.metadata["page_count"] == 2
|
||||
assert result.quality.metadata["pages_with_text"] == 2
|
||||
assert result.quality.metadata["attachment_count"] == 0
|
||||
|
||||
|
||||
def test_pdf_adapter_normalizes_pages_to_markdown(tmp_path: Path):
|
||||
@@ -80,6 +84,7 @@ def test_pdf_adapter_normalizes_pages_to_markdown(tmp_path: Path):
|
||||
assert result.document.segments[0].provenance[0].page == "1"
|
||||
assert result.document.quality.lossiness == "low"
|
||||
assert result.document.quality.metadata["page_coverage"] == 1.0
|
||||
assert result.document.attachments == []
|
||||
|
||||
|
||||
def test_pdf_adapter_applies_page_range_and_page_markers(tmp_path: Path):
|
||||
@@ -137,17 +142,57 @@ def test_pdf_adapter_reports_encrypted_pdf(tmp_path: Path):
|
||||
assert result.diagnostics[0].code == "source.pdf.encrypted"
|
||||
|
||||
|
||||
def test_pdf_adapter_reports_embedded_files_and_image_signals(tmp_path: Path):
    """Embedded files and page images surface as attachments, counters, and a diagnostic."""
    fixture = _write_pdf(tmp_path, embedded_file=True, image_signal=True)
    source = SourceAsset.from_path(fixture, media_type="application/pdf")
    reader = pdf_adapter_descriptor().instantiate()

    outcome = reader.read(SourceReadRequest(asset=source))

    assert outcome.is_valid
    assert outcome.document is not None

    # Order is part of the expectation: embedded file first, image signal second.
    roles = [item.metadata["source_role"] for item in outcome.document.attachments]
    assert roles == ["embedded-file", "image-signal"]

    embedded_file = outcome.document.attachments[0]
    image_signal = outcome.document.attachments[1]

    # The embedded file is described by name, media type, and a sha256 digest.
    assert embedded_file.name == "attachment.txt"
    assert embedded_file.media_type == "text/plain"
    assert embedded_file.digest.startswith("sha256:")
    assert embedded_file.metadata["render_manifest_compatible"] is True

    # The image is reported as a signal-only attachment tied to page 1.
    assert image_signal.media_type == "application/x.markitect-pdf-image-signal"
    assert image_signal.metadata["page"] == 1
    assert image_signal.metadata["signal_only"] is True

    counters = outcome.document.quality.metadata
    assert counters["attachment_count"] == 2
    assert counters["embedded_file_count"] == 1
    assert counters["image_signal_count"] == 1

    assert any(
        entry.code == "source.pdf.image_resource_signal"
        for entry in outcome.document.diagnostics
    )
|
||||
|
||||
|
||||
def test_pdf_entry_point_discovery_shape():
    """Discovery over a fake entry point exposes the adapter as ``source.pdf``."""
    entry_points = [FakeEntryPoint()]
    discovered = discover_source_adapters(entry_points)

    pdf_entry = discovered.get("source.pdf")
    assert pdf_entry.name == "PDF"
|
||||
|
||||
|
||||
def _write_pdf(tmp_path: Path, *, encrypted: bool = False) -> Path:
|
||||
def _write_pdf(
|
||||
tmp_path: Path,
|
||||
*,
|
||||
encrypted: bool = False,
|
||||
embedded_file: bool = False,
|
||||
image_signal: bool = False,
|
||||
) -> Path:
|
||||
pdf_path = tmp_path / ("encrypted.pdf" if encrypted else "fixture.pdf")
|
||||
objects: list[tuple[int, bytes]] = []
|
||||
page_refs = []
|
||||
next_id = 3
|
||||
font_id = 100
|
||||
info_id = 101
|
||||
encrypt_id = 102
|
||||
for page_number, lines in enumerate(
|
||||
[
|
||||
["Hello PDF", "Second line"],
|
||||
@@ -159,13 +204,20 @@ def _write_pdf(tmp_path: Path, *, encrypted: bool = False) -> Path:
|
||||
content_id = next_id + 1
|
||||
next_id += 2
|
||||
page_refs.append(f"{page_id} 0 R")
|
||||
stream = _page_stream(lines)
|
||||
include_image = image_signal and page_number == 1
|
||||
image_id = None
|
||||
if include_image:
|
||||
image_id = next_id
|
||||
next_id += 1
|
||||
stream = _page_stream(lines, draw_image=include_image)
|
||||
xobject = f" /XObject << /Im1 {image_id} 0 R >>" if image_id else ""
|
||||
objects.append(
|
||||
(
|
||||
page_id,
|
||||
(
|
||||
f"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] "
|
||||
f"/Resources << /Font << /F1 7 0 R >> >> /Contents {content_id} 0 R >>"
|
||||
f"/Resources << /Font << /F1 {font_id} 0 R >>{xobject} >> "
|
||||
f"/Contents {content_id} 0 R >>"
|
||||
).encode("ascii"),
|
||||
)
|
||||
)
|
||||
@@ -179,6 +231,44 @@ def _write_pdf(tmp_path: Path, *, encrypted: bool = False) -> Path:
|
||||
+ b"\nendstream",
|
||||
)
|
||||
)
|
||||
if image_id:
|
||||
image_stream = b"\x00\x00\x00"
|
||||
objects.append(
|
||||
(
|
||||
image_id,
|
||||
b"<< /Type /XObject /Subtype /Image /Width 1 /Height 1 "
|
||||
b"/ColorSpace /DeviceGray /BitsPerComponent 8 /Length "
|
||||
+ str(len(image_stream)).encode("ascii")
|
||||
+ b" >>\nstream\n"
|
||||
+ image_stream
|
||||
+ b"\nendstream",
|
||||
)
|
||||
)
|
||||
|
||||
if embedded_file:
|
||||
embedded_id = next_id
|
||||
filespec_id = next_id + 1
|
||||
next_id += 2
|
||||
embedded_stream = b"attached text"
|
||||
objects.append(
|
||||
(
|
||||
embedded_id,
|
||||
b"<< /Type /EmbeddedFile /Length "
|
||||
+ str(len(embedded_stream)).encode("ascii")
|
||||
+ b" >>\nstream\n"
|
||||
+ embedded_stream
|
||||
+ b"\nendstream",
|
||||
)
|
||||
)
|
||||
objects.append(
|
||||
(
|
||||
filespec_id,
|
||||
(
|
||||
f"<< /Type /Filespec /F (attachment.txt) "
|
||||
f"/EF << /F {embedded_id} 0 R >> >>"
|
||||
).encode("ascii"),
|
||||
)
|
||||
)
|
||||
|
||||
objects.extend(
|
||||
[
|
||||
@@ -190,9 +280,9 @@ def _write_pdf(tmp_path: Path, *, encrypted: bool = False) -> Path:
|
||||
f"/Count {len(page_refs)} >>"
|
||||
).encode("ascii"),
|
||||
),
|
||||
(7, b"<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>"),
|
||||
(font_id, b"<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>"),
|
||||
(
|
||||
8,
|
||||
info_id,
|
||||
b"<< /Title (PDF Fixture) /Author (Ada Lovelace) "
|
||||
b"/Subject (Source Adapter Test) /Keywords (markitect pdf) "
|
||||
b"/Producer (markitect-filter tests) /CreationDate (D:20260514093000Z) >>",
|
||||
@@ -200,7 +290,7 @@ def _write_pdf(tmp_path: Path, *, encrypted: bool = False) -> Path:
|
||||
]
|
||||
)
|
||||
if encrypted:
|
||||
objects.append((9, b"<< /Filter /Standard /V 1 /R 2 >>"))
|
||||
objects.append((encrypt_id, b"<< /Filter /Standard /V 1 /R 2 >>"))
|
||||
objects.sort(key=lambda item: item[0])
|
||||
|
||||
header = b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n"
|
||||
@@ -218,9 +308,9 @@ def _write_pdf(tmp_path: Path, *, encrypted: bool = False) -> Path:
|
||||
content.extend(b"0000000000 65535 f \n")
|
||||
for object_id in range(1, max_id + 1):
|
||||
content.extend(f"{offsets.get(object_id, 0):010d} 00000 n \n".encode("ascii"))
|
||||
trailer = f"trailer\n<< /Size {max_id + 1} /Root 1 0 R /Info 8 0 R".encode("ascii")
|
||||
trailer = f"trailer\n<< /Size {max_id + 1} /Root 1 0 R /Info {info_id} 0 R".encode("ascii")
|
||||
if encrypted:
|
||||
trailer += b" /Encrypt 9 0 R"
|
||||
trailer += f" /Encrypt {encrypt_id} 0 R".encode("ascii")
|
||||
trailer += b" >>\n"
|
||||
content.extend(trailer)
|
||||
content.extend(f"startxref\n{xref_offset}\n%%EOF\n".encode("ascii"))
|
||||
@@ -228,13 +318,15 @@ def _write_pdf(tmp_path: Path, *, encrypted: bool = False) -> Path:
|
||||
return pdf_path
|
||||
|
||||
|
||||
def _page_stream(lines: list[str], *, draw_image: bool = False) -> bytes:
    """Assemble the ASCII content stream for one fixture page.

    The stream is a single ``BT``/``ET`` text object using ``/F1`` at size 12,
    starting at ``72 720``, with one ``Tj`` per entry of *lines* and a ``T*``
    between consecutive entries. When *draw_image* is true, a ``q``/``Q``
    wrapped ``/Im1 Do`` sequence is appended after the text object.

    Args:
        lines: Text lines to show; each is passed through ``_pdf_literal``.
        draw_image: Whether to append the operators that reference ``/Im1``.

    Returns:
        The operators joined with newlines and encoded as ASCII bytes.
    """
    show_ops: list[str] = []
    for text in lines:
        show_ops += ["T*", f"({_pdf_literal(text)}) Tj"]

    # Drop the leading "T*": the first line is already positioned by "Td".
    operators = ["BT", "/F1 12 Tf", "72 720 Td", *show_ops[1:], "ET"]
    if draw_image:
        operators += ["q", "10 0 0 10 72 640 cm", "/Im1 Do", "Q"]
    return "\n".join(operators).encode("ascii")
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user