diff --git a/README.md b/README.md
index a78f0ba..84bfa41 100644
--- a/README.md
+++ b/README.md
@@ -28,3 +28,9 @@ pdf = "markitect_filter.adapters:pdf_adapter_descriptor"
The first PDF slice is stdlib-only and targets deterministic text extraction
from local, digitally-readable PDFs. OCR, scanned-document recognition, and
layout-perfect reconstruction are intentionally deferred.
+
+Read-side attachment metadata is exposed through
+`NormalizedMarkdownDocument.attachments` for EPUB3 package resources, PDF
+embedded files, and PDF image-resource signals. See
+`docs/source-attachment-metadata.md` for the handoff contract to passive render
+asset manifests.
diff --git a/docs/pdf-adapter.md b/docs/pdf-adapter.md
index 4e96052..b22dcdd 100644
--- a/docs/pdf-adapter.md
+++ b/docs/pdf-adapter.md
@@ -23,7 +23,7 @@ native system services, or renderer-specific tooling.
- Scanned or image-only PDFs that require OCR.
- Encrypted or permission-restricted PDFs.
- Pixel-perfect layout reconstruction.
-- Table, figure, annotation, form, signature, and attachment extraction.
+- Table, figure, annotation, form, signature, and rich attachment extraction.
- PDF writing/export.
## Options
@@ -43,3 +43,9 @@ and originating PDF page object id.
Quality metadata records the extraction backend, document page count, selected
pages, extracted page count, page coverage, skipped pages, warning count,
lossiness, and confidence.
+
+`NormalizedMarkdownDocument.attachments` may include read-side metadata for
+embedded file streams and image-resource signals when the stdlib parser can
+detect them. Embedded files include byte size and digest. Image resources are
+signal-only descriptors with page/object provenance; the adapter does not
+extract image bytes or perform OCR.
diff --git a/docs/source-attachment-metadata.md b/docs/source-attachment-metadata.md
new file mode 100644
index 0000000..16af021
--- /dev/null
+++ b/docs/source-attachment-metadata.md
@@ -0,0 +1,81 @@
+# Source Attachment Metadata
+
+`markitect-filter` exposes read-side attachment metadata through
+`NormalizedMarkdownDocument.attachments`. These entries are
+`markitect_tool.source.SourceAsset` objects, so `markitect-tool` can consume
+them when building passive render asset manifests.
+
+The metadata schema marker is:
+
+```text
+markitect-filter.source-attachment.v1
+```
+
+## Common Fields
+
+Attachment entries should preserve:
+
+- `uri`: stable source package or document member URI
+- `path`: package member path or signal path
+- `name`: member filename or signal label
+- `media_type` and `extension` when known
+- `size` and `digest` when bytes are available
+- `metadata.source_adapter`: adapter id such as `source.epub3` or `source.pdf`
+- `metadata.source_role`: logical read-side role
+- `metadata.package_path`, `metadata.page`, `metadata.pdf_object`, or related
+ provenance coordinates when known
+- `metadata.render_manifest_compatible: true` when the entry can feed
+ `RenderAsset.from_source_asset`
+
+These entries describe source-side resources only. They do not imply output
+paths, copy execution, final artifact locations, or publication state.
+
+## EPUB3
+
+The EPUB3 adapter records manifest resources for images, stylesheets, fonts,
+audio, and video when the package entry exists and can be read cheaply from the
+ZIP archive. It stores byte size and sha256 digest for each collected resource.
+
+Unsupported non-XHTML package resources produce
+`source.epub3.skipped_resource` warnings. Declared but missing resources produce
+`source.epub3.missing_resource` warnings.
+
+## PDF
+
+The PDF adapter records embedded file streams when a stdlib scan can identify
+`Filespec` and `EmbeddedFile` objects. It records the member path, media type
+guessed from the filename, size, digest, object id, and source role `embedded-file`.
+
+For image resources, the stdlib slice records signal-only entries with source
+role `image-signal`. These entries preserve page/object provenance and a stable
+digest of the detected page/resource signal, but they do not extract image
+bytes. Image signals emit `source.pdf.image_resource_signal` warnings so callers
+know the adapter detected media that it did not extract.
+
+## Render Manifest Handoff
+
+`markitect-tool` can convert attachment entries to passive render assets:
+
+```python
+from markitect_tool.render import RenderAsset
+
+render_assets = [
+ RenderAsset.from_source_asset(asset, role=asset.metadata["source_role"])
+ for asset in document.attachments
+]
+```
+
+The resulting render assets remain passive descriptors. Asset copying,
+renderer output references, link rewriting, and final artifact validation stay
+outside `markitect-filter`.
+
+Example normalized attachment envelopes live in:
+
+- `examples/source-attachments/epub3-attachments.normalized.yaml`
+- `examples/source-attachments/pdf-attachments.normalized.yaml`
+
+Cross-repo validation can be run against a local `markitect-tool` checkout with:
+
+```bash
+PYTHONPATH=src:/path/to/markitect-tool/src python3 -m pytest
+```
diff --git a/examples/source-attachments/epub3-attachments.normalized.yaml b/examples/source-attachments/epub3-attachments.normalized.yaml
new file mode 100644
index 0000000..b7a2c29
--- /dev/null
+++ b/examples/source-attachments/epub3-attachments.normalized.yaml
@@ -0,0 +1,40 @@
+schema_version: markitect.source.v1
+document_id: source.epub3:fixture
+adapter:
+ id: source.epub3
+ version: "1"
+attachments:
+ - uri: fixture.epub!/EPUB/images/chart.png
+ path: EPUB/images/chart.png
+ name: chart.png
+ media_type: image/png
+ extension: .png
+ size: 15
+ digest: sha256:example-chart
+ metadata:
+ schema_version: markitect-filter.source-attachment.v1
+ source_adapter: source.epub3
+ source_role: image
+ package_path: EPUB/images/chart.png
+ href: images/chart.png
+ manifest_id: chart
+ render_manifest_compatible: true
+ - uri: fixture.epub!/EPUB/styles/book.css
+ path: EPUB/styles/book.css
+ name: book.css
+ media_type: text/css
+ extension: .css
+ size: 22
+ digest: sha256:example-css
+ metadata:
+ schema_version: markitect-filter.source-attachment.v1
+ source_adapter: source.epub3
+ source_role: stylesheet
+ package_path: EPUB/styles/book.css
+ href: styles/book.css
+ manifest_id: style
+ render_manifest_compatible: true
+render_asset_manifest_handoff:
+ compatible_schema: markitect.render.reference.v1
+ core_asset_copying: false
+ renderer_required: false
diff --git a/examples/source-attachments/pdf-attachments.normalized.yaml b/examples/source-attachments/pdf-attachments.normalized.yaml
new file mode 100644
index 0000000..ccd27ce
--- /dev/null
+++ b/examples/source-attachments/pdf-attachments.normalized.yaml
@@ -0,0 +1,40 @@
+schema_version: markitect.source.v1
+document_id: source.pdf:fixture
+adapter:
+ id: source.pdf
+ version: "1"
+attachments:
+ - uri: fixture.pdf!/embedded/attachment.txt
+ path: embedded/attachment.txt
+ name: attachment.txt
+ media_type: text/plain
+ extension: .txt
+ size: 13
+ digest: sha256:example-embedded-file
+ metadata:
+ schema_version: markitect-filter.source-attachment.v1
+ source_adapter: source.pdf
+ source_role: embedded-file
+ package_path: embedded/attachment.txt
+ pdf_object: 8
+ embedded_file_name: attachment.txt
+ render_manifest_compatible: true
+ - uri: fixture.pdf#page-0001/image-signal
+ path: page-0001/image-signal
+ name: image-signal
+ media_type: application/x.markitect-pdf-image-signal
+ digest: sha256:example-image-signal
+ metadata:
+ schema_version: markitect-filter.source-attachment.v1
+ source_adapter: source.pdf
+ source_role: image-signal
+ signal_only: true
+ page: 1
+ pdf_object: 3
+ image_objects:
+ - 5
+ render_manifest_compatible: true
+render_asset_manifest_handoff:
+ compatible_schema: markitect.render.reference.v1
+ core_asset_copying: false
+ renderer_required: false
diff --git a/src/markitect_filter/__init__.py b/src/markitect_filter/__init__.py
index 550b48b..8442607 100644
--- a/src/markitect_filter/__init__.py
+++ b/src/markitect_filter/__init__.py
@@ -1,5 +1,10 @@
"""Concrete source-format adapters for Markitect."""
from markitect_filter.adapters import epub3_adapter_descriptor, pdf_adapter_descriptor
+from markitect_filter.assets import SOURCE_ATTACHMENT_METADATA_VERSION
-__all__ = ["epub3_adapter_descriptor", "pdf_adapter_descriptor"]
+__all__ = [
+ "SOURCE_ATTACHMENT_METADATA_VERSION",
+ "epub3_adapter_descriptor",
+ "pdf_adapter_descriptor",
+]
diff --git a/src/markitect_filter/adapters.py b/src/markitect_filter/adapters.py
index 940b27c..37b5be9 100644
--- a/src/markitect_filter/adapters.py
+++ b/src/markitect_filter/adapters.py
@@ -41,12 +41,15 @@ def epub3_adapter_descriptor() -> SourceAdapterDescriptor:
},
quality_profile={
"text_extraction": "stdlib-xhtml",
- "images": "metadata-only",
- "styles": "ignored",
+ "images": "metadata-with-digest",
+ "styles": "metadata-with-digest",
+ "fonts": "metadata-with-digest",
+ "attachments": "read-side-source-assets",
},
metadata={
"format": "EPUB3",
"dependency_profile": "stdlib",
+ "render_asset_manifest_compatible": True,
},
)
@@ -96,12 +99,14 @@ def pdf_adapter_descriptor() -> SourceAdapterDescriptor:
},
quality_profile={
"text_extraction": "stdlib-pdf-text",
- "images": "diagnostic-only",
+ "attachments": "metadata-with-digest",
+ "images": "signal-only",
"styles": "ignored",
"tables": "plain-text-only",
},
metadata={
"format": "PDF",
"dependency_profile": "stdlib",
+ "render_asset_manifest_compatible": True,
},
)
diff --git a/src/markitect_filter/assets.py b/src/markitect_filter/assets.py
new file mode 100644
index 0000000..3ecf253
--- /dev/null
+++ b/src/markitect_filter/assets.py
@@ -0,0 +1,88 @@
+"""Read-side source asset metadata helpers."""
+
+from __future__ import annotations
+
+import hashlib
+import mimetypes
+import posixpath
+from pathlib import PurePosixPath
+from typing import Any
+
+from markitect_tool.source import SourceAsset
+
+
+SOURCE_ATTACHMENT_METADATA_VERSION = "markitect-filter.source-attachment.v1"
+
+
+def bytes_digest(data: bytes) -> str:
+    """Hash *data* with SHA-256 and return it as a ``sha256:``-prefixed hex string."""
+
+    return f"sha256:{hashlib.sha256(data).hexdigest()}"
+
+
+def source_member_uri(container_uri: str, member_path: str) -> str:
+    """Join a container URI and a member path with the ``!/`` separator."""
+
+    return "!/".join((container_uri, member_path))
+
+
+def source_asset_from_member(
+    *,
+    container_uri: str,
+    member_path: str,
+    data: bytes,
+    media_type: str | None,
+    source_adapter: str,
+    source_role: str,
+    metadata: dict[str, Any] | None = None,
+) -> SourceAsset:
+    """Describe a package member as a read-side ``SourceAsset``.
+
+    Only metadata is recorded: size and sha256 digest are derived from *data*,
+    but the bytes themselves are not passed on to the asset. A falsy
+    *media_type* is guessed from the member name and falls back to
+    ``application/octet-stream``. Keys in *metadata* are merged last, so they
+    override the standard keys set here.
+    """
+
+    name = PurePosixPath(member_path).name or member_path
+    suffix = PurePosixPath(name).suffix.lower()
+    if not media_type:
+        guessed, _encoding = mimetypes.guess_type(name)
+        media_type = guessed or "application/octet-stream"
+    asset_metadata: dict[str, Any] = {
+        "schema_version": SOURCE_ATTACHMENT_METADATA_VERSION,
+        "source_adapter": source_adapter,
+        "source_role": source_role,
+        # ``path`` keeps the member path as given; only this key is normalised.
+        "package_path": posixpath.normpath(member_path),
+    }
+    asset_metadata.update(metadata or {})
+    return SourceAsset(
+        uri=source_member_uri(container_uri, member_path),
+        path=member_path,
+        name=name,
+        media_type=media_type,
+        extension=suffix or None,
+        size=len(data),
+        digest=bytes_digest(data),
+        metadata=asset_metadata,
+    )
+
+
+def source_asset_signal(
+    *,
+    container_uri: str,
+    signal_id: str,
+    media_type: str,
+    source_adapter: str,
+    source_role: str,
+    digest_parts: list[bytes],
+    metadata: dict[str, Any] | None = None,
+) -> SourceAsset:
+    """Describe a detected resource signal that carries no extracted bytes.
+
+    The asset URI is ``<container_uri>#<signal_id>`` and its name is the last
+    ``/``-separated component of *signal_id*. The digest is the sha256 of
+    *digest_parts* joined with newlines. Keys in *metadata* are merged last and
+    override the standard keys, including ``signal_only``.
+    """
+
+    signal_metadata: dict[str, Any] = {
+        "schema_version": SOURCE_ATTACHMENT_METADATA_VERSION,
+        "source_adapter": source_adapter,
+        "source_role": source_role,
+        "signal_only": True,
+    }
+    signal_metadata.update(metadata or {})
+    _, _, label = signal_id.rpartition("/")
+    return SourceAsset(
+        uri=f"{container_uri}#{signal_id}",
+        path=signal_id,
+        name=label,
+        media_type=media_type,
+        digest=bytes_digest(b"\n".join(digest_parts)),
+        metadata=signal_metadata,
+    )
diff --git a/src/markitect_filter/epub3.py b/src/markitect_filter/epub3.py
index 5db4fcf..fc04dab 100644
--- a/src/markitect_filter/epub3.py
+++ b/src/markitect_filter/epub3.py
@@ -28,12 +28,29 @@ from markitect_tool.source import (
)
from markitect_filter.adapters import epub3_adapter_descriptor
+from markitect_filter.assets import source_asset_from_member
XHTML_MEDIA_TYPES = {
"application/xhtml+xml",
"text/html",
}
+EPUB_ATTACHMENT_MEDIA_PREFIXES = (
+ "audio/",
+ "font/",
+ "image/",
+ "video/",
+)
+EPUB_ATTACHMENT_MEDIA_TYPES = {
+ "application/font-sfnt",
+ "application/vnd.ms-opentype",
+ "application/x-font-ttf",
+ "font/otf",
+ "font/ttf",
+ "font/woff",
+ "font/woff2",
+ "text/css",
+}
BOILERPLATE_HINTS = {
"cover",
"nav",
@@ -101,11 +118,15 @@ class Epub3ReadAdapter:
asset=request.asset,
adapter=_adapter_info(request.options),
metadata=metadata,
- capabilities=["read"],
+ capabilities=["read", "attachments"],
quality=NormalizationQuality(
lossiness="unknown" if has_error(diagnostics) else "low",
confidence=0.9 if not has_error(diagnostics) else 0.0,
warnings=_warning_count(diagnostics),
+ metadata={
+ "manifest_items": len(package.manifest) if package else 0,
+ "attachment_candidates": _attachment_candidate_count(package.manifest) if package else 0,
+ },
),
diagnostics=diagnostics,
valid=not has_error(diagnostics),
@@ -122,6 +143,7 @@ class Epub3ReadAdapter:
skip_boilerplate = bool(request.options.get("skip_boilerplate", True))
try:
with zipfile.ZipFile(Path(request.asset.path or request.asset.uri)) as archive:
+ attachments = _extract_attachments(archive, request.asset, package, diagnostics)
for order, item_id in enumerate(package.spine):
item = package.manifest.get(item_id)
if item is None:
@@ -187,7 +209,10 @@ class Epub3ReadAdapter:
confidence=0.9 if not has_error(diagnostics) else 0.0,
skipped_items=sum(1 for diagnostic in diagnostics if diagnostic.code == "source.epub3.skipped_boilerplate"),
warnings=_warning_count(diagnostics),
- metadata={"extraction": "epub3-stdlib-xhtml"},
+ metadata={
+ "extraction": "epub3-stdlib-xhtml",
+ "attachment_count": len(attachments),
+ },
)
adapter = _adapter_info(request.options)
document = NormalizedMarkdownDocument(
@@ -203,9 +228,10 @@ class Epub3ReadAdapter:
source_uri=request.asset.uri,
source_path=request.asset.path,
digest=request.asset.digest,
- metadata={"rootfile": package.rootfile_path},
+ metadata={"rootfile": package.rootfile_path, "attachment_count": len(attachments)},
)
],
+ attachments=attachments,
adapter=adapter,
cache_key=normalization_cache_key(
asset=request.asset,
@@ -393,6 +419,97 @@ def _extract_nav_labels(
return labels
+def _extract_attachments(
+    archive: zipfile.ZipFile,
+    asset: SourceAsset,
+    package: EpubPackage,
+    diagnostics: list[Diagnostic],
+) -> list[SourceAsset]:
+    """Collect read-side metadata for the non-XHTML resources in the manifest.
+
+    Manifest items are visited in ``href`` order. Items without an ``href`` and
+    XHTML items are ignored silently. Items with an unsupported media type add
+    a ``source.epub3.skipped_resource`` warning to *diagnostics*. Items whose
+    package path is absent from the archive add a
+    ``source.epub3.missing_resource`` warning. Every other item is read from
+    the archive and returned as a ``SourceAsset``.
+    """
+    collected: list[SourceAsset] = []
+    manifest_items = sorted(package.manifest.values(), key=lambda entry: entry.get("href", ""))
+    for entry in manifest_items:
+        href = entry.get("href", "")
+        media_type = entry.get("media_type", "")
+        if not href:
+            continue
+        if media_type in XHTML_MEDIA_TYPES:
+            continue
+        package_path = _resolve_package_path(package.rootfile_path, href)
+        # Both warnings carry the same details, so build them once per item.
+        details = {
+            "href": href,
+            "package_path": package_path,
+            "media_type": media_type,
+            "manifest_id": entry.get("id"),
+        }
+        if not _is_attachment_media(media_type):
+            skipped = _warning(
+                asset,
+                "source.epub3.skipped_resource",
+                f"Skipped unsupported EPUB resource media type `{media_type}`.",
+                details=details,
+            )
+            diagnostics.append(skipped)
+            continue
+        try:
+            payload = archive.read(package_path)
+        except KeyError:
+            # ZipFile.read raises KeyError when the member is not in the archive.
+            missing = _warning(
+                asset,
+                "source.epub3.missing_resource",
+                f"EPUB resource `{package_path}` is declared but missing.",
+                details=details,
+            )
+            diagnostics.append(missing)
+            continue
+        attachment = source_asset_from_member(
+            container_uri=asset.uri,
+            member_path=package_path,
+            data=payload,
+            media_type=media_type,
+            source_adapter="source.epub3",
+            source_role=_attachment_role(media_type),
+            metadata={
+                "href": href,
+                "manifest_id": entry.get("id"),
+                "properties": entry.get("properties", ""),
+                "container_path": asset.path,
+                "render_manifest_compatible": True,
+            },
+        )
+        collected.append(attachment)
+    return collected
+
+
+def _attachment_candidate_count(manifest: dict[str, dict[str, str]]) -> int:
+    """Count the manifest items whose media type qualifies as an attachment."""
+    media_types = [entry.get("media_type", "") for entry in manifest.values()]
+    return len([media_type for media_type in media_types if _is_attachment_media(media_type)])
+
+
+def _is_attachment_media(media_type: str) -> bool:
+    """Return True when *media_type* is a collectable attachment type.
+
+    The comparison is case-insensitive. A type qualifies when it is listed
+    explicitly or starts with one of the attachment media prefixes.
+    """
+    lowered = media_type.lower()
+    if lowered in EPUB_ATTACHMENT_MEDIA_TYPES:
+        return True
+    # str.startswith accepts the whole prefix tuple in a single call.
+    return lowered.startswith(EPUB_ATTACHMENT_MEDIA_PREFIXES)
+
+
+def _attachment_role(media_type: str) -> str:
+    """Map an EPUB manifest media type onto a logical attachment role.
+
+    Rules are tried in order, so a type that matches several rules takes the
+    first matching role. Anything unmatched is reported as ``package-resource``.
+    """
+    kind = media_type.lower()
+    rules = (
+        ("image", kind.startswith("image/")),
+        ("stylesheet", kind == "text/css"),
+        # Covers ``font/*`` as well as legacy types such as
+        # ``application/font-sfnt`` and ``application/vnd.ms-opentype``.
+        ("font", "font" in kind or "opentype" in kind),
+        ("audio", kind.startswith("audio/")),
+        ("video", kind.startswith("video/")),
+    )
+    return next((role for role, matched in rules if matched), "package-resource")
+
+
def _extract_segment(
archive: zipfile.ZipFile,
asset: SourceAsset,
diff --git a/src/markitect_filter/pdf.py b/src/markitect_filter/pdf.py
index e1af289..0ed9359 100644
--- a/src/markitect_filter/pdf.py
+++ b/src/markitect_filter/pdf.py
@@ -26,6 +26,7 @@ from markitect_tool.source import (
)
from markitect_filter.adapters import pdf_adapter_descriptor
+from markitect_filter.assets import bytes_digest, source_asset_signal, source_asset_from_member
PDF_HEADER_RE = re.compile(rb"%PDF-\d\.\d")
@@ -36,6 +37,9 @@ PAGES_TYPE_RE = re.compile(rb"/Type\s*/Pages\b")
REF_RE = re.compile(rb"(\d+)\s+\d+\s+R")
INFO_REF_RE = re.compile(rb"/Info\s+(\d+)\s+\d+\s+R")
COUNT_RE = re.compile(rb"/Count\s+(\d+)")
+EMBEDDED_FILE_RE = re.compile(rb"/Type\s*/EmbeddedFile\b")
+FILESPEC_RE = re.compile(rb"/Type\s*/Filespec\b")
+EMBEDDED_FILE_REF_RE = re.compile(rb"/EF\s*<<.*?/F\s+(\d+)\s+\d+\s+R", re.DOTALL)
@dataclass(frozen=True)
@@ -53,6 +57,7 @@ class PdfPackage:
encrypted: bool
pages: list[PdfPage]
diagnostics: list[Diagnostic]
+ attachments: list[SourceAsset]
class PdfReadAdapter:
@@ -92,7 +97,7 @@ class PdfReadAdapter:
asset=request.asset,
adapter=_adapter_info(request.options),
metadata=package.metadata,
- capabilities=["read"],
+ capabilities=["read", "attachments"],
quality=NormalizationQuality(
lossiness="unknown" if has_error(diagnostics) else "medium",
confidence=_confidence(package, diagnostics),
@@ -102,6 +107,9 @@ class PdfReadAdapter:
"page_count": package.page_count,
"pages_with_text": extracted_pages,
"encrypted": package.encrypted,
+ "attachment_count": len(package.attachments),
+ "image_signal_count": _attachment_role_count(package.attachments, "image-signal"),
+ "embedded_file_count": _attachment_role_count(package.attachments, "embedded-file"),
},
),
diagnostics=diagnostics,
@@ -188,6 +196,9 @@ class PdfReadAdapter:
"selected_pages": [page.number for page in selected_pages],
"pages_extracted": len(segments),
"page_coverage": page_coverage,
+ "attachment_count": len(package.attachments),
+ "image_signal_count": _attachment_role_count(package.attachments, "image-signal"),
+ "embedded_file_count": _attachment_role_count(package.attachments, "embedded-file"),
},
)
document = NormalizedMarkdownDocument(
@@ -203,9 +214,10 @@ class PdfReadAdapter:
source_uri=request.asset.uri,
source_path=request.asset.path,
digest=request.asset.digest,
- metadata={"page_count": package.page_count},
+ metadata={"page_count": package.page_count, "attachment_count": len(package.attachments)},
)
],
+ attachments=package.attachments,
adapter=_adapter_info(request.options),
cache_key=normalization_cache_key(
asset=request.asset,
@@ -227,6 +239,7 @@ def _load_pdf(asset: SourceAsset) -> PdfPackage:
page_count=0,
encrypted=False,
pages=[],
+ attachments=[],
diagnostics=[
_pdf_error(
asset,
@@ -243,6 +256,7 @@ def _load_pdf(asset: SourceAsset) -> PdfPackage:
page_count=0,
encrypted=False,
pages=[],
+ attachments=[],
diagnostics=[_malformed(asset, "PDF does not start with a PDF header.")],
)
@@ -255,6 +269,7 @@ def _load_pdf(asset: SourceAsset) -> PdfPackage:
page_count=_page_count(objects),
encrypted=True,
pages=[],
+ attachments=[],
diagnostics=[
_pdf_error(
asset,
@@ -267,10 +282,30 @@ def _load_pdf(asset: SourceAsset) -> PdfPackage:
page_ids = _page_object_ids(objects)
page_count = _page_count(objects) or len(page_ids)
pages: list[PdfPage] = []
+ attachments = _embedded_file_assets(objects, asset, diagnostics)
for page_number, object_id in enumerate(page_ids, start=1):
page_body = objects[object_id]
page_diagnostics: list[Diagnostic] = []
content_ids = _content_refs(page_body)
+ image_object_ids = _image_object_ids(page_body, objects, content_ids)
+ if image_object_ids:
+ attachments.append(
+ _image_signal_asset(
+ asset,
+ page_number=page_number,
+ page_object_id=object_id,
+ image_object_ids=image_object_ids,
+ digest_parts=[page_body, *[objects.get(image_id, b"") for image_id in image_object_ids]],
+ )
+ )
+ page_diagnostics.append(
+ _warning(
+ asset,
+ "source.pdf.image_resource_signal",
+ f"PDF page {page_number} references image resources; binary extraction is not performed.",
+ details={"page": page_number, "image_objects": image_object_ids},
+ )
+ )
text_parts: list[str] = []
if not content_ids and STREAM_RE.search(page_body):
stream = _stream_data(page_body, asset, page_diagnostics)
@@ -312,6 +347,126 @@ def _load_pdf(asset: SourceAsset) -> PdfPackage:
encrypted=False,
pages=pages,
diagnostics=diagnostics,
+ attachments=attachments,
+ )
+
+
+def _embedded_file_assets(
+    objects: dict[int, bytes],
+    asset: SourceAsset,
+    diagnostics: list[Diagnostic],
+) -> list[SourceAsset]:
+    """Collect read-side metadata for every ``/EmbeddedFile`` stream object.
+
+    Objects are visited in ascending object-id order. Diagnostics produced by
+    ``_stream_data`` are appended to *diagnostics*. An object whose stream
+    yields no bytes adds a ``source.pdf.embedded_file_unreadable`` warning and
+    is skipped. The name comes from the referencing ``/Filespec`` when one is
+    found; otherwise ``embedded-<object id>.bin`` is used.
+    """
+    names_by_object = _embedded_file_names(objects)
+    collected: list[SourceAsset] = []
+    for object_id in sorted(objects):
+        body = objects[object_id]
+        if EMBEDDED_FILE_RE.search(body) is None:
+            continue
+        stream_diagnostics: list[Diagnostic] = []
+        payload = _stream_data(body, asset, stream_diagnostics)
+        diagnostics.extend(stream_diagnostics)
+        if not payload:
+            unreadable = _warning(
+                asset,
+                "source.pdf.embedded_file_unreadable",
+                f"PDF embedded file object {object_id} does not expose readable bytes.",
+                details={"object_id": object_id},
+            )
+            diagnostics.append(unreadable)
+            continue
+        name = names_by_object.get(object_id) or f"embedded-{object_id}.bin"
+        embedded = source_asset_from_member(
+            container_uri=asset.uri,
+            member_path=f"embedded/{name}",
+            data=payload,
+            # No declared type is read here; the helper guesses from the name.
+            media_type=None,
+            source_adapter="source.pdf",
+            source_role="embedded-file",
+            metadata={
+                "container_path": asset.path,
+                "pdf_object": object_id,
+                "embedded_file_name": name,
+                "render_manifest_compatible": True,
+            },
+        )
+        collected.append(embedded)
+    return collected
+
+
+def _embedded_file_names(objects: dict[int, bytes]) -> dict[int, str]:
+    """Map embedded-file stream object ids to the names in their ``/Filespec``.
+
+    Only ``/Filespec`` objects with an ``/EF`` reference contribute. When a
+    specification yields no usable name, ``embedded-<object id>.bin`` is
+    recorded. A later specification for the same stream replaces an earlier one.
+    """
+    resolved: dict[int, str] = {}
+    for spec_body in objects.values():
+        if FILESPEC_RE.search(spec_body) is None:
+            continue
+        reference = EMBEDDED_FILE_REF_RE.search(spec_body)
+        if reference is not None:
+            target_id = int(reference.group(1))
+            resolved[target_id] = _file_spec_name(spec_body) or f"embedded-{target_id}.bin"
+    return resolved
+
+
+def _file_spec_name(body: bytes) -> str | None:
+    """Return a sanitised file name from a ``/Filespec`` body, or ``None``.
+
+    ``/UF`` is tried before ``/F``. Runs of ``\\``, ``/`` and ``:`` are replaced
+    with ``-`` and surrounding whitespace is stripped, so the result is a single
+    path component. A candidate that is empty after sanitising, or that
+    consists only of dots, is rejected and the next key is tried. Callers fall
+    back to a generated name when ``None`` is returned.
+    """
+    for key in ("UF", "F"):
+        literal_match = re.search(rb"/" + key.encode("ascii") + rb"\s*(\()", body)
+        if literal_match:
+            value, _ = _read_literal_string(body, literal_match.start(1))
+        else:
+            value = _metadata_value(body, key)
+        if not value:
+            continue
+        cleaned = re.sub(r"[\\/:]+", "-", value).strip()
+        # The name is embedded untrusted input that callers join into
+        # ``embedded/<name>``. A dots-only name such as ``..`` would normalise
+        # to a path outside that prefix, so treat it as unusable.
+        if cleaned.strip("."):
+            return cleaned
+    return None
+
+
+def _image_object_ids(
+    page_body: bytes,
+    objects: dict[int, bytes],
+    content_ids: list[int],
+) -> list[int]:
+    """Return the sorted ids of image objects associated with a page.
+
+    Objects referenced directly from *page_body* whose body declares
+    ``/Subtype /Image`` are preferred. When there are none, the page body and
+    its content objects are scanned for an image subtype or a ``Do`` operator.
+    If either is present, every image object in *objects* is returned;
+    otherwise the result is empty.
+    """
+    image_marker = re.compile(rb"/Subtype\s*/Image\b")
+
+    def is_image(object_id: int) -> bool:
+        return image_marker.search(objects.get(object_id, b"")) is not None
+
+    referenced = {int(match.group(1)) for match in REF_RE.finditer(page_body)}
+    direct = sorted(filter(is_image, referenced))
+    if direct:
+        return direct
+    content_bodies = b"\n".join(objects.get(ref, b"") for ref in content_ids)
+    haystack = b"\n".join((page_body, content_bodies))
+    if re.search(rb"/Subtype\s*/Image\b|\bDo\b", haystack) is None:
+        return []
+    # The page cannot be tied to specific images here, so report them all.
+    return sorted(filter(is_image, objects))
+
+
+def _image_signal_asset(
+ asset: SourceAsset,
+ *,
+ page_number: int,
+ page_object_id: int,
+ image_object_ids: list[int],
+ digest_parts: list[bytes],
+) -> SourceAsset:
+ """Build the signal-only ``SourceAsset`` for the image resources of one page.
+
+ The signal id is ``page-NNNN/image-signal`` with the page number zero-padded
+ to four digits. The page number, page object id and image object ids are
+ recorded in the asset metadata.
+ """
+ return source_asset_signal(
+ container_uri=asset.uri,
+ signal_id=f"page-{page_number:04d}/image-signal",
+ media_type="application/x.markitect-pdf-image-signal",
+ source_adapter="source.pdf",
+ source_role="image-signal",
+ # With no digest parts supplied, hash the comma-joined image object ids instead.
+ digest_parts=digest_parts or [bytes_digest(",".join(map(str, image_object_ids)).encode("ascii")).encode("ascii")],
+ metadata={
+ "container_path": asset.path,
+ "page": page_number,
+ "pdf_object": page_object_id,
+ "image_objects": image_object_ids,
+ "render_manifest_compatible": True,
+ },
)
@@ -733,6 +888,10 @@ def _confidence(package: PdfPackage, diagnostics: list[Diagnostic]) -> float:
return max(0.1, 0.75 * coverage)
+def _attachment_role_count(attachments: list[SourceAsset], role: str) -> int:
+    """Count the attachments whose ``source_role`` metadata equals *role*."""
+    roles = [attachment.metadata.get("source_role") for attachment in attachments]
+    return roles.count(role)
+
+
def _warning(
asset: SourceAsset,
code: str,
diff --git a/tests/test_epub3_adapter.py b/tests/test_epub3_adapter.py
index 2f753d0..c156d38 100644
--- a/tests/test_epub3_adapter.py
+++ b/tests/test_epub3_adapter.py
@@ -31,6 +31,8 @@ def test_epub3_descriptor_matches_contract():
assert descriptor.extensions == [".epub"]
assert descriptor.safety["network"] is False
assert descriptor.option_schema["properties"]["skip_boilerplate"]["default"] is True
+ assert descriptor.quality_profile["attachments"] == "read-side-source-assets"
+ assert descriptor.metadata["render_asset_manifest_compatible"] is True
def test_epub3_adapter_matches_epub_assets(tmp_path: Path):
@@ -57,6 +59,7 @@ def test_epub3_adapter_inspects_metadata(tmp_path: Path):
assert result.metadata.language == "en"
assert result.metadata.identifiers["bookid"] == "urn:test-book"
assert result.quality.lossiness == "low"
+ assert result.quality.metadata["attachment_candidates"] == 2
def test_epub3_adapter_normalizes_spine_to_markdown(tmp_path: Path):
@@ -83,6 +86,16 @@ def test_epub3_adapter_normalizes_spine_to_markdown(tmp_path: Path):
"continuation",
]
assert result.document.segments[0].provenance[0].package_path == "EPUB/chapter1.xhtml"
+ assert [attachment.metadata["source_role"] for attachment in result.document.attachments] == [
+ "image",
+ "stylesheet",
+ ]
+ assert result.document.attachments[0].media_type == "image/png"
+ assert result.document.attachments[1].media_type == "text/css"
+ assert result.document.attachments[0].digest.startswith("sha256:")
+ assert result.document.attachments[0].metadata["package_path"] == "EPUB/images/chart.png"
+ assert result.document.attachments[0].metadata["render_manifest_compatible"] is True
+ assert result.document.quality.metadata["attachment_count"] == 2
assert result.document.quality.lossiness == "none"
@@ -114,13 +127,28 @@ def test_epub3_adapter_reports_malformed_missing_container(tmp_path: Path):
assert "container.xml" in result.diagnostics[0].message
+def test_epub3_adapter_reports_unsupported_package_resources(tmp_path: Path):
+ """An unsupported manifest resource is reported as a warning, not a failure.
+
+ The read must stay valid and produce a document whose diagnostics include
+ a ``source.epub3.skipped_resource`` entry.
+ """
+ epub_path = _write_epub(tmp_path, include_unsupported_resource=True)
+ asset = SourceAsset.from_path(epub_path, media_type="application/epub+zip")
+ adapter = epub3_adapter_descriptor().instantiate()
+
+ result = adapter.read(SourceReadRequest(asset=asset))
+
+ assert result.is_valid
+ assert result.document is not None
+ assert any(
+ diagnostic.code == "source.epub3.skipped_resource"
+ for diagnostic in result.document.diagnostics
+ )
+
+
def test_epub3_entry_point_discovery_shape():
registry = discover_source_adapters([FakeEntryPoint()])
assert registry.get("source.epub3").name == "EPUB3"
-def _write_epub(tmp_path: Path) -> Path:
+def _write_epub(tmp_path: Path, *, include_unsupported_resource: bool = False) -> Path:
epub_path = tmp_path / "test-book.epub"
with zipfile.ZipFile(epub_path, "w") as archive:
archive.writestr("mimetype", "application/epub+zip")
@@ -150,13 +178,22 @@ def _write_epub(tmp_path: Path) -> Path:
+
+
+ {unsupported}
-""",
+""".format(
+ unsupported=(
+ ' '
+ if include_unsupported_resource
+ else ""
+ )
+ ),
)
archive.writestr(
"EPUB/nav.xhtml",
@@ -203,4 +240,8 @@ def _write_epub(tmp_path: Path) -> Path: