feat(source): add pdf read adapter

This commit is contained in:
2026-05-14 23:33:31 +02:00
parent 24ee499b50
commit 0c9a418e85
8 changed files with 1176 additions and 13 deletions

View File

@@ -3,8 +3,11 @@
`markitect-filter` provides concrete source-format adapters for converting
external document formats into canonical Markitect Markdown representations.
The first adapter is `source.epub3`, a read-only EPUB3 adapter that implements
the `markitect-tool` source adapter contract.
The first adapters are read-only source adapters that implement the
`markitect-tool` source adapter contract:
- `source.epub3` for EPUB3 packages
- `source.pdf` for digitally-readable PDFs
## Development
@@ -19,4 +22,9 @@ The EPUB3 adapter is registered through:
```toml
[project.entry-points."markitect_tool.source_adapters"]
epub3 = "markitect_filter.adapters:epub3_adapter_descriptor"
pdf = "markitect_filter.adapters:pdf_adapter_descriptor"
```
The first PDF slice is stdlib-only and targets deterministic text extraction
from local, digitally-readable PDFs. OCR, scanned-document recognition, and
layout-perfect reconstruction are intentionally deferred.

45
docs/pdf-adapter.md Normal file
View File

@@ -0,0 +1,45 @@
# PDF Adapter
`source.pdf` is a read-only Markitect source adapter for local,
digitally-readable PDF files.
## Dependency Policy
The first implementation is stdlib-only. The `pdf` optional dependency extra is
present so a richer pure-Python backend can be added later without changing the
adapter boundary or making PDF support mandatory for EPUB3 users.
The adapter does not use network access, external processes, OCR engines,
native system services, or renderer-specific tooling.
## Supported Inputs
- Local files with media type `application/pdf` or extension `.pdf`.
- PDFs with extractable text in page content streams.
- Plain and FlateDecode content streams for the first deterministic slice.
## Deferred Inputs
- Scanned or image-only PDFs that require OCR.
- Encrypted or permission-restricted PDFs.
- Pixel-perfect layout reconstruction.
- Table, figure, annotation, form, signature, and attachment extraction.
- PDF writing/export.
## Options
- `page_range`: optional 1-based page range such as `1-3,5`.
- `include_page_breaks`: when true, prefixes each page segment with a Markdown
page marker comment.
- `normalize_whitespace`: when true, collapses repeated horizontal whitespace
while preserving extracted line breaks.
## Provenance And Quality
The adapter emits one segment per extracted page. Each segment carries
page-level `SourceProvenance` with the source path, source digest, page number,
and originating PDF page object id.
Quality metadata records the extraction backend, document page count, selected
pages, extracted page count, page coverage, skipped pages, warning count,
lossiness, and confidence.

View File

@@ -18,9 +18,11 @@ dev = [
"pytest>=8",
]
epub3 = []
pdf = []
[project.entry-points."markitect_tool.source_adapters"]
epub3 = "markitect_filter.adapters:epub3_adapter_descriptor"
pdf = "markitect_filter.adapters:pdf_adapter_descriptor"
[tool.setuptools.packages.find]
where = ["src"]

View File

@@ -1,5 +1,5 @@
"""Concrete source-format adapters for Markitect."""
from markitect_filter.adapters import epub3_adapter_descriptor
from markitect_filter.adapters import epub3_adapter_descriptor, pdf_adapter_descriptor
__all__ = ["epub3_adapter_descriptor"]
__all__ = ["epub3_adapter_descriptor", "pdf_adapter_descriptor"]

View File

@@ -49,3 +49,59 @@ def epub3_adapter_descriptor() -> SourceAdapterDescriptor:
"dependency_profile": "stdlib",
},
)
def pdf_adapter_descriptor() -> SourceAdapterDescriptor:
    """Build the descriptor that advertises the PDF read adapter.

    The descriptor declares the adapter id, the single supported operation
    (``read``), the media type and extension it handles, the JSON-schema style
    option contract, and its safety and quality profiles.

    Returns:
        A ``SourceAdapterDescriptor`` whose factory creates a ``PdfReadAdapter``.
    """

    def factory():
        # Deferred import: the PDF module is only loaded once an adapter
        # instance is actually requested.
        from markitect_filter.pdf import PdfReadAdapter

        return PdfReadAdapter()

    page_range_option = {
        "type": "string",
        "description": "Optional 1-based page range such as `1-3,5`.",
    }
    page_breaks_option = {
        "type": "boolean",
        "default": False,
        "description": "Prefix each page segment with a Markdown comment page marker.",
    }
    whitespace_option = {
        "type": "boolean",
        "default": True,
        "description": "Collapse repeated horizontal whitespace while preserving extracted line breaks.",
    }
    schema = {
        "type": "object",
        "properties": {
            "page_range": page_range_option,
            "include_page_breaks": page_breaks_option,
            "normalize_whitespace": whitespace_option,
        },
        "additionalProperties": False,
    }
    safety_profile = {
        "reads_files": True,
        "writes_files": False,
        "network": False,
        "external_process": False,
    }
    quality = {
        "text_extraction": "stdlib-pdf-text",
        "images": "diagnostic-only",
        "styles": "ignored",
        "tables": "plain-text-only",
    }
    extra = {
        "format": "PDF",
        "dependency_profile": "stdlib",
    }

    return SourceAdapterDescriptor(
        id="source.pdf",
        version="1",
        name="PDF",
        summary="Read digitally-readable PDFs into canonical Markitect Markdown.",
        operations=["read"],
        media_types=["application/pdf"],
        extensions=[".pdf"],
        factory=factory,
        option_schema=schema,
        safety=safety_profile,
        quality_profile=quality,
        metadata=extra,
    )

782
src/markitect_filter/pdf.py Normal file
View File

@@ -0,0 +1,782 @@
"""PDF read adapter implementation."""
from __future__ import annotations
import re
import zlib
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from markitect_tool.diagnostics import Diagnostic, SourceLocation, has_error
from markitect_tool.source import (
NormalizationQuality,
NormalizedMarkdownDocument,
NormalizedMarkdownSegment,
SourceAdapterMatch,
SourceAdapterMatchRequest,
SourceAsset,
SourceInspectRequest,
SourceInspectResult,
SourceMetadata,
SourceProvenance,
SourceReadRequest,
SourceReadResult,
normalization_cache_key,
)
from markitect_filter.adapters import pdf_adapter_descriptor
# "%PDF-x.y" version marker expected at the start of the file.
PDF_HEADER_RE = re.compile(rb"%PDF-\d\.\d")
# "<id> <gen> obj ... endobj"; groups: object id, generation, object body.
OBJECT_RE = re.compile(rb"(\d+)\s+(\d+)\s+obj\b(.*?)\bendobj", re.DOTALL)
# Payload between the "stream" and "endstream" keywords of an object body.
STREAM_RE = re.compile(rb"stream\r?\n(.*?)\r?\nendstream", re.DOTALL)
# Dictionary entry "/Type /Page"; the trailing \b keeps it from matching "/Pages".
PAGE_TYPE_RE = re.compile(rb"/Type\s*/Page\b")
# Dictionary entry "/Type /Pages".
PAGES_TYPE_RE = re.compile(rb"/Type\s*/Pages\b")
# Indirect reference "<id> <gen> R"; group 1 is the object id.
REF_RE = re.compile(rb"(\d+)\s+\d+\s+R")
# "/Info <id> <gen> R" reference; group 1 is the Info object id.
INFO_REF_RE = re.compile(rb"/Info\s+(\d+)\s+\d+\s+R")
# "/Count <n>" entry; group 1 is the count.
COUNT_RE = re.compile(rb"/Count\s+(\d+)")
@dataclass(frozen=True)
class PdfPage:
    """Text extracted from a single PDF page plus the diagnostics it produced."""

    # 1-based position of the page in the list of discovered page objects.
    number: int
    # Object id of the PDF page object the text came from.
    object_id: int
    # Text extracted from the page's content streams ("" when nothing was found).
    text: str
    # Diagnostics recorded while reading this page's content streams.
    diagnostics: list[Diagnostic]
@dataclass(frozen=True)
class PdfPackage:
    """Result of loading a PDF: metadata, pages, and document-level diagnostics."""

    # Metadata read from the PDF (empty when the file could not be parsed).
    metadata: SourceMetadata
    # Page count declared by the page tree, or the number of discovered pages.
    page_count: int
    # True when the file contains an "/Encrypt" entry.
    encrypted: bool
    # Extracted pages; empty for unreadable, malformed, or encrypted files.
    pages: list[PdfPage]
    # Document-level diagnostics (page-level ones live on each PdfPage).
    diagnostics: list[Diagnostic]
class PdfReadAdapter:
    """Read digitally-readable PDFs into normalized Markitect Markdown.

    The adapter offers three operations: ``can_read`` (match an asset by media
    type or extension), ``inspect`` (metadata and extraction-quality summary),
    and ``read`` (one Markdown segment per page that yields text).
    """

    def __init__(self) -> None:
        self.descriptor = pdf_adapter_descriptor()

    def can_read(self, request: SourceAdapterMatchRequest) -> SourceAdapterMatch:
        """Report whether the asset looks like a PDF.

        A media type of ``application/pdf`` matches with confidence 100, a
        ``.pdf`` extension with confidence 80; anything else is unsupported.
        """
        asset = request.asset
        if asset.media_type == "application/pdf":
            matched, confidence, reason = True, 100, "media_type"
        elif asset.extension == ".pdf":
            matched, confidence, reason = True, 80, "extension"
        else:
            matched, confidence, reason = False, 0, "unsupported"
        return SourceAdapterMatch(
            adapter_id=self.descriptor.id,
            matched=matched,
            confidence=confidence,
            reason=reason,
        )

    def inspect(self, request: SourceInspectRequest) -> SourceInspectResult:
        """Load the PDF and summarize its metadata and extractability.

        Returns:
            A ``SourceInspectResult`` that is valid when loading produced no
            error diagnostics.
        """
        package = _load_pdf(request.asset)
        # Document-level diagnostics only ever hold errors; per-page warnings
        # (missing streams, failed decompression, image-only pages) live on
        # the pages. Fold them in so the warning count and the reported
        # diagnostics reflect what extraction actually ran into.
        diagnostics = [
            *package.diagnostics,
            *(
                diagnostic
                for page in package.pages
                for diagnostic in page.diagnostics
            ),
        ]
        extracted_pages = sum(1 for page in package.pages if page.text.strip())
        failed = has_error(diagnostics)
        return SourceInspectResult(
            asset=request.asset,
            adapter=_adapter_info(request.options),
            metadata=package.metadata,
            capabilities=["read"],
            quality=NormalizationQuality(
                lossiness="unknown" if failed else "medium",
                confidence=_confidence(package, diagnostics),
                warnings=_warning_count(diagnostics),
                metadata={
                    "extraction": "pdf-stdlib-text",
                    "page_count": package.page_count,
                    "pages_with_text": extracted_pages,
                    "encrypted": package.encrypted,
                },
            ),
            diagnostics=diagnostics,
            valid=not failed,
        )

    def read(self, request: SourceReadRequest) -> SourceReadResult:
        """Normalize the selected pages of a PDF into a Markdown document.

        Honors the ``page_range``, ``normalize_whitespace`` and
        ``include_page_breaks`` options. Pages without extractable text are
        skipped with a warning; if no page yields text the read fails.

        Returns:
            A ``SourceReadResult`` carrying the document on success, or only
            diagnostics (``valid=False``) on failure.
        """
        package = _load_pdf(request.asset)
        if has_error(package.diagnostics):
            return SourceReadResult(diagnostics=package.diagnostics, valid=False)

        selected_pages, page_range_diagnostics = _select_pages(
            package.pages,
            request.options.get("page_range"),
            request.asset,
        )
        diagnostics = [*package.diagnostics, *page_range_diagnostics]
        if has_error(diagnostics):
            return SourceReadResult(diagnostics=diagnostics, valid=False)

        normalize_whitespace = bool(request.options.get("normalize_whitespace", True))
        include_page_breaks = bool(request.options.get("include_page_breaks", False))

        segments: list[NormalizedMarkdownSegment] = []
        skipped_pages = 0
        for page in selected_pages:
            page_text = _normalize_markdown(page.text, normalize_whitespace)
            diagnostics.extend(page.diagnostics)
            if not page_text:
                skipped_pages += 1
                diagnostics.append(
                    _warning(
                        request.asset,
                        "source.pdf.empty_page",
                        f"PDF page {page.number} did not produce extractable text.",
                        details={"page": page.number},
                    )
                )
                continue
            markdown = (
                f"<!-- page: {page.number} -->\n\n{page_text}"
                if include_page_breaks
                else page_text
            )
            segments.append(
                NormalizedMarkdownSegment(
                    segment_id=f"page-{page.number:04d}",
                    # Order counts emitted segments, not source page numbers.
                    order=len(segments),
                    markdown=markdown,
                    provenance=[
                        SourceProvenance(
                            source_uri=request.asset.uri,
                            source_path=request.asset.path,
                            page=str(page.number),
                            digest=request.asset.digest,
                            metadata={"pdf_object": page.object_id},
                        )
                    ],
                    metadata={"page": page.number, "pdf_object": page.object_id},
                )
            )

        if not segments:
            diagnostics.append(
                _pdf_error(
                    request.asset,
                    "source.pdf.no_extractable_text",
                    "PDF did not produce any extractable Markdown text.",
                    details={"page_count": package.page_count},
                )
            )
            return SourceReadResult(diagnostics=diagnostics, valid=False)

        markdown = "\n\n".join(segment.markdown.strip() for segment in segments)
        # max(..., 1) guards the division; selected_pages is non-empty here.
        page_coverage = len(segments) / max(len(selected_pages), 1)
        warning_count = _warning_count(diagnostics)
        quality = NormalizationQuality(
            lossiness="medium" if warning_count else "low",
            confidence=max(0.1, 0.75 * page_coverage) if not has_error(diagnostics) else 0.0,
            skipped_items=skipped_pages,
            warnings=warning_count,
            metadata={
                "extraction": "pdf-stdlib-text",
                "page_count": package.page_count,
                "selected_pages": [page.number for page in selected_pages],
                "pages_extracted": len(segments),
                "page_coverage": page_coverage,
            },
        )
        document = NormalizedMarkdownDocument(
            document_id=_document_id(request.asset, package.metadata),
            asset=request.asset,
            metadata=package.metadata,
            markdown=markdown,
            segments=segments,
            quality=quality,
            diagnostics=diagnostics,
            provenance=[
                SourceProvenance(
                    source_uri=request.asset.uri,
                    source_path=request.asset.path,
                    digest=request.asset.digest,
                    metadata={"page_count": package.page_count},
                )
            ],
            adapter=_adapter_info(request.options),
            cache_key=normalization_cache_key(
                asset=request.asset,
                adapter_id=self.descriptor.id,
                adapter_version=self.descriptor.version,
                options=request.options,
            ),
        )
        # NOTE(review): warnings are attached to the document only; the result
        # itself reports an empty diagnostics list on success. Confirm that
        # callers of SourceReadResult expect this before changing it.
        return SourceReadResult(document=document, diagnostics=[], valid=not has_error(diagnostics))
def _load_pdf(asset: SourceAsset) -> PdfPackage:
    """Read a PDF from disk and extract metadata and per-page text.

    Failure modes (unreadable file, missing PDF header, encryption) are
    reported as error diagnostics on a package that carries no pages.

    Args:
        asset: Source asset; ``asset.path`` is used, falling back to ``asset.uri``.

    Returns:
        A ``PdfPackage`` with one ``PdfPage`` per discovered page object.
    """

    def rejected(
        diagnostic: Diagnostic,
        *,
        metadata: SourceMetadata | None = None,
        page_count: int = 0,
        encrypted: bool = False,
    ) -> PdfPackage:
        # Page-less package carrying a single document-level error.
        return PdfPackage(
            metadata=SourceMetadata() if metadata is None else metadata,
            page_count=page_count,
            encrypted=encrypted,
            pages=[],
            diagnostics=[diagnostic],
        )

    try:
        data = Path(asset.path or asset.uri).read_bytes()
    except OSError as exc:
        return rejected(
            _pdf_error(
                asset,
                "source.pdf.unreadable",
                "PDF file could not be read.",
                details={"error": str(exc)},
            )
        )

    if not PDF_HEADER_RE.match(data.lstrip()):
        return rejected(_malformed(asset, "PDF does not start with a PDF header."))

    objects = _parse_objects(data)
    encrypted = bool(re.search(rb"/Encrypt\b", data))
    metadata = _extract_metadata(data, objects, encrypted)
    if encrypted:
        # Encrypted streams cannot be decoded here; report and stop.
        return rejected(
            _pdf_error(
                asset,
                "source.pdf.encrypted",
                "PDF is encrypted or declares an encryption dictionary.",
            ),
            metadata=metadata,
            page_count=_page_count(objects),
            encrypted=True,
        )

    page_ids = _page_object_ids(objects)
    declared_count = _page_count(objects)
    pages: list[PdfPage] = []
    for number, page_id in enumerate(page_ids, start=1):
        body = objects[page_id]
        findings: list[Diagnostic] = []
        refs = _content_refs(body)
        chunks: list[str] = []
        if refs:
            for ref in refs:
                target = objects.get(ref)
                if target is None:
                    findings.append(
                        _warning(
                            asset,
                            "source.pdf.missing_content_stream",
                            f"PDF page {number} references missing content object {ref}.",
                            details={"page": number, "object_id": ref},
                        )
                    )
                    continue
                decoded = _stream_data(target, asset, findings)
                if decoded:
                    chunks.append(_extract_stream_text(decoded))
        elif STREAM_RE.search(body):
            # No /Contents reference: fall back to a stream embedded in the
            # page object itself.
            decoded = _stream_data(body, asset, findings)
            if decoded:
                chunks.append(_extract_stream_text(decoded))

        text = "\n".join(chunk for chunk in chunks if chunk.strip()).strip()
        if not text and _page_may_be_image_only(body, objects, refs):
            findings.append(
                _warning(
                    asset,
                    "source.pdf.image_only_page",
                    f"PDF page {number} appears to contain image content without extractable text.",
                    details={"page": number},
                )
            )
        pages.append(PdfPage(number, page_id, text, findings))

    problems: list[Diagnostic] = []
    if not page_ids:
        problems.append(_malformed(asset, "PDF does not declare any page objects."))
    return PdfPackage(
        metadata=metadata,
        page_count=declared_count or len(page_ids),
        encrypted=False,
        pages=pages,
        diagnostics=problems,
    )
def _parse_objects(data: bytes) -> dict[int, bytes]:
    """Index every ``<id> <gen> obj ... endobj`` body in *data* by object id.

    When an object id occurs more than once, the last occurrence wins.
    """
    bodies: dict[int, bytes] = {}
    for found in OBJECT_RE.finditer(data):
        bodies[int(found.group(1))] = found.group(3)
    return bodies
def _page_object_ids(objects: dict[int, bytes]) -> list[int]:
return [
object_id
for object_id, body in sorted(objects.items())
if PAGE_TYPE_RE.search(body) and not PAGES_TYPE_RE.search(body)
]
def _page_count(objects: dict[int, bytes]) -> int:
counts = [
int(match.group(1))
for body in objects.values()
if PAGES_TYPE_RE.search(body)
for match in [COUNT_RE.search(body)]
if match is not None
]
return max(counts) if counts else 0
def _content_refs(page_body: bytes) -> list[int]:
refs: list[int] = []
array_match = re.search(rb"/Contents\s*\[(.*?)\]", page_body, re.DOTALL)
if array_match:
refs.extend(int(match.group(1)) for match in REF_RE.finditer(array_match.group(1)))
direct_match = re.search(rb"/Contents\s+(\d+)\s+\d+\s+R", page_body)
if direct_match:
ref = int(direct_match.group(1))
if ref not in refs:
refs.append(ref)
return refs
def _stream_data(
    object_body: bytes,
    asset: SourceAsset,
    diagnostics: list[Diagnostic],
) -> bytes:
    """Return the decoded stream payload of a PDF object body.

    Payloads of objects that mention ``/FlateDecode`` are decompressed with
    zlib; all others are returned as stored. Problems are appended to
    *diagnostics* as warnings and an empty byte string is returned.

    Args:
        object_body: Raw object body that should contain a stream.
        asset: Asset used to attribute diagnostics.
        diagnostics: List that receives warnings (mutated in place).
    """
    located = STREAM_RE.search(object_body)
    if located is None:
        diagnostics.append(
            _warning(
                asset,
                "source.pdf.missing_stream",
                "PDF content object does not contain a readable stream.",
            )
        )
        return b""

    payload = located.group(1)
    if b"/FlateDecode" not in object_body:
        return payload

    try:
        return zlib.decompress(payload)
    except zlib.error as exc:
        diagnostics.append(
            _warning(
                asset,
                "source.pdf.flate_decode_failed",
                "PDF FlateDecode stream could not be decompressed.",
                details={"error": str(exc)},
            )
        )
    return b""
def _extract_metadata(
    data: bytes,
    objects: dict[int, bytes],
    encrypted: bool,
) -> SourceMetadata:
    """Build ``SourceMetadata`` from the PDF's Info dictionary.

    The Info object is located through the ``/Info`` reference; when that
    reference or the referenced body is missing, the whole file is searched
    for the metadata keys instead.

    Args:
        data: Complete PDF file content.
        objects: Mapping of object id to raw object body.
        encrypted: Whether the file declares encryption (recorded in ``raw``).
    """
    reference = INFO_REF_RE.search(data)
    referenced = objects.get(int(reference.group(1)), b"") if reference else b""
    info_body = referenced or data

    def lookup(key: str) -> str | None:
        return _metadata_value(info_body, key)

    title = lookup("Title")
    author = lookup("Author")
    creation_date = lookup("CreationDate")
    modification_date = lookup("ModDate")

    candidates = [
        ("subject", lookup("Subject")),
        ("keywords", lookup("Keywords")),
        ("producer", lookup("Producer")),
        ("creator", lookup("Creator")),
        ("creation_date", creation_date),
        ("modification_date", modification_date),
        ("encrypted", encrypted),
    ]
    # Drop absent and empty values; the boolean flag is always kept.
    raw = {name: value for name, value in candidates if value not in (None, "")}

    creators = [author] if author else []
    return SourceMetadata(
        title=title,
        creators=creators,
        publication_date=creation_date,
        raw=raw,
    )
def _metadata_value(body: bytes, key: str) -> str | None:
key_bytes = f"/{key}".encode("ascii")
index = body.find(key_bytes)
if index < 0:
return None
index += len(key_bytes)
while index < len(body) and body[index] in b" \t\r\n":
index += 1
if index >= len(body):
return None
if body[index:index + 1] == b"(":
value, _ = _read_literal_string(body, index)
return _clean_text(value)
if body[index:index + 1] == b"<" and body[index:index + 2] != b"<<":
end = body.find(b">", index + 1)
if end > index:
return _clean_text(_decode_hex(body[index + 1:end]))
return None
def _extract_stream_text(stream: bytes) -> str:
    """Turn a decoded content stream into plain text lines.

    Text-showing operators (``Tj``, ``TJ``, ``'``, ``"``) append to the current
    line; ``T*``, ``TD``, ``'``, ``"`` and a ``Td`` with a negative last operand
    start a new line.

    Returns:
        The extracted lines joined with newlines, trailing whitespace removed.
    """
    tokens = _pdf_tokens(stream)
    lines = [""]
    array_start: int | None = None

    for position, token in enumerate(tokens):
        if isinstance(token, tuple):
            # String operands are consumed by the operator that follows them.
            continue
        if token == "[":
            array_start = position
        elif token == "Tj":
            _append_text(lines, _previous_text(tokens, position))
        elif token == "TJ":
            if array_start is not None:
                pieces = [
                    entry[1]
                    for entry in tokens[array_start:position]
                    if isinstance(entry, tuple) and entry[0] == "text"
                ]
                _append_text(lines, "".join(pieces))
            array_start = None
        elif token in ("'", '"'):
            _new_line(lines)
            _append_text(lines, _previous_text(tokens, position))
        elif token in ("T*", "TD") or (
            token == "Td" and _td_moves_to_new_line(tokens, position)
        ):
            _new_line(lines)

    return "\n".join(line.rstrip() for line in lines).strip()
def _pdf_tokens(stream: bytes) -> list[str | tuple[str, str]]:
tokens: list[str | tuple[str, str]] = []
index = 0
while index < len(stream):
char = stream[index:index + 1]
if char in b" \t\r\n\f\x00":
index += 1
continue
if char == b"%":
newline = stream.find(b"\n", index)
index = len(stream) if newline < 0 else newline + 1
continue
if char == b"(":
text, index = _read_literal_string(stream, index)
tokens.append(("text", text))
continue
if char == b"<" and stream[index:index + 2] != b"<<":
end = stream.find(b">", index + 1)
if end < 0:
break
tokens.append(("text", _decode_hex(stream[index + 1:end])))
index = end + 1
continue
if char in b"[]":
tokens.append(char.decode("ascii"))
index += 1
continue
end = index
while end < len(stream) and stream[end:end + 1] not in b" \t\r\n\f\x00[]()<>/%":
end += 1
if end == index:
index += 1
continue
tokens.append(stream[index:end].decode("latin-1", errors="ignore"))
index = end
return tokens
def _read_literal_string(data: bytes, start: int) -> tuple[str, int]:
depth = 1
index = start + 1
output = bytearray()
while index < len(data) and depth > 0:
char = data[index]
if char == 0x5C:
escaped, index = _read_escape(data, index + 1)
output.extend(escaped)
continue
if char == 0x28:
depth += 1
output.append(char)
elif char == 0x29:
depth -= 1
if depth:
output.append(char)
else:
output.append(char)
index += 1
return output.decode("utf-8", errors="replace"), index
def _read_escape(data: bytes, index: int) -> tuple[bytes, int]:
if index >= len(data):
return b"\\", index
char = data[index]
escapes = {
ord("n"): b"\n",
ord("r"): b"\r",
ord("t"): b"\t",
ord("b"): b"\b",
ord("f"): b"\f",
ord("("): b"(",
ord(")"): b")",
ord("\\"): b"\\",
}
if char in escapes:
return escapes[char], index + 1
if char in b"\r\n":
if char == ord("\r") and index + 1 < len(data) and data[index + 1] == ord("\n"):
return b"", index + 2
return b"", index + 1
if 0x30 <= char <= 0x37:
end = index + 1
while end < min(index + 3, len(data)) and 0x30 <= data[end] <= 0x37:
end += 1
return bytes([int(data[index:end], 8)]), end
return bytes([char]), index + 1
def _decode_hex(value: bytes) -> str:
cleaned = re.sub(rb"\s+", b"", value)
if len(cleaned) % 2:
cleaned += b"0"
try:
return bytes.fromhex(cleaned.decode("ascii")).decode("utf-8", errors="replace")
except ValueError:
return ""
def _previous_text(tokens: list[str | tuple[str, str]], index: int) -> str:
cursor = index - 1
while cursor >= 0:
token = tokens[cursor]
if isinstance(token, tuple) and token[0] == "text":
return token[1]
if token in {"Tj", "TJ", "'", '"', "T*", "Td", "TD"}:
break
cursor -= 1
return ""
def _td_moves_to_new_line(tokens: list[str | tuple[str, str]], index: int) -> bool:
if index < 2:
return False
y = tokens[index - 1]
try:
return isinstance(y, str) and float(y) < 0
except ValueError:
return False
def _append_text(lines: list[str], text: str) -> None:
if text:
lines[-1] += text
def _new_line(lines: list[str]) -> None:
if lines[-1]:
lines.append("")
def _select_pages(
pages: list[PdfPage],
page_range: Any,
asset: SourceAsset,
) -> tuple[list[PdfPage], list[Diagnostic]]:
if page_range in (None, ""):
return pages, []
selected_numbers, diagnostics = _parse_page_range(page_range, len(pages), asset)
selected = [page for page in pages if page.number in selected_numbers]
return selected, diagnostics
def _parse_page_range(
value: Any,
page_count: int,
asset: SourceAsset,
) -> tuple[set[int], list[Diagnostic]]:
diagnostics: list[Diagnostic] = []
selected: set[int] = set()
if isinstance(value, int):
value = str(value)
if isinstance(value, (list, tuple)):
value = ",".join(str(item) for item in value)
if not isinstance(value, str):
return set(), [
_pdf_error(
asset,
"source.pdf.invalid_page_range",
"PDF page_range option must be a string, integer, or list of integers.",
)
]
for part in value.split(","):
part = part.strip()
if not part:
continue
if "-" in part:
start_text, end_text = part.split("-", 1)
if not start_text.isdigit() or not end_text.isdigit():
diagnostics.append(_invalid_page_range(asset, value))
continue
start, end = int(start_text), int(end_text)
if start > end:
diagnostics.append(_invalid_page_range(asset, value))
continue
selected.update(range(start, end + 1))
elif part.isdigit():
selected.add(int(part))
else:
diagnostics.append(_invalid_page_range(asset, value))
out_of_range = sorted(page for page in selected if page < 1 or page > page_count)
if out_of_range:
diagnostics.append(
_pdf_error(
asset,
"source.pdf.page_range_out_of_bounds",
"PDF page_range selects pages outside the document.",
details={"page_range": value, "page_count": page_count, "pages": out_of_range},
)
)
selected = {page for page in selected if 1 <= page <= page_count}
if not selected and not diagnostics:
diagnostics.append(_invalid_page_range(asset, value))
return selected, diagnostics
def _invalid_page_range(asset: SourceAsset, value: str) -> Diagnostic:
    """Build the error diagnostic for a ``page_range`` that cannot be parsed."""
    offending = {"page_range": value}
    return _pdf_error(
        asset,
        "source.pdf.invalid_page_range",
        "PDF page_range option is invalid.",
        details=offending,
    )
def _page_may_be_image_only(
page_body: bytes,
objects: dict[int, bytes],
content_ids: list[int],
) -> bool:
haystack = page_body + b"\n" + b"\n".join(objects.get(ref, b"") for ref in content_ids)
return bool(re.search(rb"/Subtype\s*/Image\b|\bDo\b", haystack))
def _normalize_markdown(text: str, normalize_whitespace: bool) -> str:
lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")
if normalize_whitespace:
lines = [re.sub(r"[ \t]+", " ", line).strip() for line in lines]
else:
lines = [line.rstrip() for line in lines]
cleaned: list[str] = []
blank = False
for line in lines:
if not line:
if not blank and cleaned:
cleaned.append("")
blank = True
continue
cleaned.append(line)
blank = False
return "\n".join(cleaned).strip()
def _document_id(asset: SourceAsset, metadata: SourceMetadata) -> str:
title = metadata.title
if title:
slug = re.sub(r"[^a-z0-9._-]+", "-", title.lower()).strip("-")
if slug:
return f"source.pdf:{slug}"
if asset.digest:
return f"source.pdf:{asset.digest.removeprefix('sha256:')}"
return f"source.pdf:{asset.name or asset.uri}"
def _adapter_info(options: dict[str, Any]) -> dict[str, Any]:
return {
"id": "source.pdf",
"version": "1",
"options": options,
}
def _confidence(package: PdfPackage, diagnostics: list[Diagnostic]) -> float:
    """Estimate extraction confidence from the share of pages with text.

    Returns 0.0 when there is an error diagnostic or no pages; otherwise
    0.75 times the text coverage, with a floor of 0.1.
    """
    if has_error(diagnostics) or not package.pages:
        return 0.0
    with_text = [page for page in package.pages if page.text.strip()]
    coverage = len(with_text) / len(package.pages)
    return max(0.1, 0.75 * coverage)
def _warning(
    asset: SourceAsset,
    code: str,
    message: str,
    *,
    details: dict[str, Any] | None = None,
) -> Diagnostic:
    """Create a warning diagnostic located at the asset's path, if it has one."""
    if asset.path:
        location = SourceLocation(path=asset.path)
    else:
        location = None
    return Diagnostic(
        severity="warning",
        code=code,
        message=message,
        source=location,
        details=details if details else {},
    )
def _pdf_error(
    asset: SourceAsset,
    code: str,
    message: str,
    *,
    details: dict[str, Any] | None = None,
) -> Diagnostic:
    """Create an error diagnostic located at the asset's path, if it has one."""
    if asset.path:
        location = SourceLocation(path=asset.path)
    else:
        location = None
    return Diagnostic(
        severity="error",
        code=code,
        message=message,
        source=location,
        details=details if details else {},
    )
def _malformed(
    asset: SourceAsset,
    message: str,
    details: dict[str, Any] | None = None,
) -> Diagnostic:
    """Create a ``source.malformed`` error diagnostic for a structurally bad PDF."""
    code = "source.malformed"
    return _pdf_error(asset, code, message, details=details)
def _warning_count(diagnostics: list[Diagnostic]) -> int:
return sum(1 for diagnostic in diagnostics if diagnostic.severity == "warning")
def _clean_text(text: str) -> str:
cleaned = re.sub(r"\s+", " ", text).strip()
return re.sub(r"\s+([.,;:!?])", r"\1", cleaned)

242
tests/test_pdf_adapter.py Normal file
View File

@@ -0,0 +1,242 @@
from pathlib import Path
from markitect_tool.source import (
SourceAdapterMatchRequest,
SourceAdapterRegistry,
SourceAsset,
SourceInspectRequest,
SourceReadRequest,
discover_source_adapters,
inspect_source,
normalize_source,
)
from markitect_filter.adapters import pdf_adapter_descriptor
class FakeEntryPoint:
    """Minimal stand-in for an entry point that loads the PDF descriptor factory."""

    # Entry point name, matching the `pdf` key registered in pyproject.toml.
    name = "pdf"

    def load(self):
        """Return the descriptor factory itself (it is not called here)."""
        return pdf_adapter_descriptor
def test_pdf_descriptor_matches_contract():
    """The descriptor advertises the expected id, formats, safety and options."""
    spec = pdf_adapter_descriptor()

    assert spec.id == "source.pdf"
    assert spec.operations == ["read"]
    assert spec.media_types == ["application/pdf"]
    assert spec.extensions == [".pdf"]

    safety = spec.safety
    assert safety["network"] is False
    assert safety["external_process"] is False

    page_breaks = spec.option_schema["properties"]["include_page_breaks"]
    assert page_breaks["default"] is False
    assert spec.metadata["dependency_profile"] == "stdlib"
def test_pdf_adapter_matches_pdf_assets(tmp_path: Path):
    """An asset with the PDF media type is matched with confidence 100."""
    source = SourceAsset.from_path(_write_pdf(tmp_path), media_type="application/pdf")
    reader = pdf_adapter_descriptor().instantiate()

    verdict = reader.can_read(SourceAdapterMatchRequest(asset=source))

    assert verdict.matched
    assert verdict.confidence == 100
def test_pdf_adapter_inspects_metadata(tmp_path: Path):
    """Inspection exposes the fixture's Info metadata and page statistics."""
    source = SourceAsset.from_path(_write_pdf(tmp_path), media_type="application/pdf")
    reader = pdf_adapter_descriptor().instantiate()

    report = reader.inspect(SourceInspectRequest(asset=source))

    assert report.is_valid

    info = report.metadata
    assert info.title == "PDF Fixture"
    assert info.creators == ["Ada Lovelace"]
    assert info.publication_date == "D:20260514093000Z"
    assert info.raw["subject"] == "Source Adapter Test"

    quality = report.quality
    assert quality.lossiness == "medium"
    assert quality.metadata["page_count"] == 2
    assert quality.metadata["pages_with_text"] == 2
def test_pdf_adapter_normalizes_pages_to_markdown(tmp_path: Path):
    """Reading yields one segment per page and the joined Markdown text."""
    source = SourceAsset.from_path(_write_pdf(tmp_path), media_type="application/pdf")
    reader = pdf_adapter_descriptor().instantiate()

    outcome = reader.read(SourceReadRequest(asset=source))

    assert outcome.is_valid
    assert outcome.document is not None

    document = outcome.document
    assert document.document_id == "source.pdf:pdf-fixture"
    assert document.markdown == "Hello PDF\nSecond line\n\nPage two text."

    segment_ids = [segment.segment_id for segment in document.segments]
    assert segment_ids == [
        "page-0001",
        "page-0002",
    ]
    assert document.segments[0].provenance[0].page == "1"
    assert document.quality.lossiness == "low"
    assert document.quality.metadata["page_coverage"] == 1.0
def test_pdf_adapter_applies_page_range_and_page_markers(tmp_path: Path):
    """Selecting page 2 with page markers yields only that page, prefixed."""
    source = SourceAsset.from_path(_write_pdf(tmp_path), media_type="application/pdf")
    reader = pdf_adapter_descriptor().instantiate()
    chosen = {"page_range": "2", "include_page_breaks": True}

    outcome = reader.read(SourceReadRequest(asset=source, options=chosen))

    assert outcome.is_valid
    assert outcome.document is not None
    assert outcome.document.markdown == "<!-- page: 2 -->\n\nPage two text."
    assert outcome.document.segments[0].metadata["page"] == 2
def test_markitect_api_can_use_pdf_registry(tmp_path: Path):
    """The Markitect inspect/normalize helpers work with a PDF-only registry."""
    fixture = _write_pdf(tmp_path)
    adapters = SourceAdapterRegistry([pdf_adapter_descriptor()])

    inspection = inspect_source(fixture, registry=adapters)
    normalization = normalize_source(fixture, registry=adapters)

    assert inspection.is_valid
    assert inspection.metadata.title == "PDF Fixture"

    assert normalization.is_valid
    assert normalization.document is not None
    second_segment = normalization.document.segments[1]
    assert second_segment.markdown == "Page two text."
def test_pdf_adapter_reports_malformed_pdf(tmp_path: Path):
    """A file without a PDF header is rejected with ``source.malformed``."""
    broken = tmp_path / "broken.pdf"
    broken.write_bytes(b"not a pdf")
    source = SourceAsset.from_path(broken, media_type="application/pdf")
    reader = pdf_adapter_descriptor().instantiate()

    outcome = reader.read(SourceReadRequest(asset=source))

    assert not outcome.is_valid
    first = outcome.diagnostics[0]
    assert first.code == "source.malformed"
def test_pdf_adapter_reports_encrypted_pdf(tmp_path: Path):
    """A PDF declaring ``/Encrypt`` is rejected with ``source.pdf.encrypted``."""
    locked = _write_pdf(tmp_path, encrypted=True)
    source = SourceAsset.from_path(locked, media_type="application/pdf")
    reader = pdf_adapter_descriptor().instantiate()

    outcome = reader.read(SourceReadRequest(asset=source))

    assert not outcome.is_valid
    first = outcome.diagnostics[0]
    assert first.code == "source.pdf.encrypted"
def test_pdf_entry_point_discovery_shape():
    """Discovery through an entry point registers the ``source.pdf`` adapter."""
    entry_points = [FakeEntryPoint()]
    discovered = discover_source_adapters(entry_points)

    descriptor = discovered.get("source.pdf")
    assert descriptor.name == "PDF"
def _write_pdf(tmp_path: Path, *, encrypted: bool = False) -> Path:
    """Write a small two-page PDF fixture into *tmp_path* and return its path.

    Object layout: 1 = catalog, 2 = page tree, 3/4 and 5/6 = page and content
    stream pairs, 7 = font, 8 = Info dictionary, and 9 = encryption dictionary
    (only when *encrypted* is true, which also adds ``/Encrypt`` to the
    trailer). The file is named ``encrypted.pdf`` or ``fixture.pdf``.
    """
    pdf_path = tmp_path / ("encrypted.pdf" if encrypted else "fixture.pdf")
    objects: list[tuple[int, bytes]] = []
    page_refs = []
    # Ids 1 and 2 are reserved for the catalog and the page tree.
    next_id = 3
    for page_number, lines in enumerate(
        [
            ["Hello PDF", "Second line"],
            ["Page two text."],
        ],
        start=1,
    ):
        # Each page takes two consecutive ids: the page object and its stream.
        page_id = next_id
        content_id = next_id + 1
        next_id += 2
        page_refs.append(f"{page_id} 0 R")
        stream = _page_stream(lines)
        objects.append(
            (
                page_id,
                (
                    f"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] "
                    f"/Resources << /Font << /F1 7 0 R >> >> /Contents {content_id} 0 R >>"
                ).encode("ascii"),
            )
        )
        objects.append(
            (
                content_id,
                b"<< /Length "
                + str(len(stream)).encode("ascii")
                + b" >>\nstream\n"
                + stream
                + b"\nendstream",
            )
        )
    objects.extend(
        [
            (1, b"<< /Type /Catalog /Pages 2 0 R >>"),
            (
                2,
                (
                    f"<< /Type /Pages /Kids [{' '.join(page_refs)}] "
                    f"/Count {len(page_refs)} >>"
                ).encode("ascii"),
            ),
            (7, b"<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>"),
            (
                8,
                b"<< /Title (PDF Fixture) /Author (Ada Lovelace) "
                b"/Subject (Source Adapter Test) /Keywords (markitect pdf) "
                b"/Producer (markitect-filter tests) /CreationDate (D:20260514093000Z) >>",
            ),
        ]
    )
    if encrypted:
        objects.append((9, b"<< /Filter /Standard /V 1 /R 2 >>"))
    # Emit objects in ascending id order.
    objects.sort(key=lambda item: item[0])
    # Header line followed by a comment line of high-bit bytes.
    header = b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n"
    content = bytearray(header)
    max_id = max(object_id for object_id, _ in objects)
    offsets = {0: 0}
    for object_id, body in objects:
        # Record the byte offset of each object for the xref table below.
        offsets[object_id] = len(content)
        content.extend(f"{object_id} 0 obj\n".encode("ascii"))
        content.extend(body)
        content.extend(b"\nendobj\n")
    xref_offset = len(content)
    content.extend(f"xref\n0 {max_id + 1}\n".encode("ascii"))
    content.extend(b"0000000000 65535 f \n")
    for object_id in range(1, max_id + 1):
        content.extend(f"{offsets.get(object_id, 0):010d} 00000 n \n".encode("ascii"))
    trailer = f"trailer\n<< /Size {max_id + 1} /Root 1 0 R /Info 8 0 R".encode("ascii")
    if encrypted:
        trailer += b" /Encrypt 9 0 R"
    trailer += b" >>\n"
    content.extend(trailer)
    content.extend(f"startxref\n{xref_offset}\n%%EOF\n".encode("ascii"))
    pdf_path.write_bytes(bytes(content))
    return pdf_path
def _page_stream(lines: list[str]) -> bytes:
parts = ["BT", "/F1 12 Tf", "72 720 Td"]
for index, line in enumerate(lines):
if index:
parts.append("T*")
parts.append(f"({_pdf_literal(line)}) Tj")
parts.append("ET")
return "\n".join(parts).encode("ascii")
def _pdf_literal(text: str) -> str:
return text.replace("\\", "\\\\").replace("(", "\\(").replace(")", "\\)")

View File

@@ -3,10 +3,10 @@ id: MKTF-WP-0002
type: workplan
title: "PDF Read Adapter"
domain: markitect
status: todo
status: done
owner: markitect-filter
topic_slug: markitect
planning_priority: P1
planning_priority: complete
planning_order: 20
depends_on_workplans:
- MKTF-WP-0001
@@ -32,7 +32,7 @@ The first PDF slice should target deterministic text extraction from
digitally-readable PDFs. It should preserve page-level provenance and make
extraction uncertainty visible through diagnostics and quality signals.
## Planned Scope
## Implemented Scope
- Optional PDF dependency profile isolated behind a `pdf` extra.
- Entry point group registration:
@@ -72,7 +72,7 @@ extraction uncertainty visible through diagnostics and quality signals.
```task
id: MKTF-WP-0002-T001
status: todo
status: done
priority: high
state_hub_task_id: "2ce51bb9-9182-4927-90d1-4c08433b5ddb"
```
@@ -91,11 +91,16 @@ The decision should document:
Output: dependency decision, option contract, and implementation notes.
Implemented: `docs/pdf-adapter.md`, `pyproject.toml`, and the descriptor
metadata document a stdlib first slice, a reserved `pdf` extra, local
digitally-readable PDF support, page range/page marker/whitespace options, and
deferred OCR/layout-heavy backends.
## P2.2 - Add descriptor and entry point registration
```task
id: MKTF-WP-0002-T002
status: todo
status: done
priority: high
state_hub_task_id: "27d754a9-59ae-4419-946b-f1f847bd3b10"
```
@@ -116,11 +121,15 @@ The descriptor should define:
Output: descriptor, entry point registration, and descriptor tests.
Implemented: `pdf_adapter_descriptor` is registered through
`markitect_tool.source_adapters`, exported from the package, and covered by
descriptor and discovery tests.
## P2.3 - Implement PDF inspection
```task
id: MKTF-WP-0002-T003
status: todo
status: done
priority: high
state_hub_task_id: "33b594e6-d12a-46d5-bc50-6ec1aebaaf65"
```
@@ -138,11 +147,15 @@ Inspection should report:
Output: inspection implementation and tests with small fixtures.
Implemented: `PdfReadAdapter.inspect` reports metadata, page count,
extractability signals, encryption status, quality metadata, and malformed or
encrypted diagnostics using deterministic generated fixtures.
## P2.4 - Normalize page text into Markitect Markdown
```task
id: MKTF-WP-0002-T004
status: todo
status: done
priority: high
state_hub_task_id: "30c0c777-a4e4-43d1-ac24-6a0f84c7b761"
```
@@ -162,11 +175,15 @@ Normalization should:
Output: read implementation and normalization tests.
Implemented: `PdfReadAdapter.read` extracts ordered page text into stable
page segments, applies page ranges, supports optional page markers, preserves
page provenance, and uses the Markitect cache-key helpers.
## P2.5 - Add diagnostics and quality semantics
```task
id: MKTF-WP-0002-T005
status: todo
status: done
priority: high
state_hub_task_id: "8b6a190a-350b-4c61-ac4f-1900673a8cd2"
```
@@ -188,11 +205,17 @@ skipped pages, lossiness, and confidence.
Output: diagnostic helpers, quality rules, and tests.
Implemented: PDF diagnostics cover malformed files, unreadable files,
encrypted PDFs, invalid page ranges, missing/empty streams, image-only pages,
empty extraction, and stream decompression failures. Quality metadata records
backend, page count, selected pages, extracted pages, coverage, warnings, and
skipped pages.
## P2.6 - Add fixtures, docs, and validation
```task
id: MKTF-WP-0002-T006
status: todo
status: done
priority: medium
state_hub_task_id: "af597160-e189-42be-8479-c6e0f467d238"
```
@@ -211,6 +234,11 @@ Validation should cover:
Output: tests, README update, and validation command.
Implemented: generated PDF fixtures and tests cover descriptor shape, matching,
metadata inspection, normalization, page range markers, malformed PDFs,
encrypted PDFs, registry use, entry point discovery, README documentation, and
the validation command below.
## Validation
Run from `markitect-filter`: