feat(source): add pdf read adapter

This commit is contained in:
2026-05-14 23:33:31 +02:00
parent 24ee499b50
commit 0c9a418e85
8 changed files with 1176 additions and 13 deletions

View File

@@ -1,5 +1,5 @@
"""Concrete source-format adapters for Markitect."""
from markitect_filter.adapters import epub3_adapter_descriptor
from markitect_filter.adapters import epub3_adapter_descriptor, pdf_adapter_descriptor
__all__ = ["epub3_adapter_descriptor"]
__all__ = ["epub3_adapter_descriptor", "pdf_adapter_descriptor"]

View File

@@ -49,3 +49,59 @@ def epub3_adapter_descriptor() -> SourceAdapterDescriptor:
"dependency_profile": "stdlib",
},
)
def pdf_adapter_descriptor() -> SourceAdapterDescriptor:
    """Build the descriptor that advertises the lightweight PDF read adapter.

    The adapter class is imported inside the factory rather than at module
    import time, so building the descriptor does not import
    ``markitect_filter.pdf``.
    """

    def factory():
        from markitect_filter.pdf import PdfReadAdapter

        return PdfReadAdapter()

    # JSON-schema style description of the options accepted by ``read``.
    option_properties = {
        "page_range": {
            "type": "string",
            "description": "Optional 1-based page range such as `1-3,5`.",
        },
        "include_page_breaks": {
            "type": "boolean",
            "default": False,
            "description": "Prefix each page segment with a Markdown comment page marker.",
        },
        "normalize_whitespace": {
            "type": "boolean",
            "default": True,
            "description": "Collapse repeated horizontal whitespace while preserving extracted line breaks.",
        },
    }
    option_schema = {
        "type": "object",
        "properties": option_properties,
        "additionalProperties": False,
    }

    # Declared side effects: the adapter only reads local files.
    safety = {
        "reads_files": True,
        "writes_files": False,
        "network": False,
        "external_process": False,
    }

    # Declared fidelity of the extraction for each content category.
    quality_profile = {
        "text_extraction": "stdlib-pdf-text",
        "images": "diagnostic-only",
        "styles": "ignored",
        "tables": "plain-text-only",
    }

    metadata = {
        "format": "PDF",
        "dependency_profile": "stdlib",
    }

    return SourceAdapterDescriptor(
        id="source.pdf",
        version="1",
        name="PDF",
        summary="Read digitally-readable PDFs into canonical Markitect Markdown.",
        operations=["read"],
        media_types=["application/pdf"],
        extensions=[".pdf"],
        factory=factory,
        option_schema=option_schema,
        safety=safety,
        quality_profile=quality_profile,
        metadata=metadata,
    )

782
src/markitect_filter/pdf.py Normal file
View File

@@ -0,0 +1,782 @@
"""PDF read adapter implementation."""
from __future__ import annotations
import re
import zlib
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from markitect_tool.diagnostics import Diagnostic, SourceLocation, has_error
from markitect_tool.source import (
NormalizationQuality,
NormalizedMarkdownDocument,
NormalizedMarkdownSegment,
SourceAdapterMatch,
SourceAdapterMatchRequest,
SourceAsset,
SourceInspectRequest,
SourceInspectResult,
SourceMetadata,
SourceProvenance,
SourceReadRequest,
SourceReadResult,
normalization_cache_key,
)
from markitect_filter.adapters import pdf_adapter_descriptor
PDF_HEADER_RE = re.compile(rb"%PDF-\d\.\d")
OBJECT_RE = re.compile(rb"(\d+)\s+(\d+)\s+obj\b(.*?)\bendobj", re.DOTALL)
STREAM_RE = re.compile(rb"stream\r?\n(.*?)\r?\nendstream", re.DOTALL)
PAGE_TYPE_RE = re.compile(rb"/Type\s*/Page\b")
PAGES_TYPE_RE = re.compile(rb"/Type\s*/Pages\b")
REF_RE = re.compile(rb"(\d+)\s+\d+\s+R")
INFO_REF_RE = re.compile(rb"/Info\s+(\d+)\s+\d+\s+R")
COUNT_RE = re.compile(rb"/Count\s+(\d+)")
@dataclass(frozen=True)
class PdfPage:
    """Text and diagnostics extracted for a single page object of a PDF."""

    # 1-based position of the page in the order the loader enumerated page objects.
    number: int
    # Object number of the page dictionary inside the PDF file.
    object_id: int
    # Text recovered from the page's content streams; empty when nothing was extracted.
    text: str
    # Warnings collected while decoding this page's streams.
    diagnostics: list[Diagnostic]
@dataclass(frozen=True)
class PdfPackage:
    """Result of loading one PDF file: metadata, pages, and file-level diagnostics."""

    # Document information (title, creators, dates) scraped from the file.
    metadata: SourceMetadata
    # Page count declared by a /Pages node, or the number of page objects found.
    page_count: int
    # True when the file contains an /Encrypt entry; no pages are extracted then.
    encrypted: bool
    # Extracted pages, in the order the loader enumerated page objects.
    pages: list[PdfPage]
    # File-level diagnostics; per-page diagnostics live on each PdfPage.
    diagnostics: list[Diagnostic]
class PdfReadAdapter:
    """Source adapter that converts text-bearing PDFs into normalized Markdown.

    Extraction is done by the standard-library parser in this module; every
    page that yields text becomes one Markdown segment.
    """

    def __init__(self) -> None:
        self.descriptor = pdf_adapter_descriptor()

    def can_read(self, request: SourceAdapterMatchRequest) -> SourceAdapterMatch:
        """Report whether the asset looks like a PDF.

        A matching media type gives confidence 100, a ``.pdf`` extension
        gives 80, and anything else is reported as unsupported.
        """
        asset = request.asset
        # Most specific signal first; the first rule that holds decides.
        if asset.media_type == "application/pdf":
            matched, confidence, reason = True, 100, "media_type"
        elif asset.extension == ".pdf":
            matched, confidence, reason = True, 80, "extension"
        else:
            matched, confidence, reason = False, 0, "unsupported"
        return SourceAdapterMatch(
            adapter_id=self.descriptor.id,
            matched=matched,
            confidence=confidence,
            reason=reason,
        )

    def inspect(self, request: SourceInspectRequest) -> SourceInspectResult:
        """Load the PDF and summarize its metadata and expected extraction quality."""
        package = _load_pdf(request.asset)
        # NOTE(review): only file-level diagnostics are reported here; warnings
        # attached to individual pages are not included -- confirm intended.
        diagnostics = package.diagnostics
        pages_with_text = len([page for page in package.pages if page.text.strip()])
        adapter = _adapter_info(request.options)
        quality = NormalizationQuality(
            lossiness="unknown" if has_error(diagnostics) else "medium",
            confidence=_confidence(package, diagnostics),
            warnings=_warning_count(diagnostics),
            metadata={
                "extraction": "pdf-stdlib-text",
                "page_count": package.page_count,
                "pages_with_text": pages_with_text,
                "encrypted": package.encrypted,
            },
        )
        return SourceInspectResult(
            asset=request.asset,
            adapter=adapter,
            metadata=package.metadata,
            capabilities=["read"],
            quality=quality,
            diagnostics=diagnostics,
            valid=not has_error(diagnostics),
        )

    def read(self, request: SourceReadRequest) -> SourceReadResult:
        """Extract the selected pages and assemble a normalized Markdown document.

        Returns an invalid result (no document) when the file cannot be
        loaded, the ``page_range`` option is rejected, or no selected page
        yields any text.
        """
        asset = request.asset
        options = request.options

        package = _load_pdf(asset)
        if has_error(package.diagnostics):
            return SourceReadResult(diagnostics=package.diagnostics, valid=False)

        selected_pages, range_diagnostics = _select_pages(
            package.pages,
            options.get("page_range"),
            asset,
        )
        diagnostics = list(package.diagnostics)
        diagnostics.extend(range_diagnostics)
        if has_error(diagnostics):
            return SourceReadResult(diagnostics=diagnostics, valid=False)

        collapse_whitespace = bool(options.get("normalize_whitespace", True))
        mark_pages = bool(options.get("include_page_breaks", False))

        segments: list[NormalizedMarkdownSegment] = []
        for page in selected_pages:
            body = _normalize_markdown(page.text, collapse_whitespace)
            diagnostics.extend(page.diagnostics)
            if not body:
                # Pages without text are reported and left out of the document.
                diagnostics.append(
                    _warning(
                        asset,
                        "source.pdf.empty_page",
                        f"PDF page {page.number} did not produce extractable text.",
                        details={"page": page.number},
                    )
                )
                continue
            if mark_pages:
                body = f"<!-- page: {page.number} -->\n\n{body}"
            page_provenance = SourceProvenance(
                source_uri=asset.uri,
                source_path=asset.path,
                page=str(page.number),
                digest=asset.digest,
                metadata={"pdf_object": page.object_id},
            )
            segments.append(
                NormalizedMarkdownSegment(
                    segment_id=f"page-{page.number:04d}",
                    order=len(segments),
                    markdown=body,
                    provenance=[page_provenance],
                    metadata={"page": page.number, "pdf_object": page.object_id},
                )
            )

        if not segments:
            diagnostics.append(
                _pdf_error(
                    asset,
                    "source.pdf.no_extractable_text",
                    "PDF did not produce any extractable Markdown text.",
                    details={"page_count": package.page_count},
                )
            )
            return SourceReadResult(diagnostics=diagnostics, valid=False)

        # Every selected page either produced a segment or was skipped.
        skipped_pages = len(selected_pages) - len(segments)
        document_markdown = "\n\n".join(segment.markdown.strip() for segment in segments)
        page_coverage = len(segments) / max(len(selected_pages), 1)
        warning_count = _warning_count(diagnostics)
        failed = has_error(diagnostics)

        quality = NormalizationQuality(
            lossiness="medium" if warning_count else "low",
            confidence=0.0 if failed else max(0.1, 0.75 * page_coverage),
            skipped_items=skipped_pages,
            warnings=warning_count,
            metadata={
                "extraction": "pdf-stdlib-text",
                "page_count": package.page_count,
                "selected_pages": [page.number for page in selected_pages],
                "pages_extracted": len(segments),
                "page_coverage": page_coverage,
            },
        )
        document_provenance = SourceProvenance(
            source_uri=asset.uri,
            source_path=asset.path,
            digest=asset.digest,
            metadata={"page_count": package.page_count},
        )
        document = NormalizedMarkdownDocument(
            document_id=_document_id(asset, package.metadata),
            asset=asset,
            metadata=package.metadata,
            markdown=document_markdown,
            segments=segments,
            quality=quality,
            diagnostics=diagnostics,
            provenance=[document_provenance],
            adapter=_adapter_info(options),
            cache_key=normalization_cache_key(
                asset=asset,
                adapter_id=self.descriptor.id,
                adapter_version=self.descriptor.version,
                options=options,
            ),
        )
        # NOTE(review): the diagnostics travel on the document; the result itself
        # is returned with an empty list -- confirm this is intentional.
        return SourceReadResult(document=document, diagnostics=[], valid=not failed)
def _load_pdf(asset: SourceAsset) -> PdfPackage:
    """Read a PDF from disk and extract per-page text.

    The file is located via ``asset.path`` (falling back to ``asset.uri``).
    Unreadable files, files without a PDF header, and encrypted files
    produce a package with an error diagnostic and no pages.
    """

    def rejected(diagnostic: Diagnostic) -> PdfPackage:
        # Package returned when the file fails before any structure is parsed.
        return PdfPackage(
            metadata=SourceMetadata(),
            page_count=0,
            encrypted=False,
            pages=[],
            diagnostics=[diagnostic],
        )

    try:
        data = Path(asset.path or asset.uri).read_bytes()
    except OSError as exc:
        return rejected(
            _pdf_error(
                asset,
                "source.pdf.unreadable",
                "PDF file could not be read.",
                details={"error": str(exc)},
            )
        )

    # Leading whitespace before the header is tolerated.
    if PDF_HEADER_RE.match(data.lstrip()) is None:
        return rejected(_malformed(asset, "PDF does not start with a PDF header."))

    objects = _parse_objects(data)
    encrypted = re.search(rb"/Encrypt\b", data) is not None
    metadata = _extract_metadata(data, objects, encrypted)
    if encrypted:
        return PdfPackage(
            metadata=metadata,
            page_count=_page_count(objects),
            encrypted=True,
            pages=[],
            diagnostics=[
                _pdf_error(
                    asset,
                    "source.pdf.encrypted",
                    "PDF is encrypted or declares an encryption dictionary.",
                )
            ],
        )

    page_ids = _page_object_ids(objects)
    # Prefer the declared page count; fall back to the number of page objects.
    page_count = _page_count(objects) or len(page_ids)

    pages: list[PdfPage] = []
    for page_number, object_id in enumerate(page_ids, start=1):
        page_body = objects[object_id]
        page_diagnostics: list[Diagnostic] = []
        content_ids = _content_refs(page_body)
        text_parts: list[str] = []

        # A page object without /Contents references may carry its stream inline.
        if not content_ids and STREAM_RE.search(page_body):
            inline_stream = _stream_data(page_body, asset, page_diagnostics)
            if inline_stream:
                text_parts.append(_extract_stream_text(inline_stream))

        for content_id in content_ids:
            content_body = objects.get(content_id)
            if content_body is None:
                page_diagnostics.append(
                    _warning(
                        asset,
                        "source.pdf.missing_content_stream",
                        f"PDF page {page_number} references missing content object {content_id}.",
                        details={"page": page_number, "object_id": content_id},
                    )
                )
                continue
            referenced_stream = _stream_data(content_body, asset, page_diagnostics)
            if referenced_stream:
                text_parts.append(_extract_stream_text(referenced_stream))

        non_blank_parts = [part for part in text_parts if part.strip()]
        text = "\n".join(non_blank_parts).strip()
        if _page_may_be_image_only(page_body, objects, content_ids) and not text:
            page_diagnostics.append(
                _warning(
                    asset,
                    "source.pdf.image_only_page",
                    f"PDF page {page_number} appears to contain image content without extractable text.",
                    details={"page": page_number},
                )
            )
        pages.append(PdfPage(page_number, object_id, text, page_diagnostics))

    diagnostics: list[Diagnostic] = []
    if not page_ids:
        diagnostics.append(_malformed(asset, "PDF does not declare any page objects."))

    return PdfPackage(
        metadata=metadata,
        page_count=page_count,
        encrypted=False,
        pages=pages,
        diagnostics=diagnostics,
    )
def _parse_objects(data: bytes) -> dict[int, bytes]:
    """Map each indirect object's number to the raw bytes between ``obj`` and ``endobj``.

    When an object number occurs more than once, the last occurrence in the
    file wins.
    """
    objects: dict[int, bytes] = {}
    for found in OBJECT_RE.finditer(data):
        object_number = int(found.group(1))
        objects[object_number] = found.group(3)
    return objects
def _page_object_ids(objects: dict[int, bytes]) -> list[int]:
    """Return the numbers of all page objects, in ascending object-number order.

    An object counts as a page when its body declares ``/Type /Page`` and
    does not declare ``/Type /Pages``.
    """
    page_ids: list[int] = []
    for object_id, body in sorted(objects.items()):
        if not PAGE_TYPE_RE.search(body):
            continue
        if PAGES_TYPE_RE.search(body):
            continue
        page_ids.append(object_id)
    return page_ids
def _page_count(objects: dict[int, bytes]) -> int:
    """Return the largest ``/Count`` declared by any ``/Pages`` object, or 0 if none."""
    largest = 0
    for body in objects.values():
        if not PAGES_TYPE_RE.search(body):
            continue
        declared = COUNT_RE.search(body)
        if declared is None:
            continue
        # The pattern only matches unsigned digits, so 0 is a safe floor.
        largest = max(largest, int(declared.group(1)))
    return largest
def _content_refs(page_body: bytes) -> list[int]:
    """Return the object numbers a page dictionary references via ``/Contents``.

    Both the array form (``/Contents [4 0 R 5 0 R]``) and the single
    reference form (``/Contents 4 0 R``) are recognized.
    """
    array_form = re.search(rb"/Contents\s*\[(.*?)\]", page_body, re.DOTALL)
    if array_form is None:
        refs: list[int] = []
    else:
        refs = [int(ref.group(1)) for ref in REF_RE.finditer(array_form.group(1))]

    single_form = re.search(rb"/Contents\s+(\d+)\s+\d+\s+R", page_body)
    if single_form is not None:
        object_id = int(single_form.group(1))
        if object_id not in refs:
            refs.append(object_id)
    return refs
def _stream_data(
    object_body: bytes,
    asset: SourceAsset,
    diagnostics: list[Diagnostic],
) -> bytes:
    """Return the stream payload of an object, inflating it when FlateDecode is declared.

    Problems are appended to ``diagnostics`` as warnings and an empty byte
    string is returned instead of raising.
    """
    found = STREAM_RE.search(object_body)
    if found is None:
        diagnostics.append(
            _warning(
                asset,
                "source.pdf.missing_stream",
                "PDF content object does not contain a readable stream.",
            )
        )
        return b""

    payload = found.group(1)
    if b"/FlateDecode" not in object_body:
        # No Flate filter declared: hand back the bytes untouched.
        return payload

    try:
        return zlib.decompress(payload)
    except zlib.error as exc:
        diagnostics.append(
            _warning(
                asset,
                "source.pdf.flate_decode_failed",
                "PDF FlateDecode stream could not be decompressed.",
                details={"error": str(exc)},
            )
        )
    return b""
def _extract_metadata(
    data: bytes,
    objects: dict[int, bytes],
    encrypted: bool,
) -> SourceMetadata:
    """Collect document information entries (title, author, dates, ...).

    Values are read from the object referenced by ``/Info``. When that
    reference is absent or its object is missing or empty, the whole file is
    scanned for the keys instead.
    """
    info_ref = INFO_REF_RE.search(data)
    referenced = objects.get(int(info_ref.group(1)), b"") if info_ref else b""
    info_body = referenced or data

    def lookup(key: str) -> str | None:
        return _metadata_value(info_body, key)

    title = lookup("Title")
    author = lookup("Author")
    creation_date = lookup("CreationDate")
    modification_date = lookup("ModDate")

    candidates = {
        "subject": lookup("Subject"),
        "keywords": lookup("Keywords"),
        "producer": lookup("Producer"),
        "creator": lookup("Creator"),
        "creation_date": creation_date,
        "modification_date": modification_date,
        "encrypted": encrypted,
    }
    # Drop entries that are missing or empty; the boolean flag is always kept.
    raw = {}
    for key, value in candidates.items():
        if value not in (None, ""):
            raw[key] = value

    return SourceMetadata(
        title=title,
        creators=[author] if author else [],
        publication_date=creation_date,
        raw=raw,
    )
def _metadata_value(body: bytes, key: str) -> str | None:
key_bytes = f"/{key}".encode("ascii")
index = body.find(key_bytes)
if index < 0:
return None
index += len(key_bytes)
while index < len(body) and body[index] in b" \t\r\n":
index += 1
if index >= len(body):
return None
if body[index:index + 1] == b"(":
value, _ = _read_literal_string(body, index)
return _clean_text(value)
if body[index:index + 1] == b"<" and body[index:index + 2] != b"<<":
end = body.find(b">", index + 1)
if end > index:
return _clean_text(_decode_hex(body[index + 1:end]))
return None
def _extract_stream_text(stream: bytes) -> str:
    """Turn a decoded content stream into plain text, one entry per text line.

    Text-showing operators (``Tj``, ``TJ``, ``'``, ``"``) contribute text;
    ``T*``, ``TD``, ``'``, ``"`` and a ``Td`` whose last operand is negative
    start a new line.
    """
    tokens = _pdf_tokens(stream)
    lines = [""]
    array_start: int | None = None  # position of the most recent "[" token

    for position, token in enumerate(tokens):
        if token == "[":
            array_start = position
        elif token == "Tj":
            _append_text(lines, _previous_text(tokens, position))
        elif token == "TJ":
            if array_start is not None:
                # Concatenate the string operands; numeric adjustments are ignored.
                pieces = [
                    item[1]
                    for item in tokens[array_start:position]
                    if isinstance(item, tuple) and item[0] == "text"
                ]
                _append_text(lines, "".join(pieces))
            array_start = None
        elif token in ("'", '"'):
            _new_line(lines)
            _append_text(lines, _previous_text(tokens, position))
        elif token in ("T*", "TD"):
            _new_line(lines)
        elif token == "Td" and _td_moves_to_new_line(tokens, position):
            _new_line(lines)

    trimmed = [line.rstrip() for line in lines]
    return "\n".join(trimmed).strip()
def _pdf_tokens(stream: bytes) -> list[str | tuple[str, str]]:
tokens: list[str | tuple[str, str]] = []
index = 0
while index < len(stream):
char = stream[index:index + 1]
if char in b" \t\r\n\f\x00":
index += 1
continue
if char == b"%":
newline = stream.find(b"\n", index)
index = len(stream) if newline < 0 else newline + 1
continue
if char == b"(":
text, index = _read_literal_string(stream, index)
tokens.append(("text", text))
continue
if char == b"<" and stream[index:index + 2] != b"<<":
end = stream.find(b">", index + 1)
if end < 0:
break
tokens.append(("text", _decode_hex(stream[index + 1:end])))
index = end + 1
continue
if char in b"[]":
tokens.append(char.decode("ascii"))
index += 1
continue
end = index
while end < len(stream) and stream[end:end + 1] not in b" \t\r\n\f\x00[]()<>/%":
end += 1
if end == index:
index += 1
continue
tokens.append(stream[index:end].decode("latin-1", errors="ignore"))
index = end
return tokens
def _read_literal_string(data: bytes, start: int) -> tuple[str, int]:
depth = 1
index = start + 1
output = bytearray()
while index < len(data) and depth > 0:
char = data[index]
if char == 0x5C:
escaped, index = _read_escape(data, index + 1)
output.extend(escaped)
continue
if char == 0x28:
depth += 1
output.append(char)
elif char == 0x29:
depth -= 1
if depth:
output.append(char)
else:
output.append(char)
index += 1
return output.decode("utf-8", errors="replace"), index
def _read_escape(data: bytes, index: int) -> tuple[bytes, int]:
if index >= len(data):
return b"\\", index
char = data[index]
escapes = {
ord("n"): b"\n",
ord("r"): b"\r",
ord("t"): b"\t",
ord("b"): b"\b",
ord("f"): b"\f",
ord("("): b"(",
ord(")"): b")",
ord("\\"): b"\\",
}
if char in escapes:
return escapes[char], index + 1
if char in b"\r\n":
if char == ord("\r") and index + 1 < len(data) and data[index + 1] == ord("\n"):
return b"", index + 2
return b"", index + 1
if 0x30 <= char <= 0x37:
end = index + 1
while end < min(index + 3, len(data)) and 0x30 <= data[end] <= 0x37:
end += 1
return bytes([int(data[index:end], 8)]), end
return bytes([char]), index + 1
def _decode_hex(value: bytes) -> str:
cleaned = re.sub(rb"\s+", b"", value)
if len(cleaned) % 2:
cleaned += b"0"
try:
return bytes.fromhex(cleaned.decode("ascii")).decode("utf-8", errors="replace")
except ValueError:
return ""
def _previous_text(tokens: list[str | tuple[str, str]], index: int) -> str:
cursor = index - 1
while cursor >= 0:
token = tokens[cursor]
if isinstance(token, tuple) and token[0] == "text":
return token[1]
if token in {"Tj", "TJ", "'", '"', "T*", "Td", "TD"}:
break
cursor -= 1
return ""
def _td_moves_to_new_line(tokens: list[str | tuple[str, str]], index: int) -> bool:
if index < 2:
return False
y = tokens[index - 1]
try:
return isinstance(y, str) and float(y) < 0
except ValueError:
return False
def _append_text(lines: list[str], text: str) -> None:
if text:
lines[-1] += text
def _new_line(lines: list[str]) -> None:
if lines[-1]:
lines.append("")
def _select_pages(
pages: list[PdfPage],
page_range: Any,
asset: SourceAsset,
) -> tuple[list[PdfPage], list[Diagnostic]]:
if page_range in (None, ""):
return pages, []
selected_numbers, diagnostics = _parse_page_range(page_range, len(pages), asset)
selected = [page for page in pages if page.number in selected_numbers]
return selected, diagnostics
def _parse_page_range(
value: Any,
page_count: int,
asset: SourceAsset,
) -> tuple[set[int], list[Diagnostic]]:
diagnostics: list[Diagnostic] = []
selected: set[int] = set()
if isinstance(value, int):
value = str(value)
if isinstance(value, (list, tuple)):
value = ",".join(str(item) for item in value)
if not isinstance(value, str):
return set(), [
_pdf_error(
asset,
"source.pdf.invalid_page_range",
"PDF page_range option must be a string, integer, or list of integers.",
)
]
for part in value.split(","):
part = part.strip()
if not part:
continue
if "-" in part:
start_text, end_text = part.split("-", 1)
if not start_text.isdigit() or not end_text.isdigit():
diagnostics.append(_invalid_page_range(asset, value))
continue
start, end = int(start_text), int(end_text)
if start > end:
diagnostics.append(_invalid_page_range(asset, value))
continue
selected.update(range(start, end + 1))
elif part.isdigit():
selected.add(int(part))
else:
diagnostics.append(_invalid_page_range(asset, value))
out_of_range = sorted(page for page in selected if page < 1 or page > page_count)
if out_of_range:
diagnostics.append(
_pdf_error(
asset,
"source.pdf.page_range_out_of_bounds",
"PDF page_range selects pages outside the document.",
details={"page_range": value, "page_count": page_count, "pages": out_of_range},
)
)
selected = {page for page in selected if 1 <= page <= page_count}
if not selected and not diagnostics:
diagnostics.append(_invalid_page_range(asset, value))
return selected, diagnostics
def _invalid_page_range(asset: SourceAsset, value: str) -> Diagnostic:
    """Build the error diagnostic for a ``page_range`` value that cannot be parsed."""
    offending = {"page_range": value}
    return _pdf_error(
        asset,
        "source.pdf.invalid_page_range",
        "PDF page_range option is invalid.",
        details=offending,
    )
def _page_may_be_image_only(
page_body: bytes,
objects: dict[int, bytes],
content_ids: list[int],
) -> bool:
haystack = page_body + b"\n" + b"\n".join(objects.get(ref, b"") for ref in content_ids)
return bool(re.search(rb"/Subtype\s*/Image\b|\bDo\b", haystack))
def _normalize_markdown(text: str, normalize_whitespace: bool) -> str:
lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")
if normalize_whitespace:
lines = [re.sub(r"[ \t]+", " ", line).strip() for line in lines]
else:
lines = [line.rstrip() for line in lines]
cleaned: list[str] = []
blank = False
for line in lines:
if not line:
if not blank and cleaned:
cleaned.append("")
blank = True
continue
cleaned.append(line)
blank = False
return "\n".join(cleaned).strip()
def _document_id(asset: SourceAsset, metadata: SourceMetadata) -> str:
title = metadata.title
if title:
slug = re.sub(r"[^a-z0-9._-]+", "-", title.lower()).strip("-")
if slug:
return f"source.pdf:{slug}"
if asset.digest:
return f"source.pdf:{asset.digest.removeprefix('sha256:')}"
return f"source.pdf:{asset.name or asset.uri}"
def _adapter_info(options: dict[str, Any]) -> dict[str, Any]:
return {
"id": "source.pdf",
"version": "1",
"options": options,
}
def _confidence(package: PdfPackage, diagnostics: list[Diagnostic]) -> float:
    """Score how much of the document yielded text, between 0.0 and 0.75.

    Errors or a document without pages score 0.0; otherwise the score is
    0.75 times the fraction of pages with text, but never below 0.1.
    """
    if has_error(diagnostics) or not package.pages:
        return 0.0
    pages_with_text = len([page for page in package.pages if page.text.strip()])
    coverage = pages_with_text / len(package.pages)
    return max(0.1, 0.75 * coverage)
def _warning(
    asset: SourceAsset,
    code: str,
    message: str,
    *,
    details: dict[str, Any] | None = None,
) -> Diagnostic:
    """Create a warning-severity diagnostic attributed to ``asset``.

    The source location is only set when the asset has a path; missing
    ``details`` become an empty dict.
    """
    location = None
    if asset.path:
        location = SourceLocation(path=asset.path)
    return Diagnostic(
        severity="warning",
        code=code,
        message=message,
        source=location,
        details=details if details else {},
    )
def _pdf_error(
    asset: SourceAsset,
    code: str,
    message: str,
    *,
    details: dict[str, Any] | None = None,
) -> Diagnostic:
    """Create an error-severity diagnostic attributed to ``asset``.

    The source location is only set when the asset has a path; missing
    ``details`` become an empty dict.
    """
    location = None
    if asset.path:
        location = SourceLocation(path=asset.path)
    return Diagnostic(
        severity="error",
        code=code,
        message=message,
        source=location,
        details=details if details else {},
    )
def _malformed(
    asset: SourceAsset,
    message: str,
    details: dict[str, Any] | None = None,
) -> Diagnostic:
    """Create an error diagnostic with the generic ``source.malformed`` code."""
    code = "source.malformed"
    return _pdf_error(asset, code, message, details=details)
def _warning_count(diagnostics: list[Diagnostic]) -> int:
return sum(1 for diagnostic in diagnostics if diagnostic.severity == "warning")
def _clean_text(text: str) -> str:
cleaned = re.sub(r"\s+", " ", text).strip()
return re.sub(r"\s+([.,;:!?])", r"\1", cleaned)