From 0c9a418e85d37dcb9971b186e8ba034f6dbc95e3 Mon Sep 17 00:00:00 2001 From: tegwick Date: Thu, 14 May 2026 23:33:31 +0200 Subject: [PATCH] feat(source): add pdf read adapter --- README.md | 12 +- docs/pdf-adapter.md | 45 ++ pyproject.toml | 2 + src/markitect_filter/__init__.py | 4 +- src/markitect_filter/adapters.py | 56 ++ src/markitect_filter/pdf.py | 782 +++++++++++++++++++++ tests/test_pdf_adapter.py | 242 +++++++ workplans/MKTF-WP-0002-pdf-read-adapter.md | 46 +- 8 files changed, 1176 insertions(+), 13 deletions(-) create mode 100644 docs/pdf-adapter.md create mode 100644 src/markitect_filter/pdf.py create mode 100644 tests/test_pdf_adapter.py diff --git a/README.md b/README.md index f9370e5..a78f0ba 100644 --- a/README.md +++ b/README.md @@ -3,8 +3,11 @@ `markitect-filter` provides concrete source-format adapters for converting external document formats into canonical Markitect Markdown representations. -The first adapter is `source.epub3`, a read-only EPUB3 adapter that implements -the `markitect-tool` source adapter contract. +The first adapters are read-only source adapters that implement the +`markitect-tool` source adapter contract: + +- `source.epub3` for EPUB3 packages +- `source.pdf` for digitally-readable PDFs ## Development @@ -19,4 +22,9 @@ The EPUB3 adapter is registered through: ```toml [project.entry-points."markitect_tool.source_adapters"] epub3 = "markitect_filter.adapters:epub3_adapter_descriptor" +pdf = "markitect_filter.adapters:pdf_adapter_descriptor" ``` + +The first PDF slice is stdlib-only and targets deterministic text extraction +from local, digitally-readable PDFs. OCR, scanned-document recognition, and +layout-perfect reconstruction are intentionally deferred. diff --git a/docs/pdf-adapter.md b/docs/pdf-adapter.md new file mode 100644 index 0000000..4e96052 --- /dev/null +++ b/docs/pdf-adapter.md @@ -0,0 +1,45 @@ +# PDF Adapter + +`source.pdf` is a read-only Markitect source adapter for local, +digitally-readable PDF files. 
+ +## Dependency Policy + +The first implementation is stdlib-only. The `pdf` optional dependency extra is +present so a richer pure-Python backend can be added later without changing the +adapter boundary or making PDF support mandatory for EPUB3 users. + +The adapter does not use network access, external processes, OCR engines, +native system services, or renderer-specific tooling. + +## Supported Inputs + +- Local files with media type `application/pdf` or extension `.pdf`. +- PDFs with extractable text in page content streams. +- Plain and FlateDecode content streams for the first deterministic slice. + +## Deferred Inputs + +- Scanned or image-only PDFs that require OCR. +- Encrypted or permission-restricted PDFs. +- Pixel-perfect layout reconstruction. +- Table, figure, annotation, form, signature, and attachment extraction. +- PDF writing/export. + +## Options + +- `page_range`: optional 1-based page range such as `1-3,5`. +- `include_page_breaks`: when true, prefixes each page segment with a Markdown + page marker comment. +- `normalize_whitespace`: when true, collapses repeated horizontal whitespace + while preserving extracted line breaks. + +## Provenance And Quality + +The adapter emits one segment per extracted page. Each segment carries +page-level `SourceProvenance` with the source path, source digest, page number, +and originating PDF page object id. + +Quality metadata records the extraction backend, document page count, selected +pages, extracted page count, page coverage, skipped pages, warning count, +lossiness, and confidence. 
diff --git a/pyproject.toml b/pyproject.toml index 74b1104..b153b54 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -18,9 +18,11 @@ dev = [ "pytest>=8", ] epub3 = [] +pdf = [] [project.entry-points."markitect_tool.source_adapters"] epub3 = "markitect_filter.adapters:epub3_adapter_descriptor" +pdf = "markitect_filter.adapters:pdf_adapter_descriptor" [tool.setuptools.packages.find] where = ["src"] diff --git a/src/markitect_filter/__init__.py b/src/markitect_filter/__init__.py index 3ee7b74..550b48b 100644 --- a/src/markitect_filter/__init__.py +++ b/src/markitect_filter/__init__.py @@ -1,5 +1,5 @@ """Concrete source-format adapters for Markitect.""" -from markitect_filter.adapters import epub3_adapter_descriptor +from markitect_filter.adapters import epub3_adapter_descriptor, pdf_adapter_descriptor -__all__ = ["epub3_adapter_descriptor"] +__all__ = ["epub3_adapter_descriptor", "pdf_adapter_descriptor"] diff --git a/src/markitect_filter/adapters.py b/src/markitect_filter/adapters.py index 29e236c..940b27c 100644 --- a/src/markitect_filter/adapters.py +++ b/src/markitect_filter/adapters.py @@ -49,3 +49,59 @@ def epub3_adapter_descriptor() -> SourceAdapterDescriptor: "dependency_profile": "stdlib", }, ) + + +def pdf_adapter_descriptor() -> SourceAdapterDescriptor: + """Return the lightweight PDF read adapter descriptor.""" + + def factory(): + from markitect_filter.pdf import PdfReadAdapter + + return PdfReadAdapter() + + return SourceAdapterDescriptor( + id="source.pdf", + version="1", + name="PDF", + summary="Read digitally-readable PDFs into canonical Markitect Markdown.", + operations=["read"], + media_types=["application/pdf"], + extensions=[".pdf"], + factory=factory, + option_schema={ + "type": "object", + "properties": { + "page_range": { + "type": "string", + "description": "Optional 1-based page range such as `1-3,5`.", + }, + "include_page_breaks": { + "type": "boolean", + "default": False, + "description": "Prefix each page segment with a Markdown 
comment page marker.", + }, + "normalize_whitespace": { + "type": "boolean", + "default": True, + "description": "Collapse repeated horizontal whitespace while preserving extracted line breaks.", + }, + }, + "additionalProperties": False, + }, + safety={ + "reads_files": True, + "writes_files": False, + "network": False, + "external_process": False, + }, + quality_profile={ + "text_extraction": "stdlib-pdf-text", + "images": "diagnostic-only", + "styles": "ignored", + "tables": "plain-text-only", + }, + metadata={ + "format": "PDF", + "dependency_profile": "stdlib", + }, + ) diff --git a/src/markitect_filter/pdf.py b/src/markitect_filter/pdf.py new file mode 100644 index 0000000..e1af289 --- /dev/null +++ b/src/markitect_filter/pdf.py @@ -0,0 +1,782 @@ +"""PDF read adapter implementation.""" + +from __future__ import annotations + +import re +import zlib +from dataclasses import dataclass +from pathlib import Path +from typing import Any + +from markitect_tool.diagnostics import Diagnostic, SourceLocation, has_error +from markitect_tool.source import ( + NormalizationQuality, + NormalizedMarkdownDocument, + NormalizedMarkdownSegment, + SourceAdapterMatch, + SourceAdapterMatchRequest, + SourceAsset, + SourceInspectRequest, + SourceInspectResult, + SourceMetadata, + SourceProvenance, + SourceReadRequest, + SourceReadResult, + normalization_cache_key, +) + +from markitect_filter.adapters import pdf_adapter_descriptor + + +PDF_HEADER_RE = re.compile(rb"%PDF-\d\.\d") +OBJECT_RE = re.compile(rb"(\d+)\s+(\d+)\s+obj\b(.*?)\bendobj", re.DOTALL) +STREAM_RE = re.compile(rb"stream\r?\n(.*?)\r?\nendstream", re.DOTALL) +PAGE_TYPE_RE = re.compile(rb"/Type\s*/Page\b") +PAGES_TYPE_RE = re.compile(rb"/Type\s*/Pages\b") +REF_RE = re.compile(rb"(\d+)\s+\d+\s+R") +INFO_REF_RE = re.compile(rb"/Info\s+(\d+)\s+\d+\s+R") +COUNT_RE = re.compile(rb"/Count\s+(\d+)") + + +@dataclass(frozen=True) +class PdfPage: + number: int + object_id: int + text: str + diagnostics: list[Diagnostic] + + 
+@dataclass(frozen=True) +class PdfPackage: + metadata: SourceMetadata + page_count: int + encrypted: bool + pages: list[PdfPage] + diagnostics: list[Diagnostic] + + +class PdfReadAdapter: + """Read digitally-readable PDFs into normalized Markitect Markdown.""" + + def __init__(self) -> None: + self.descriptor = pdf_adapter_descriptor() + + def can_read(self, request: SourceAdapterMatchRequest) -> SourceAdapterMatch: + asset = request.asset + if asset.media_type == "application/pdf": + return SourceAdapterMatch( + adapter_id=self.descriptor.id, + matched=True, + confidence=100, + reason="media_type", + ) + if asset.extension == ".pdf": + return SourceAdapterMatch( + adapter_id=self.descriptor.id, + matched=True, + confidence=80, + reason="extension", + ) + return SourceAdapterMatch( + adapter_id=self.descriptor.id, + matched=False, + confidence=0, + reason="unsupported", + ) + + def inspect(self, request: SourceInspectRequest) -> SourceInspectResult: + package = _load_pdf(request.asset) + diagnostics = package.diagnostics + extracted_pages = sum(1 for page in package.pages if page.text.strip()) + return SourceInspectResult( + asset=request.asset, + adapter=_adapter_info(request.options), + metadata=package.metadata, + capabilities=["read"], + quality=NormalizationQuality( + lossiness="unknown" if has_error(diagnostics) else "medium", + confidence=_confidence(package, diagnostics), + warnings=_warning_count(diagnostics), + metadata={ + "extraction": "pdf-stdlib-text", + "page_count": package.page_count, + "pages_with_text": extracted_pages, + "encrypted": package.encrypted, + }, + ), + diagnostics=diagnostics, + valid=not has_error(diagnostics), + ) + + def read(self, request: SourceReadRequest) -> SourceReadResult: + package = _load_pdf(request.asset) + if has_error(package.diagnostics): + return SourceReadResult(diagnostics=package.diagnostics, valid=False) + + selected_pages, page_range_diagnostics = _select_pages( + package.pages, + 
request.options.get("page_range"), + request.asset, + ) + diagnostics = [*package.diagnostics, *page_range_diagnostics] + if has_error(diagnostics): + return SourceReadResult(diagnostics=diagnostics, valid=False) + + normalize_whitespace = bool(request.options.get("normalize_whitespace", True)) + include_page_breaks = bool(request.options.get("include_page_breaks", False)) + segments: list[NormalizedMarkdownSegment] = [] + skipped_pages = 0 + for page in selected_pages: + page_text = _normalize_markdown(page.text, normalize_whitespace) + diagnostics.extend(page.diagnostics) + if not page_text: + skipped_pages += 1 + diagnostics.append( + _warning( + request.asset, + "source.pdf.empty_page", + f"PDF page {page.number} did not produce extractable text.", + details={"page": page.number}, + ) + ) + continue + markdown = ( + f"\n\n{page_text}" + if include_page_breaks + else page_text + ) + segments.append( + NormalizedMarkdownSegment( + segment_id=f"page-{page.number:04d}", + order=len(segments), + markdown=markdown, + provenance=[ + SourceProvenance( + source_uri=request.asset.uri, + source_path=request.asset.path, + page=str(page.number), + digest=request.asset.digest, + metadata={"pdf_object": page.object_id}, + ) + ], + metadata={"page": page.number, "pdf_object": page.object_id}, + ) + ) + + if not segments: + diagnostics.append( + _pdf_error( + request.asset, + "source.pdf.no_extractable_text", + "PDF did not produce any extractable Markdown text.", + details={"page_count": package.page_count}, + ) + ) + return SourceReadResult(diagnostics=diagnostics, valid=False) + + markdown = "\n\n".join(segment.markdown.strip() for segment in segments) + page_coverage = len(segments) / max(len(selected_pages), 1) + warning_count = _warning_count(diagnostics) + quality = NormalizationQuality( + lossiness="medium" if warning_count else "low", + confidence=max(0.1, 0.75 * page_coverage) if not has_error(diagnostics) else 0.0, + skipped_items=skipped_pages, + 
warnings=warning_count, + metadata={ + "extraction": "pdf-stdlib-text", + "page_count": package.page_count, + "selected_pages": [page.number for page in selected_pages], + "pages_extracted": len(segments), + "page_coverage": page_coverage, + }, + ) + document = NormalizedMarkdownDocument( + document_id=_document_id(request.asset, package.metadata), + asset=request.asset, + metadata=package.metadata, + markdown=markdown, + segments=segments, + quality=quality, + diagnostics=diagnostics, + provenance=[ + SourceProvenance( + source_uri=request.asset.uri, + source_path=request.asset.path, + digest=request.asset.digest, + metadata={"page_count": package.page_count}, + ) + ], + adapter=_adapter_info(request.options), + cache_key=normalization_cache_key( + asset=request.asset, + adapter_id=self.descriptor.id, + adapter_version=self.descriptor.version, + options=request.options, + ), + ) + return SourceReadResult(document=document, diagnostics=[], valid=not has_error(diagnostics)) + + +def _load_pdf(asset: SourceAsset) -> PdfPackage: + diagnostics: list[Diagnostic] = [] + try: + data = Path(asset.path or asset.uri).read_bytes() + except OSError as exc: + return PdfPackage( + metadata=SourceMetadata(), + page_count=0, + encrypted=False, + pages=[], + diagnostics=[ + _pdf_error( + asset, + "source.pdf.unreadable", + "PDF file could not be read.", + details={"error": str(exc)}, + ) + ], + ) + + if not PDF_HEADER_RE.match(data.lstrip()): + return PdfPackage( + metadata=SourceMetadata(), + page_count=0, + encrypted=False, + pages=[], + diagnostics=[_malformed(asset, "PDF does not start with a PDF header.")], + ) + + objects = _parse_objects(data) + encrypted = bool(re.search(rb"/Encrypt\b", data)) + metadata = _extract_metadata(data, objects, encrypted) + if encrypted: + return PdfPackage( + metadata=metadata, + page_count=_page_count(objects), + encrypted=True, + pages=[], + diagnostics=[ + _pdf_error( + asset, + "source.pdf.encrypted", + "PDF is encrypted or declares an 
encryption dictionary.", + ) + ], + ) + + page_ids = _page_object_ids(objects) + page_count = _page_count(objects) or len(page_ids) + pages: list[PdfPage] = [] + for page_number, object_id in enumerate(page_ids, start=1): + page_body = objects[object_id] + page_diagnostics: list[Diagnostic] = [] + content_ids = _content_refs(page_body) + text_parts: list[str] = [] + if not content_ids and STREAM_RE.search(page_body): + stream = _stream_data(page_body, asset, page_diagnostics) + if stream: + text_parts.append(_extract_stream_text(stream)) + for content_id in content_ids: + content_body = objects.get(content_id) + if content_body is None: + page_diagnostics.append( + _warning( + asset, + "source.pdf.missing_content_stream", + f"PDF page {page_number} references missing content object {content_id}.", + details={"page": page_number, "object_id": content_id}, + ) + ) + continue + stream = _stream_data(content_body, asset, page_diagnostics) + if stream: + text_parts.append(_extract_stream_text(stream)) + text = "\n".join(part for part in text_parts if part.strip()).strip() + if _page_may_be_image_only(page_body, objects, content_ids) and not text: + page_diagnostics.append( + _warning( + asset, + "source.pdf.image_only_page", + f"PDF page {page_number} appears to contain image content without extractable text.", + details={"page": page_number}, + ) + ) + pages.append(PdfPage(page_number, object_id, text, page_diagnostics)) + + if not page_ids: + diagnostics.append(_malformed(asset, "PDF does not declare any page objects.")) + + return PdfPackage( + metadata=metadata, + page_count=page_count, + encrypted=False, + pages=pages, + diagnostics=diagnostics, + ) + + +def _parse_objects(data: bytes) -> dict[int, bytes]: + return {int(match.group(1)): match.group(3) for match in OBJECT_RE.finditer(data)} + + +def _page_object_ids(objects: dict[int, bytes]) -> list[int]: + return [ + object_id + for object_id, body in sorted(objects.items()) + if PAGE_TYPE_RE.search(body) and not 
PAGES_TYPE_RE.search(body) + ] + + +def _page_count(objects: dict[int, bytes]) -> int: + counts = [ + int(match.group(1)) + for body in objects.values() + if PAGES_TYPE_RE.search(body) + for match in [COUNT_RE.search(body)] + if match is not None + ] + return max(counts) if counts else 0 + + +def _content_refs(page_body: bytes) -> list[int]: + refs: list[int] = [] + array_match = re.search(rb"/Contents\s*\[(.*?)\]", page_body, re.DOTALL) + if array_match: + refs.extend(int(match.group(1)) for match in REF_RE.finditer(array_match.group(1))) + direct_match = re.search(rb"/Contents\s+(\d+)\s+\d+\s+R", page_body) + if direct_match: + ref = int(direct_match.group(1)) + if ref not in refs: + refs.append(ref) + return refs + + +def _stream_data( + object_body: bytes, + asset: SourceAsset, + diagnostics: list[Diagnostic], +) -> bytes: + match = STREAM_RE.search(object_body) + if match is None: + diagnostics.append( + _warning( + asset, + "source.pdf.missing_stream", + "PDF content object does not contain a readable stream.", + ) + ) + return b"" + stream = match.group(1) + if b"/FlateDecode" in object_body: + try: + return zlib.decompress(stream) + except zlib.error as exc: + diagnostics.append( + _warning( + asset, + "source.pdf.flate_decode_failed", + "PDF FlateDecode stream could not be decompressed.", + details={"error": str(exc)}, + ) + ) + return b"" + return stream + + +def _extract_metadata( + data: bytes, + objects: dict[int, bytes], + encrypted: bool, +) -> SourceMetadata: + info_body = b"" + info_ref = INFO_REF_RE.search(data) + if info_ref: + info_body = objects.get(int(info_ref.group(1)), b"") + if not info_body: + info_body = data + + title = _metadata_value(info_body, "Title") + author = _metadata_value(info_body, "Author") + creation_date = _metadata_value(info_body, "CreationDate") + modification_date = _metadata_value(info_body, "ModDate") + raw = { + key: value + for key, value in { + "subject": _metadata_value(info_body, "Subject"), + "keywords": 
_metadata_value(info_body, "Keywords"), + "producer": _metadata_value(info_body, "Producer"), + "creator": _metadata_value(info_body, "Creator"), + "creation_date": creation_date, + "modification_date": modification_date, + "encrypted": encrypted, + }.items() + if value not in (None, "") + } + return SourceMetadata( + title=title, + creators=[author] if author else [], + publication_date=creation_date, + raw=raw, + ) + + +def _metadata_value(body: bytes, key: str) -> str | None: + key_bytes = f"/{key}".encode("ascii") + index = body.find(key_bytes) + if index < 0: + return None + index += len(key_bytes) + while index < len(body) and body[index] in b" \t\r\n": + index += 1 + if index >= len(body): + return None + if body[index:index + 1] == b"(": + value, _ = _read_literal_string(body, index) + return _clean_text(value) + if body[index:index + 1] == b"<" and body[index:index + 2] != b"<<": + end = body.find(b">", index + 1) + if end > index: + return _clean_text(_decode_hex(body[index + 1:end])) + return None + + +def _extract_stream_text(stream: bytes) -> str: + tokens = _pdf_tokens(stream) + lines = [""] + last_array_start: int | None = None + for index, token in enumerate(tokens): + if token == "[": + last_array_start = index + continue + if token == "Tj": + _append_text(lines, _previous_text(tokens, index)) + elif token == "TJ": + if last_array_start is not None: + text = "".join( + item[1] + for item in tokens[last_array_start:index] + if isinstance(item, tuple) and item[0] == "text" + ) + _append_text(lines, text) + last_array_start = None + elif token == "'": + _new_line(lines) + _append_text(lines, _previous_text(tokens, index)) + elif token == '"': + _new_line(lines) + _append_text(lines, _previous_text(tokens, index)) + elif token in {"T*", "TD"}: + _new_line(lines) + elif token == "Td" and _td_moves_to_new_line(tokens, index): + _new_line(lines) + return "\n".join(line.rstrip() for line in lines).strip() + + +def _pdf_tokens(stream: bytes) -> list[str | 
tuple[str, str]]: + tokens: list[str | tuple[str, str]] = [] + index = 0 + while index < len(stream): + char = stream[index:index + 1] + if char in b" \t\r\n\f\x00": + index += 1 + continue + if char == b"%": + newline = stream.find(b"\n", index) + index = len(stream) if newline < 0 else newline + 1 + continue + if char == b"(": + text, index = _read_literal_string(stream, index) + tokens.append(("text", text)) + continue + if char == b"<" and stream[index:index + 2] != b"<<": + end = stream.find(b">", index + 1) + if end < 0: + break + tokens.append(("text", _decode_hex(stream[index + 1:end]))) + index = end + 1 + continue + if char in b"[]": + tokens.append(char.decode("ascii")) + index += 1 + continue + end = index + while end < len(stream) and stream[end:end + 1] not in b" \t\r\n\f\x00[]()<>/%": + end += 1 + if end == index: + index += 1 + continue + tokens.append(stream[index:end].decode("latin-1", errors="ignore")) + index = end + return tokens + + +def _read_literal_string(data: bytes, start: int) -> tuple[str, int]: + depth = 1 + index = start + 1 + output = bytearray() + while index < len(data) and depth > 0: + char = data[index] + if char == 0x5C: + escaped, index = _read_escape(data, index + 1) + output.extend(escaped) + continue + if char == 0x28: + depth += 1 + output.append(char) + elif char == 0x29: + depth -= 1 + if depth: + output.append(char) + else: + output.append(char) + index += 1 + return output.decode("utf-8", errors="replace"), index + + +def _read_escape(data: bytes, index: int) -> tuple[bytes, int]: + if index >= len(data): + return b"\\", index + char = data[index] + escapes = { + ord("n"): b"\n", + ord("r"): b"\r", + ord("t"): b"\t", + ord("b"): b"\b", + ord("f"): b"\f", + ord("("): b"(", + ord(")"): b")", + ord("\\"): b"\\", + } + if char in escapes: + return escapes[char], index + 1 + if char in b"\r\n": + if char == ord("\r") and index + 1 < len(data) and data[index + 1] == ord("\n"): + return b"", index + 2 + return b"", index + 1 + 
if 0x30 <= char <= 0x37: + end = index + 1 + while end < min(index + 3, len(data)) and 0x30 <= data[end] <= 0x37: + end += 1 + return bytes([int(data[index:end], 8) & 0xFF]), end + return bytes([char]), index + 1 + + +def _decode_hex(value: bytes) -> str: + cleaned = re.sub(rb"\s+", b"", value) + if len(cleaned) % 2: + cleaned += b"0" + try: + return bytes.fromhex(cleaned.decode("ascii")).decode("utf-8", errors="replace") + except ValueError: + return "" + + +def _previous_text(tokens: list[str | tuple[str, str]], index: int) -> str: + cursor = index - 1 + while cursor >= 0: + token = tokens[cursor] + if isinstance(token, tuple) and token[0] == "text": + return token[1] + if token in {"Tj", "TJ", "'", '"', "T*", "Td", "TD"}: + break + cursor -= 1 + return "" + + +def _td_moves_to_new_line(tokens: list[str | tuple[str, str]], index: int) -> bool: + if index < 2: + return False + y = tokens[index - 1] + try: + return isinstance(y, str) and float(y) < 0 + except ValueError: + return False + + +def _append_text(lines: list[str], text: str) -> None: + if text: + lines[-1] += text + + +def _new_line(lines: list[str]) -> None: + if lines[-1]: + lines.append("") + + +def _select_pages( + pages: list[PdfPage], + page_range: Any, + asset: SourceAsset, +) -> tuple[list[PdfPage], list[Diagnostic]]: + if page_range in (None, ""): + return pages, [] + selected_numbers, diagnostics = _parse_page_range(page_range, len(pages), asset) + selected = [page for page in pages if page.number in selected_numbers] + return selected, diagnostics + + +def _parse_page_range( + value: Any, + page_count: int, + asset: SourceAsset, +) -> tuple[set[int], list[Diagnostic]]: + diagnostics: list[Diagnostic] = [] + selected: set[int] = set() + if isinstance(value, int): + value = str(value) + if isinstance(value, (list, tuple)): + value = ",".join(str(item) for item in value) + if not isinstance(value, str): + return set(), [ + _pdf_error( + asset, + "source.pdf.invalid_page_range", + "PDF page_range option 
must be a string, integer, or list of integers.", + ) + ] + for part in value.split(","): + part = part.strip() + if not part: + continue + if "-" in part: + start_text, end_text = part.split("-", 1) + if not start_text.isdigit() or not end_text.isdigit(): + diagnostics.append(_invalid_page_range(asset, value)) + continue + start, end = int(start_text), int(end_text) + if start > end: + diagnostics.append(_invalid_page_range(asset, value)) + continue + selected.update(range(start, end + 1)) + elif part.isdigit(): + selected.add(int(part)) + else: + diagnostics.append(_invalid_page_range(asset, value)) + out_of_range = sorted(page for page in selected if page < 1 or page > page_count) + if out_of_range: + diagnostics.append( + _pdf_error( + asset, + "source.pdf.page_range_out_of_bounds", + "PDF page_range selects pages outside the document.", + details={"page_range": value, "page_count": page_count, "pages": out_of_range}, + ) + ) + selected = {page for page in selected if 1 <= page <= page_count} + if not selected and not diagnostics: + diagnostics.append(_invalid_page_range(asset, value)) + return selected, diagnostics + + +def _invalid_page_range(asset: SourceAsset, value: str) -> Diagnostic: + return _pdf_error( + asset, + "source.pdf.invalid_page_range", + "PDF page_range option is invalid.", + details={"page_range": value}, + ) + + +def _page_may_be_image_only( + page_body: bytes, + objects: dict[int, bytes], + content_ids: list[int], +) -> bool: + haystack = page_body + b"\n" + b"\n".join(objects.get(ref, b"") for ref in content_ids) + return bool(re.search(rb"/Subtype\s*/Image\b|\bDo\b", haystack)) + + +def _normalize_markdown(text: str, normalize_whitespace: bool) -> str: + lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n") + if normalize_whitespace: + lines = [re.sub(r"[ \t]+", " ", line).strip() for line in lines] + else: + lines = [line.rstrip() for line in lines] + cleaned: list[str] = [] + blank = False + for line in lines: + if not 
line: + if not blank and cleaned: + cleaned.append("") + blank = True + continue + cleaned.append(line) + blank = False + return "\n".join(cleaned).strip() + + +def _document_id(asset: SourceAsset, metadata: SourceMetadata) -> str: + title = metadata.title + if title: + slug = re.sub(r"[^a-z0-9._-]+", "-", title.lower()).strip("-") + if slug: + return f"source.pdf:{slug}" + if asset.digest: + return f"source.pdf:{asset.digest.removeprefix('sha256:')}" + return f"source.pdf:{asset.name or asset.uri}" + + +def _adapter_info(options: dict[str, Any]) -> dict[str, Any]: + return { + "id": "source.pdf", + "version": "1", + "options": options, + } + + +def _confidence(package: PdfPackage, diagnostics: list[Diagnostic]) -> float: + if has_error(diagnostics): + return 0.0 + if not package.pages: + return 0.0 + coverage = sum(1 for page in package.pages if page.text.strip()) / len(package.pages) + return max(0.1, 0.75 * coverage) + + +def _warning( + asset: SourceAsset, + code: str, + message: str, + *, + details: dict[str, Any] | None = None, +) -> Diagnostic: + return Diagnostic( + severity="warning", + code=code, + message=message, + source=SourceLocation(path=asset.path) if asset.path else None, + details=details or {}, + ) + + +def _pdf_error( + asset: SourceAsset, + code: str, + message: str, + *, + details: dict[str, Any] | None = None, +) -> Diagnostic: + return Diagnostic( + severity="error", + code=code, + message=message, + source=SourceLocation(path=asset.path) if asset.path else None, + details=details or {}, + ) + + +def _malformed( + asset: SourceAsset, + message: str, + details: dict[str, Any] | None = None, +) -> Diagnostic: + return _pdf_error(asset, "source.malformed", message, details=details) + + +def _warning_count(diagnostics: list[Diagnostic]) -> int: + return sum(1 for diagnostic in diagnostics if diagnostic.severity == "warning") + + +def _clean_text(text: str) -> str: + cleaned = re.sub(r"\s+", " ", text).strip() + return re.sub(r"\s+([.,;:!?])", 
r"\1", cleaned) diff --git a/tests/test_pdf_adapter.py b/tests/test_pdf_adapter.py new file mode 100644 index 0000000..0f0129c --- /dev/null +++ b/tests/test_pdf_adapter.py @@ -0,0 +1,242 @@ +from pathlib import Path + +from markitect_tool.source import ( + SourceAdapterMatchRequest, + SourceAdapterRegistry, + SourceAsset, + SourceInspectRequest, + SourceReadRequest, + discover_source_adapters, + inspect_source, + normalize_source, +) + +from markitect_filter.adapters import pdf_adapter_descriptor + + +class FakeEntryPoint: + name = "pdf" + + def load(self): + return pdf_adapter_descriptor + + +def test_pdf_descriptor_matches_contract(): + descriptor = pdf_adapter_descriptor() + + assert descriptor.id == "source.pdf" + assert descriptor.operations == ["read"] + assert descriptor.media_types == ["application/pdf"] + assert descriptor.extensions == [".pdf"] + assert descriptor.safety["network"] is False + assert descriptor.safety["external_process"] is False + assert descriptor.option_schema["properties"]["include_page_breaks"]["default"] is False + assert descriptor.metadata["dependency_profile"] == "stdlib" + + +def test_pdf_adapter_matches_pdf_assets(tmp_path: Path): + pdf_path = _write_pdf(tmp_path) + asset = SourceAsset.from_path(pdf_path, media_type="application/pdf") + adapter = pdf_adapter_descriptor().instantiate() + + match = adapter.can_read(SourceAdapterMatchRequest(asset=asset)) + + assert match.matched + assert match.confidence == 100 + + +def test_pdf_adapter_inspects_metadata(tmp_path: Path): + pdf_path = _write_pdf(tmp_path) + asset = SourceAsset.from_path(pdf_path, media_type="application/pdf") + adapter = pdf_adapter_descriptor().instantiate() + + result = adapter.inspect(SourceInspectRequest(asset=asset)) + + assert result.is_valid + assert result.metadata.title == "PDF Fixture" + assert result.metadata.creators == ["Ada Lovelace"] + assert result.metadata.publication_date == "D:20260514093000Z" + assert result.metadata.raw["subject"] == "Source 
Adapter Test" + assert result.quality.lossiness == "medium" + assert result.quality.metadata["page_count"] == 2 + assert result.quality.metadata["pages_with_text"] == 2 + + +def test_pdf_adapter_normalizes_pages_to_markdown(tmp_path: Path): + pdf_path = _write_pdf(tmp_path) + asset = SourceAsset.from_path(pdf_path, media_type="application/pdf") + adapter = pdf_adapter_descriptor().instantiate() + + result = adapter.read(SourceReadRequest(asset=asset)) + + assert result.is_valid + assert result.document is not None + assert result.document.document_id == "source.pdf:pdf-fixture" + assert result.document.markdown == "Hello PDF\nSecond line\n\nPage two text." + assert [segment.segment_id for segment in result.document.segments] == [ + "page-0001", + "page-0002", + ] + assert result.document.segments[0].provenance[0].page == "1" + assert result.document.quality.lossiness == "low" + assert result.document.quality.metadata["page_coverage"] == 1.0 + + +def test_pdf_adapter_applies_page_range_and_page_markers(tmp_path: Path): + pdf_path = _write_pdf(tmp_path) + asset = SourceAsset.from_path(pdf_path, media_type="application/pdf") + adapter = pdf_adapter_descriptor().instantiate() + + result = adapter.read( + SourceReadRequest( + asset=asset, + options={"page_range": "2", "include_page_breaks": True}, + ) + ) + + assert result.is_valid + assert result.document is not None + assert result.document.markdown == "\n\nPage two text." 
+ assert result.document.segments[0].metadata["page"] == 2 + + +def test_markitect_api_can_use_pdf_registry(tmp_path: Path): + pdf_path = _write_pdf(tmp_path) + registry = SourceAdapterRegistry([pdf_adapter_descriptor()]) + + inspected = inspect_source(pdf_path, registry=registry) + normalized = normalize_source(pdf_path, registry=registry) + + assert inspected.is_valid + assert inspected.metadata.title == "PDF Fixture" + assert normalized.is_valid + assert normalized.document is not None + assert normalized.document.segments[1].markdown == "Page two text." + + +def test_pdf_adapter_reports_malformed_pdf(tmp_path: Path): + pdf_path = tmp_path / "broken.pdf" + pdf_path.write_bytes(b"not a pdf") + asset = SourceAsset.from_path(pdf_path, media_type="application/pdf") + adapter = pdf_adapter_descriptor().instantiate() + + result = adapter.read(SourceReadRequest(asset=asset)) + + assert not result.is_valid + assert result.diagnostics[0].code == "source.malformed" + + +def test_pdf_adapter_reports_encrypted_pdf(tmp_path: Path): + pdf_path = _write_pdf(tmp_path, encrypted=True) + asset = SourceAsset.from_path(pdf_path, media_type="application/pdf") + adapter = pdf_adapter_descriptor().instantiate() + + result = adapter.read(SourceReadRequest(asset=asset)) + + assert not result.is_valid + assert result.diagnostics[0].code == "source.pdf.encrypted" + + +def test_pdf_entry_point_discovery_shape(): + registry = discover_source_adapters([FakeEntryPoint()]) + + assert registry.get("source.pdf").name == "PDF" + + +def _write_pdf(tmp_path: Path, *, encrypted: bool = False) -> Path: + pdf_path = tmp_path / ("encrypted.pdf" if encrypted else "fixture.pdf") + objects: list[tuple[int, bytes]] = [] + page_refs = [] + next_id = 3 + for page_number, lines in enumerate( + [ + ["Hello PDF", "Second line"], + ["Page two text."], + ], + start=1, + ): + page_id = next_id + content_id = next_id + 1 + next_id += 2 + page_refs.append(f"{page_id} 0 R") + stream = _page_stream(lines) + 
objects.append( + ( + page_id, + ( + f"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] " + f"/Resources << /Font << /F1 7 0 R >> >> /Contents {content_id} 0 R >>" + ).encode("ascii"), + ) + ) + objects.append( + ( + content_id, + b"<< /Length " + + str(len(stream)).encode("ascii") + + b" >>\nstream\n" + + stream + + b"\nendstream", + ) + ) + + objects.extend( + [ + (1, b"<< /Type /Catalog /Pages 2 0 R >>"), + ( + 2, + ( + f"<< /Type /Pages /Kids [{' '.join(page_refs)}] " + f"/Count {len(page_refs)} >>" + ).encode("ascii"), + ), + (7, b"<< /Type /Font /Subtype /Type1 /BaseFont /Helvetica >>"), + ( + 8, + b"<< /Title (PDF Fixture) /Author (Ada Lovelace) " + b"/Subject (Source Adapter Test) /Keywords (markitect pdf) " + b"/Producer (markitect-filter tests) /CreationDate (D:20260514093000Z) >>", + ), + ] + ) + if encrypted: + objects.append((9, b"<< /Filter /Standard /V 1 /R 2 >>")) + objects.sort(key=lambda item: item[0]) + + header = b"%PDF-1.4\n%\xe2\xe3\xcf\xd3\n" + content = bytearray(header) + max_id = max(object_id for object_id, _ in objects) + offsets = {0: 0} + for object_id, body in objects: + offsets[object_id] = len(content) + content.extend(f"{object_id} 0 obj\n".encode("ascii")) + content.extend(body) + content.extend(b"\nendobj\n") + + xref_offset = len(content) + content.extend(f"xref\n0 {max_id + 1}\n".encode("ascii")) + content.extend(b"0000000000 65535 f \n") + for object_id in range(1, max_id + 1): + content.extend(f"{offsets.get(object_id, 0):010d} 00000 n \n".encode("ascii")) + trailer = f"trailer\n<< /Size {max_id + 1} /Root 1 0 R /Info 8 0 R".encode("ascii") + if encrypted: + trailer += b" /Encrypt 9 0 R" + trailer += b" >>\n" + content.extend(trailer) + content.extend(f"startxref\n{xref_offset}\n%%EOF\n".encode("ascii")) + pdf_path.write_bytes(bytes(content)) + return pdf_path + + +def _page_stream(lines: list[str]) -> bytes: + parts = ["BT", "/F1 12 Tf", "72 720 Td"] + for index, line in enumerate(lines): + if index: + parts.append("T*") 
+ parts.append(f"({_pdf_literal(line)}) Tj") + parts.append("ET") + return "\n".join(parts).encode("ascii") + + +def _pdf_literal(text: str) -> str: + return text.replace("\\", "\\\\").replace("(", "\\(").replace(")", "\\)") diff --git a/workplans/MKTF-WP-0002-pdf-read-adapter.md b/workplans/MKTF-WP-0002-pdf-read-adapter.md index c3c6ba3..07e4f97 100644 --- a/workplans/MKTF-WP-0002-pdf-read-adapter.md +++ b/workplans/MKTF-WP-0002-pdf-read-adapter.md @@ -3,10 +3,10 @@ id: MKTF-WP-0002 type: workplan title: "PDF Read Adapter" domain: markitect -status: todo +status: done owner: markitect-filter topic_slug: markitect -planning_priority: P1 +planning_priority: complete planning_order: 20 depends_on_workplans: - MKTF-WP-0001 @@ -32,7 +32,7 @@ The first PDF slice should target deterministic text extraction from digitally-readable PDFs. It should preserve page-level provenance and make extraction uncertainty visible through diagnostics and quality signals. -## Planned Scope +## Implemented Scope - Optional PDF dependency profile isolated behind a `pdf` extra. - Entry point group registration: @@ -72,7 +72,7 @@ extraction uncertainty visible through diagnostics and quality signals. ```task id: MKTF-WP-0002-T001 -status: todo +status: done priority: high state_hub_task_id: "2ce51bb9-9182-4927-90d1-4c08433b5ddb" ``` @@ -91,11 +91,16 @@ The decision should document: Output: dependency decision, option contract, and implementation notes. +Implemented: `docs/pdf-adapter.md`, `pyproject.toml`, and the descriptor +metadata document a stdlib first slice, a reserved `pdf` extra, local +digitally-readable PDF support, page range/page marker/whitespace options, and +deferred OCR/layout-heavy backends. 
+ ## P2.2 - Add descriptor and entry point registration ```task id: MKTF-WP-0002-T002 -status: todo +status: done priority: high state_hub_task_id: "27d754a9-59ae-4419-946b-f1f847bd3b10" ``` @@ -116,11 +121,15 @@ The descriptor should define: Output: descriptor, entry point registration, and descriptor tests. +Implemented: `pdf_adapter_descriptor` is registered through +`markitect_tool.source_adapters`, exported from the package, and covered by +descriptor and discovery tests. + ## P2.3 - Implement PDF inspection ```task id: MKTF-WP-0002-T003 -status: todo +status: done priority: high state_hub_task_id: "33b594e6-d12a-46d5-bc50-6ec1aebaaf65" ``` @@ -138,11 +147,15 @@ Inspection should report: Output: inspection implementation and tests with small fixtures. +Implemented: `PdfReadAdapter.inspect` reports metadata, page count, +extractability signals, encryption status, quality metadata, and malformed or +encrypted diagnostics using deterministic generated fixtures. + ## P2.4 - Normalize page text into Markitect Markdown ```task id: MKTF-WP-0002-T004 -status: todo +status: done priority: high state_hub_task_id: "30c0c777-a4e4-43d1-ac24-6a0f84c7b761" ``` @@ -162,11 +175,15 @@ Normalization should: Output: read implementation and normalization tests. +Implemented: `PdfReadAdapter.read` extracts ordered page text into stable +page segments, applies page ranges, supports optional page markers, preserves +page provenance, and uses the Markitect cache-key helpers. + ## P2.5 - Add diagnostics and quality semantics ```task id: MKTF-WP-0002-T005 -status: todo +status: done priority: high state_hub_task_id: "8b6a190a-350b-4c61-ac4f-1900673a8cd2" ``` @@ -188,11 +205,17 @@ skipped pages, lossiness, and confidence. Output: diagnostic helpers, quality rules, and tests. +Implemented: PDF diagnostics cover malformed files, unreadable files, +encrypted PDFs, invalid page ranges, missing/empty streams, image-only pages, +empty extraction, and stream decompression failures. 
Quality metadata records +backend, page count, selected pages, extracted pages, coverage, warnings, and +skipped pages. + ## P2.6 - Add fixtures, docs, and validation ```task id: MKTF-WP-0002-T006 -status: todo +status: done priority: medium state_hub_task_id: "af597160-e189-42be-8479-c6e0f467d238" ``` @@ -211,6 +234,11 @@ Validation should cover: Output: tests, README update, and validation command. +Implemented: generated PDF fixtures and tests cover descriptor shape, matching, +metadata inspection, normalization, page ranges and page markers, malformed PDFs, +encrypted PDFs, registry use, entry point discovery, README documentation, and +the validation command below. + ## Validation Run from `markitect-filter`: