feat(source): add pdf read adapter

2026-05-14 23:33:31 +02:00
parent 24ee499b50
commit 0c9a418e85
8 changed files with 1176 additions and 13 deletions
--- a/src/markitect_filter/init.py
+++ b/src/markitect_filter/init.py
@@ -1,5 +1,5 @@
 """Concrete source-format adapters for Markitect."""

-from markitect_filter.adapters import epub3_adapter_descriptor
+from markitect_filter.adapters import epub3_adapter_descriptor, pdf_adapter_descriptor

-__all__ = ["epub3_adapter_descriptor"]
+__all__ = ["epub3_adapter_descriptor", "pdf_adapter_descriptor"]
--- a/src/markitect_filter/adapters.py
+++ b/src/markitect_filter/adapters.py
@@ -49,3 +49,59 @@ def epub3_adapter_descriptor() -> SourceAdapterDescriptor:
            "dependency_profile": "stdlib",
        },
    )
+
+
+def pdf_adapter_descriptor() -> SourceAdapterDescriptor:
+    """Build the descriptor that advertises the stdlib-only PDF read adapter.
+
+    The adapter class is imported inside ``factory`` rather than at module
+    level, so creating the descriptor does not import ``markitect_filter.pdf``.
+    """
+
+    def factory():
+        # Deferred import: resolved only when an adapter instance is requested.
+        from markitect_filter.pdf import PdfReadAdapter
+
+        return PdfReadAdapter()
+
+    # Options accepted by the adapter's ``read`` operation.
+    option_properties = {
+        "page_range": {
+            "type": "string",
+            "description": "Optional 1-based page range such as `1-3,5`.",
+        },
+        "include_page_breaks": {
+            "type": "boolean",
+            "default": False,
+            "description": "Prefix each page segment with a Markdown comment page marker.",
+        },
+        "normalize_whitespace": {
+            "type": "boolean",
+            "default": True,
+            "description": "Collapse repeated horizontal whitespace while preserving extracted line breaks.",
+        },
+    }
+    option_schema = {
+        "type": "object",
+        "properties": option_properties,
+        "additionalProperties": False,
+    }
+    # The adapter only reads local files.
+    safety = {
+        "reads_files": True,
+        "writes_files": False,
+        "network": False,
+        "external_process": False,
+    }
+    quality_profile = {
+        "text_extraction": "stdlib-pdf-text",
+        "images": "diagnostic-only",
+        "styles": "ignored",
+        "tables": "plain-text-only",
+    }
+    metadata = {
+        "format": "PDF",
+        "dependency_profile": "stdlib",
+    }
+
+    return SourceAdapterDescriptor(
+        id="source.pdf",
+        version="1",
+        name="PDF",
+        summary="Read digitally-readable PDFs into canonical Markitect Markdown.",
+        operations=["read"],
+        media_types=["application/pdf"],
+        extensions=[".pdf"],
+        factory=factory,
+        option_schema=option_schema,
+        safety=safety,
+        quality_profile=quality_profile,
+        metadata=metadata,
+    )
--- a/src/markitect_filter/pdf.py
+++ b/src/markitect_filter/pdf.py
@@ -0,0 +1,782 @@
+"""PDF read adapter implementation."""
+
+from __future__ import annotations
+
+import re
+import zlib
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+from markitect_tool.diagnostics import Diagnostic, SourceLocation, has_error
+from markitect_tool.source import (
+    NormalizationQuality,
+    NormalizedMarkdownDocument,
+    NormalizedMarkdownSegment,
+    SourceAdapterMatch,
+    SourceAdapterMatchRequest,
+    SourceAsset,
+    SourceInspectRequest,
+    SourceInspectResult,
+    SourceMetadata,
+    SourceProvenance,
+    SourceReadRequest,
+    SourceReadResult,
+    normalization_cache_key,
+)
+
+from markitect_filter.adapters import pdf_adapter_descriptor
+
+
+# ``%PDF-x.y`` version marker expected at the start of the file.
+PDF_HEADER_RE = re.compile(rb"%PDF-\d\.\d")
+# Indirect object ``N G obj ... endobj``: group 1 = object number,
+# group 2 = generation number, group 3 = the object body.
+OBJECT_RE = re.compile(rb"(\d+)\s+(\d+)\s+obj\b(.*?)\bendobj", re.DOTALL)
+# Raw bytes between the ``stream`` and ``endstream`` keywords (group 1).
+STREAM_RE = re.compile(rb"stream\r?\n(.*?)\r?\nendstream", re.DOTALL)
+# ``/Type /Page``; the trailing ``\b`` keeps it from matching ``/Pages``.
+PAGE_TYPE_RE = re.compile(rb"/Type\s*/Page\b")
+# ``/Type /Pages``.
+PAGES_TYPE_RE = re.compile(rb"/Type\s*/Pages\b")
+# Indirect reference ``N G R``; group 1 = referenced object number.
+REF_RE = re.compile(rb"(\d+)\s+\d+\s+R")
+# ``/Info N G R`` entry; group 1 = object number of the referenced object.
+INFO_REF_RE = re.compile(rb"/Info\s+(\d+)\s+\d+\s+R")
+# ``/Count N`` entry; group 1 = the declared count.
+COUNT_RE = re.compile(rb"/Count\s+(\d+)")
+
+
+@dataclass(frozen=True)
+class PdfPage:
+    """Text extracted for a single page object of a PDF."""
+
+    # 1-based page number assigned while loading the file.
+    number: int
+    # Object number of the page object the text was extracted from.
+    object_id: int
+    # Extracted text of the page; empty when nothing could be extracted.
+    text: str
+    # Diagnostics raised while reading this page's content streams.
+    diagnostics: list[Diagnostic]
+
+
+@dataclass(frozen=True)
+class PdfPackage:
+    """Result of loading a PDF file: metadata, pages and file-level diagnostics."""
+
+    # Metadata extracted from the file (empty when the file could not be parsed).
+    metadata: SourceMetadata
+    # Declared ``/Count`` of the page tree, or the number of page objects found.
+    page_count: int
+    # True when the file declares an ``/Encrypt`` entry.
+    encrypted: bool
+    # Extracted pages; empty for unreadable, malformed or encrypted files.
+    pages: list[PdfPage]
+    # File-level diagnostics (page-level ones live on each ``PdfPage``).
+    diagnostics: list[Diagnostic]
+
+
+class PdfReadAdapter:
+    """Read digitally-readable PDFs into normalized Markitect Markdown.
+
+    Exposes ``can_read`` (matching), ``inspect`` (metadata and quality summary)
+    and ``read`` (one Markdown segment per page that yields text).
+    """
+
+    def __init__(self) -> None:
+        # Kept so matches and cache keys can report the adapter id/version.
+        self.descriptor = pdf_adapter_descriptor()
+
+    def can_read(self, request: SourceAdapterMatchRequest) -> SourceAdapterMatch:
+        """Report whether this adapter claims the requested asset.
+
+        A ``application/pdf`` media type matches with confidence 100, a
+        ``.pdf`` extension with confidence 80; anything else is unmatched.
+        """
+        asset = request.asset
+        if asset.media_type == "application/pdf":
+            return SourceAdapterMatch(
+                adapter_id=self.descriptor.id,
+                matched=True,
+                confidence=100,
+                reason="media_type",
+            )
+        # NOTE(review): this comparison is case-sensitive; presumably
+        # asset.extension is already lower-cased by the caller — confirm.
+        if asset.extension == ".pdf":
+            return SourceAdapterMatch(
+                adapter_id=self.descriptor.id,
+                matched=True,
+                confidence=80,
+                reason="extension",
+            )
+        return SourceAdapterMatch(
+            adapter_id=self.descriptor.id,
+            matched=False,
+            confidence=0,
+            reason="unsupported",
+        )
+
+    def inspect(self, request: SourceInspectRequest) -> SourceInspectResult:
+        """Load the PDF and summarize its metadata, page counts and quality.
+
+        The result is marked invalid when loading produced an error diagnostic.
+        """
+        package = _load_pdf(request.asset)
+        diagnostics = package.diagnostics
+        # Pages whose extracted text is not just whitespace.
+        extracted_pages = sum(1 for page in package.pages if page.text.strip())
+        return SourceInspectResult(
+            asset=request.asset,
+            adapter=_adapter_info(request.options),
+            metadata=package.metadata,
+            capabilities=["read"],
+            quality=NormalizationQuality(
+                lossiness="unknown" if has_error(diagnostics) else "medium",
+                confidence=_confidence(package, diagnostics),
+                warnings=_warning_count(diagnostics),
+                metadata={
+                    "extraction": "pdf-stdlib-text",
+                    "page_count": package.page_count,
+                    "pages_with_text": extracted_pages,
+                    "encrypted": package.encrypted,
+                },
+            ),
+            diagnostics=diagnostics,
+            valid=not has_error(diagnostics),
+        )
+
+    def read(self, request: SourceReadRequest) -> SourceReadResult:
+        """Convert the PDF into a normalized Markdown document.
+
+        Honors the ``page_range``, ``normalize_whitespace`` and
+        ``include_page_breaks`` options. Returns an invalid result (no
+        document) when loading fails, the page range is invalid, or no
+        selected page yields text.
+        """
+        package = _load_pdf(request.asset)
+        if has_error(package.diagnostics):
+            return SourceReadResult(diagnostics=package.diagnostics, valid=False)
+
+        selected_pages, page_range_diagnostics = _select_pages(
+            package.pages,
+            request.options.get("page_range"),
+            request.asset,
+        )
+        # New list: the package's own diagnostics list is left untouched.
+        diagnostics = [*package.diagnostics, *page_range_diagnostics]
+        if has_error(diagnostics):
+            return SourceReadResult(diagnostics=diagnostics, valid=False)
+
+        normalize_whitespace = bool(request.options.get("normalize_whitespace", True))
+        include_page_breaks = bool(request.options.get("include_page_breaks", False))
+        segments: list[NormalizedMarkdownSegment] = []
+        skipped_pages = 0
+        for page in selected_pages:
+            page_text = _normalize_markdown(page.text, normalize_whitespace)
+            # Page-level diagnostics are reported before any empty-page warning.
+            diagnostics.extend(page.diagnostics)
+            if not page_text:
+                skipped_pages += 1
+                diagnostics.append(
+                    _warning(
+                        request.asset,
+                        "source.pdf.empty_page",
+                        f"PDF page {page.number} did not produce extractable text.",
+                        details={"page": page.number},
+                    )
+                )
+                continue
+            markdown = (
+                f"<!-- page: {page.number} -->\n\n{page_text}"
+                if include_page_breaks
+                else page_text
+            )
+            segments.append(
+                NormalizedMarkdownSegment(
+                    segment_id=f"page-{page.number:04d}",
+                    # Order counts emitted segments, so skipped pages leave no gap.
+                    order=len(segments),
+                    markdown=markdown,
+                    provenance=[
+                        SourceProvenance(
+                            source_uri=request.asset.uri,
+                            source_path=request.asset.path,
+                            page=str(page.number),
+                            digest=request.asset.digest,
+                            metadata={"pdf_object": page.object_id},
+                        )
+                    ],
+                    metadata={"page": page.number, "pdf_object": page.object_id},
+                )
+            )
+
+        if not segments:
+            diagnostics.append(
+                _pdf_error(
+                    request.asset,
+                    "source.pdf.no_extractable_text",
+                    "PDF did not produce any extractable Markdown text.",
+                    details={"page_count": package.page_count},
+                )
+            )
+            return SourceReadResult(diagnostics=diagnostics, valid=False)
+
+        markdown = "\n\n".join(segment.markdown.strip() for segment in segments)
+        # Fraction of the selected pages that produced a segment.
+        page_coverage = len(segments) / max(len(selected_pages), 1)
+        warning_count = _warning_count(diagnostics)
+        quality = NormalizationQuality(
+            lossiness="medium" if warning_count else "low",
+            confidence=max(0.1, 0.75 * page_coverage) if not has_error(diagnostics) else 0.0,
+            skipped_items=skipped_pages,
+            warnings=warning_count,
+            metadata={
+                "extraction": "pdf-stdlib-text",
+                "page_count": package.page_count,
+                "selected_pages": [page.number for page in selected_pages],
+                "pages_extracted": len(segments),
+                "page_coverage": page_coverage,
+            },
+        )
+        document = NormalizedMarkdownDocument(
+            document_id=_document_id(request.asset, package.metadata),
+            asset=request.asset,
+            metadata=package.metadata,
+            markdown=markdown,
+            segments=segments,
+            quality=quality,
+            diagnostics=diagnostics,
+            provenance=[
+                SourceProvenance(
+                    source_uri=request.asset.uri,
+                    source_path=request.asset.path,
+                    digest=request.asset.digest,
+                    metadata={"page_count": package.page_count},
+                )
+            ],
+            adapter=_adapter_info(request.options),
+            cache_key=normalization_cache_key(
+                asset=request.asset,
+                adapter_id=self.descriptor.id,
+                adapter_version=self.descriptor.version,
+                options=request.options,
+            ),
+        )
+        # NOTE(review): the success result carries an empty diagnostics list,
+        # while the failure paths above return the collected diagnostics. The
+        # warnings are still attached to ``document.diagnostics`` — confirm
+        # that dropping them at result level is intended.
+        return SourceReadResult(document=document, diagnostics=[], valid=not has_error(diagnostics))
+
+
+def _load_pdf(asset: SourceAsset) -> PdfPackage:
+    """Read the asset's bytes and parse them into a ``PdfPackage``.
+
+    Read failures, a missing PDF header, encryption and a missing page list
+    are reported as error diagnostics on the returned package rather than
+    raised. Problems found while reading an individual page are attached to
+    that page as warnings.
+    """
+    diagnostics: list[Diagnostic] = []
+    try:
+        # Prefer the local path; fall back to the URI when no path is set.
+        data = Path(asset.path or asset.uri).read_bytes()
+    except OSError as exc:
+        return PdfPackage(
+            metadata=SourceMetadata(),
+            page_count=0,
+            encrypted=False,
+            pages=[],
+            diagnostics=[
+                _pdf_error(
+                    asset,
+                    "source.pdf.unreadable",
+                    "PDF file could not be read.",
+                    details={"error": str(exc)},
+                )
+            ],
+        )
+
+    # Leading whitespace before the ``%PDF-`` marker is tolerated.
+    if not PDF_HEADER_RE.match(data.lstrip()):
+        return PdfPackage(
+            metadata=SourceMetadata(),
+            page_count=0,
+            encrypted=False,
+            pages=[],
+            diagnostics=[_malformed(asset, "PDF does not start with a PDF header.")],
+        )
+
+    objects = _parse_objects(data)
+    # Any ``/Encrypt`` token anywhere in the raw bytes marks the file encrypted.
+    encrypted = bool(re.search(rb"/Encrypt\b", data))
+    metadata = _extract_metadata(data, objects, encrypted)
+    if encrypted:
+        # Encrypted files are reported with metadata but without any pages.
+        return PdfPackage(
+            metadata=metadata,
+            page_count=_page_count(objects),
+            encrypted=True,
+            pages=[],
+            diagnostics=[
+                _pdf_error(
+                    asset,
+                    "source.pdf.encrypted",
+                    "PDF is encrypted or declares an encryption dictionary.",
+                )
+            ],
+        )
+
+    page_ids = _page_object_ids(objects)
+    # Fall back to the number of page objects when no ``/Count`` is declared.
+    page_count = _page_count(objects) or len(page_ids)
+    pages: list[PdfPage] = []
+    # Page numbers follow the order of the ids returned by _page_object_ids.
+    for page_number, object_id in enumerate(page_ids, start=1):
+        page_body = objects[object_id]
+        page_diagnostics: list[Diagnostic] = []
+        content_ids = _content_refs(page_body)
+        text_parts: list[str] = []
+        # No /Contents reference: use a stream embedded in the page object itself.
+        if not content_ids and STREAM_RE.search(page_body):
+            stream = _stream_data(page_body, asset, page_diagnostics)
+            if stream:
+                text_parts.append(_extract_stream_text(stream))
+        for content_id in content_ids:
+            content_body = objects.get(content_id)
+            if content_body is None:
+                page_diagnostics.append(
+                    _warning(
+                        asset,
+                        "source.pdf.missing_content_stream",
+                        f"PDF page {page_number} references missing content object {content_id}.",
+                        details={"page": page_number, "object_id": content_id},
+                    )
+                )
+                continue
+            stream = _stream_data(content_body, asset, page_diagnostics)
+            if stream:
+                text_parts.append(_extract_stream_text(stream))
+        # Blank parts are dropped before joining the streams' text.
+        text = "\n".join(part for part in text_parts if part.strip()).strip()
+        if _page_may_be_image_only(page_body, objects, content_ids) and not text:
+            page_diagnostics.append(
+                _warning(
+                    asset,
+                    "source.pdf.image_only_page",
+                    f"PDF page {page_number} appears to contain image content without extractable text.",
+                    details={"page": page_number},
+                )
+            )
+        pages.append(PdfPage(page_number, object_id, text, page_diagnostics))
+
+    if not page_ids:
+        diagnostics.append(_malformed(asset, "PDF does not declare any page objects."))
+
+    return PdfPackage(
+        metadata=metadata,
+        page_count=page_count,
+        encrypted=False,
+        pages=pages,
+        diagnostics=diagnostics,
+    )
+
+
+def _parse_objects(data: bytes) -> dict[int, bytes]:
+    """Index every ``N G obj ... endobj`` body in *data* by object number.
+
+    The generation number is ignored; when an object number occurs more than
+    once, the body of the last occurrence is kept.
+    """
+    bodies: dict[int, bytes] = {}
+    for found in OBJECT_RE.finditer(data):
+        bodies[int(found.group(1))] = found.group(3)
+    return bodies
+
+
+def _page_object_ids(objects: dict[int, bytes]) -> list[int]:
+    """Return, in ascending order, the ids of objects typed ``/Page``.
+
+    Objects that also match ``/Type /Pages`` are excluded.
+    """
+    page_ids: list[int] = []
+    for object_id in sorted(objects):
+        body = objects[object_id]
+        if not PAGE_TYPE_RE.search(body):
+            continue
+        if PAGES_TYPE_RE.search(body):
+            continue
+        page_ids.append(object_id)
+    return page_ids
+
+
+def _page_count(objects: dict[int, bytes]) -> int:
+    """Return the largest ``/Count`` declared by any ``/Pages`` object.
+
+    Returns 0 when no ``/Pages`` object declares a count.
+    """
+    largest = 0
+    for body in objects.values():
+        if not PAGES_TYPE_RE.search(body):
+            continue
+        declared = COUNT_RE.search(body)
+        if declared is None:
+            continue
+        # Counts are non-negative, so starting from 0 matches "no count found".
+        largest = max(largest, int(declared.group(1)))
+    return largest
+
+
+def _content_refs(page_body: bytes) -> list[int]:
+    """Collect the object numbers referenced by the page's ``/Contents`` entry.
+
+    Both the array form (``/Contents [5 0 R 6 0 R]``) and the single-reference
+    form (``/Contents 5 0 R``) are recognised; array references come first.
+    """
+    ordered: list[int] = []
+    array_form = re.search(rb"/Contents\s*\[(.*?)\]", page_body, re.DOTALL)
+    if array_form is not None:
+        for reference in REF_RE.finditer(array_form.group(1)):
+            ordered.append(int(reference.group(1)))
+    single_form = re.search(rb"/Contents\s+(\d+)\s+\d+\s+R", page_body)
+    if single_form is not None:
+        target = int(single_form.group(1))
+        # Avoid listing the same object twice when both forms name it.
+        if target not in ordered:
+            ordered.append(target)
+    return ordered
+
+
+def _stream_data(
+    object_body: bytes,
+    asset: SourceAsset,
+    diagnostics: list[Diagnostic],
+) -> bytes:
+    """Return the stream payload of *object_body*, inflating FlateDecode data.
+
+    Appends a warning to *diagnostics* and returns ``b""`` when the object has
+    no stream or when decompression fails. Streams whose body does not mention
+    ``/FlateDecode`` are returned as found.
+    """
+    found = STREAM_RE.search(object_body)
+    if found is None:
+        diagnostics.append(
+            _warning(
+                asset,
+                "source.pdf.missing_stream",
+                "PDF content object does not contain a readable stream.",
+            )
+        )
+        return b""
+
+    payload = found.group(1)
+    if b"/FlateDecode" not in object_body:
+        return payload
+
+    try:
+        inflated = zlib.decompress(payload)
+    except zlib.error as exc:
+        diagnostics.append(
+            _warning(
+                asset,
+                "source.pdf.flate_decode_failed",
+                "PDF FlateDecode stream could not be decompressed.",
+                details={"error": str(exc)},
+            )
+        )
+        return b""
+    return inflated
+
+
+def _extract_metadata(
+    data: bytes,
+    objects: dict[int, bytes],
+    encrypted: bool,
+) -> SourceMetadata:
+    """Build ``SourceMetadata`` from the document information entries.
+
+    Values are looked up in the object referenced by ``/Info``; when that
+    object is missing or empty, the whole file is searched instead.
+    """
+    info_body = b""
+    trailer_ref = INFO_REF_RE.search(data)
+    if trailer_ref is not None:
+        info_body = objects.get(int(trailer_ref.group(1)), b"")
+    info_body = info_body or data
+
+    def lookup(key: str) -> str | None:
+        return _metadata_value(info_body, key)
+
+    title = lookup("Title")
+    author = lookup("Author")
+    creation_date = lookup("CreationDate")
+    modification_date = lookup("ModDate")
+
+    candidates = (
+        ("subject", lookup("Subject")),
+        ("keywords", lookup("Keywords")),
+        ("producer", lookup("Producer")),
+        ("creator", lookup("Creator")),
+        ("creation_date", creation_date),
+        ("modification_date", modification_date),
+        ("encrypted", encrypted),
+    )
+    raw = {}
+    for name, value in candidates:
+        # Missing and empty-string values are left out; ``False`` is kept.
+        if value is None or value == "":
+            continue
+        raw[name] = value
+
+    creators = [author] if author else []
+    return SourceMetadata(
+        title=title,
+        creators=creators,
+        publication_date=creation_date,
+        raw=raw,
+    )
+
+
+def _metadata_value(body: bytes, key: str) -> str | None:
+    """Return the cleaned string stored after the first ``/key`` in *body*.
+
+    Literal ``(...)`` and hexadecimal ``<...>`` strings are supported. Returns
+    ``None`` when the key is absent or is followed by anything else.
+    """
+    marker = b"/" + key.encode("ascii")
+    position = body.find(marker)
+    if position == -1:
+        return None
+
+    # Skip the whitespace between the key and its value.
+    cursor = position + len(marker)
+    size = len(body)
+    while cursor < size and body[cursor] in b" \t\r\n":
+        cursor += 1
+    if cursor >= size:
+        return None
+
+    lead = body[cursor:cursor + 2]
+    if lead.startswith(b"("):
+        literal, _ = _read_literal_string(body, cursor)
+        return _clean_text(literal)
+    # ``<<`` opens a dictionary, not a hex string.
+    if lead.startswith(b"<") and lead != b"<<":
+        close = body.find(b">", cursor + 1)
+        if close > cursor:
+            return _clean_text(_decode_hex(body[cursor + 1:close]))
+    return None
+
+
+def _extract_stream_text(stream: bytes) -> str:
+    """Extract the text shown by a content stream, one entry per text line.
+
+    Handles the text-showing operators ``Tj``, ``TJ``, ``'`` and ``"`` and
+    starts a new line on ``'``, ``"``, ``T*``, ``TD`` and on ``Td`` with a
+    negative second operand.
+    """
+    tokens = _pdf_tokens(stream)
+    lines = [""]
+    array_open: int | None = None
+    for position, token in enumerate(tokens):
+        if token == "[":
+            # Remember where the most recent array starts for a following TJ.
+            array_open = position
+        elif token == "Tj":
+            _append_text(lines, _previous_text(tokens, position))
+        elif token == "TJ":
+            if array_open is not None:
+                pieces = [
+                    item[1]
+                    for item in tokens[array_open:position]
+                    if isinstance(item, tuple) and item[0] == "text"
+                ]
+                _append_text(lines, "".join(pieces))
+            array_open = None
+        elif token in ("'", '"'):
+            # Both operators move to the next line before showing their string.
+            _new_line(lines)
+            _append_text(lines, _previous_text(tokens, position))
+        elif token in ("T*", "TD"):
+            _new_line(lines)
+        elif token == "Td" and _td_moves_to_new_line(tokens, position):
+            _new_line(lines)
+    stripped = [line.rstrip() for line in lines]
+    return "\n".join(stripped).strip()
+
+
+def _pdf_tokens(stream: bytes) -> list[str | tuple[str, str]]:
+    """Split a content stream into tokens.
+
+    String objects become ``("text", value)`` tuples; ``[`` and ``]`` and all
+    other runs of regular characters (operators, numbers, names without their
+    leading ``/``) become plain strings. Comments are skipped.
+    """
+    tokens: list[str | tuple[str, str]] = []
+    index = 0
+    while index < len(stream):
+        # One-byte slice (not an int) so it can be compared against bytes.
+        char = stream[index:index + 1]
+        if char in b" \t\r\n\f\x00":
+            index += 1
+            continue
+        if char == b"%":
+            # Comment: skip to the byte after the next "\n" (or to the end).
+            newline = stream.find(b"\n", index)
+            index = len(stream) if newline < 0 else newline + 1
+            continue
+        if char == b"(":
+            text, index = _read_literal_string(stream, index)
+            tokens.append(("text", text))
+            continue
+        # A single "<" opens a hex string; "<<" (dictionary) falls through.
+        if char == b"<" and stream[index:index + 2] != b"<<":
+            end = stream.find(b">", index + 1)
+            if end < 0:
+                # Unterminated hex string: stop tokenizing.
+                break
+            tokens.append(("text", _decode_hex(stream[index + 1:end])))
+            index = end + 1
+            continue
+        if char in b"[]":
+            tokens.append(char.decode("ascii"))
+            index += 1
+            continue
+        # Regular token: read up to the next whitespace or delimiter byte.
+        end = index
+        while end < len(stream) and stream[end:end + 1] not in b" \t\r\n\f\x00[]()<>/%":
+            end += 1
+        if end == index:
+            # Lone delimiter (e.g. "/", ">", ")" or one half of "<<"): skip it.
+            index += 1
+            continue
+        tokens.append(stream[index:end].decode("latin-1", errors="ignore"))
+        index = end
+    return tokens
+
+
+def _read_literal_string(data: bytes, start: int) -> tuple[str, int]:
+    """Decode the PDF literal string whose opening ``(`` is at *start*.
+
+    Balanced nested parentheses are kept as part of the value and backslash
+    escapes are resolved. Returns the decoded text and the index of the first
+    byte after the closing ``)`` (or ``len(data)`` for an unterminated string).
+
+    Strings starting with the UTF-16BE byte order mark ``FE FF`` are decoded
+    as UTF-16BE; everything else is decoded as UTF-8 with replacement of
+    undecodable bytes.
+    """
+    depth = 1
+    index = start + 1
+    output = bytearray()
+    while index < len(data) and depth > 0:
+        char = data[index]
+        if char == 0x5C:  # backslash: the helper consumes the whole escape
+            escaped, index = _read_escape(data, index + 1)
+            output.extend(escaped)
+            continue
+        if char == 0x28:  # "(" opens a nested level that belongs to the value
+            depth += 1
+            output.append(char)
+        elif char == 0x29:  # ")" closes a level; the outermost one is dropped
+            depth -= 1
+            if depth:
+                output.append(char)
+        else:
+            output.append(char)
+        index += 1
+    raw = bytes(output)
+    if raw.startswith(b"\xfe\xff"):
+        # PDF text strings with this marker are UTF-16BE, not UTF-8.
+        return raw[2:].decode("utf-16-be", errors="replace"), index
+    return raw.decode("utf-8", errors="replace"), index
+
+
+def _read_escape(data: bytes, index: int) -> tuple[bytes, int]:
+    """Decode one backslash escape of a PDF literal string.
+
+    *index* points at the byte right after the backslash. Returns the decoded
+    bytes (possibly empty) and the index of the first byte after the escape.
+    """
+    if index >= len(data):
+        # Dangling backslash at the very end: keep it literally.
+        return b"\\", index
+    char = data[index]
+    escapes = {
+        ord("n"): b"\n",
+        ord("r"): b"\r",
+        ord("t"): b"\t",
+        ord("b"): b"\b",
+        ord("f"): b"\f",
+        ord("("): b"(",
+        ord(")"): b")",
+        ord("\\"): b"\\",
+    }
+    if char in escapes:
+        return escapes[char], index + 1
+    if char in b"\r\n":
+        # Backslash + end-of-line is a line continuation and yields nothing;
+        # a CR LF pair counts as a single end-of-line.
+        if char == ord("\r") and index + 1 < len(data) and data[index + 1] == ord("\n"):
+            return b"", index + 2
+        return b"", index + 1
+    if 0x30 <= char <= 0x37:
+        # Octal escape of one to three digits.
+        end = index + 1
+        while end < min(index + 3, len(data)) and 0x30 <= data[end] <= 0x37:
+            end += 1
+        # Three digits can encode up to 0o777 (511). Only the low-order byte
+        # is kept, so bytes() can no longer raise ValueError on \400..\777.
+        return bytes([int(data[index:end], 8) & 0xFF]), end
+    # Unknown escape: the backslash is dropped and the character kept.
+    return bytes([char]), index + 1
+
+
+def _decode_hex(value: bytes) -> str:
+    """Decode the contents of a PDF hexadecimal string (without ``<`` ``>``).
+
+    Whitespace between digits is ignored and an odd number of digits is padded
+    with a trailing ``0``. Data starting with the UTF-16BE byte order mark
+    ``FE FF`` is decoded as UTF-16BE, everything else as UTF-8 with
+    replacement of undecodable bytes. Returns ``""`` for non-hexadecimal input.
+    """
+    cleaned = re.sub(rb"\s+", b"", value)
+    if len(cleaned) % 2:
+        cleaned += b"0"
+    try:
+        # Non-ASCII bytes raise UnicodeDecodeError, a ValueError subclass.
+        raw = bytes.fromhex(cleaned.decode("ascii"))
+    except ValueError:
+        return ""
+    if raw.startswith(b"\xfe\xff"):
+        # PDF text strings with this marker are UTF-16BE, not UTF-8.
+        return raw[2:].decode("utf-16-be", errors="replace")
+    return raw.decode("utf-8", errors="replace")
+
+
+def _previous_text(tokens: list[str | tuple[str, str]], index: int) -> str:
+    """Return the nearest text token before *index*.
+
+    The backwards search stops at the previous text or positioning operator,
+    in which case (as when nothing is found) ``""`` is returned.
+    """
+    boundaries = {"Tj", "TJ", "'", '"', "T*", "Td", "TD"}
+    for cursor in range(index - 1, -1, -1):
+        candidate = tokens[cursor]
+        if isinstance(candidate, tuple) and candidate[0] == "text":
+            return candidate[1]
+        if candidate in boundaries:
+            return ""
+    return ""
+
+
+def _td_moves_to_new_line(tokens: list[str | tuple[str, str]], index: int) -> bool:
+    """Tell whether the ``Td`` at *index* has a negative second operand.
+
+    Returns ``False`` when fewer than two tokens precede the operator or when
+    the operand is not a numeric string.
+    """
+    if index < 2:
+        return False
+    y_operand = tokens[index - 1]
+    if not isinstance(y_operand, str):
+        return False
+    try:
+        offset = float(y_operand)
+    except ValueError:
+        return False
+    return offset < 0
+
+
+def _append_text(lines: list[str], text: str) -> None:
+    """Append *text* to the last entry of *lines*; empty text is a no-op."""
+    if not text:
+        return
+    lines[-1] = lines[-1] + text
+
+
+def _new_line(lines: list[str]) -> None:
+    """Start a new empty line unless the current last line is still empty."""
+    if not lines[-1]:
+        return
+    lines.append("")
+
+
+def _select_pages(
+    pages: list[PdfPage],
+    page_range: Any,
+    asset: SourceAsset,
+) -> tuple[list[PdfPage], list[Diagnostic]]:
+    """Filter *pages* by the ``page_range`` option.
+
+    ``None`` and ``""`` mean "all pages" and return *pages* unchanged. In all
+    other cases the range is parsed and the matching pages are returned in
+    document order together with any diagnostics from parsing.
+    """
+    if page_range is None or page_range == "":
+        return pages, []
+    wanted, problems = _parse_page_range(page_range, len(pages), asset)
+    kept = [page for page in pages if page.number in wanted]
+    return kept, problems
+
+
+def _parse_page_range(
+    value: Any,
+    page_count: int,
+    asset: SourceAsset,
+) -> tuple[set[int], list[Diagnostic]]:
+    """Parse a 1-based page range such as ``"1-3,5"``.
+
+    *value* may be a string, an integer, or a list/tuple whose items are
+    joined with commas. Returns the selected page numbers that exist in a
+    document of *page_count* pages, plus error diagnostics for malformed parts
+    and for pages outside the document. Invalid input never raises.
+    """
+    diagnostics: list[Diagnostic] = []
+    selected: set[int] = set()
+    if isinstance(value, int):
+        value = str(value)
+    if isinstance(value, (list, tuple)):
+        value = ",".join(str(item) for item in value)
+    if not isinstance(value, str):
+        return set(), [
+            _pdf_error(
+                asset,
+                "source.pdf.invalid_page_range",
+                "PDF page_range option must be a string, integer, or list of integers.",
+            )
+        ]
+    for part in value.split(","):
+        part = part.strip()
+        if not part:
+            continue
+        # isdecimal() rather than isdigit(): isdigit() also accepts characters
+        # such as "²" that int() rejects with ValueError.
+        if "-" in part:
+            start_text, end_text = part.split("-", 1)
+            if not start_text.isdecimal() or not end_text.isdecimal():
+                diagnostics.append(_invalid_page_range(asset, value))
+                continue
+            start, end = int(start_text), int(end_text)
+            if start > end:
+                diagnostics.append(_invalid_page_range(asset, value))
+                continue
+            selected.update(range(start, end + 1))
+        elif part.isdecimal():
+            selected.add(int(part))
+        else:
+            diagnostics.append(_invalid_page_range(asset, value))
+    out_of_range = sorted(page for page in selected if page < 1 or page > page_count)
+    if out_of_range:
+        diagnostics.append(
+            _pdf_error(
+                asset,
+                "source.pdf.page_range_out_of_bounds",
+                "PDF page_range selects pages outside the document.",
+                details={"page_range": value, "page_count": page_count, "pages": out_of_range},
+            )
+        )
+    selected = {page for page in selected if 1 <= page <= page_count}
+    # A range that selects nothing (e.g. only commas) is reported as invalid.
+    if not selected and not diagnostics:
+        diagnostics.append(_invalid_page_range(asset, value))
+    return selected, diagnostics
+
+
+def _invalid_page_range(asset: SourceAsset, value: str) -> Diagnostic:
+    """Build the error diagnostic for an unparsable ``page_range`` value."""
+    details = {"page_range": value}
+    return _pdf_error(
+        asset,
+        "source.pdf.invalid_page_range",
+        "PDF page_range option is invalid.",
+        details=details,
+    )
+
+
+def _page_may_be_image_only(
+    page_body: bytes,
+    objects: dict[int, bytes],
+    content_ids: list[int],
+) -> bool:
+    """Tell whether the page or its content objects hint at image content.
+
+    Looks for an ``/Subtype /Image`` entry or a ``Do`` operator in the page
+    body and in the bodies of the referenced content objects; references to
+    missing objects contribute nothing.
+    """
+    referenced = [objects.get(ref, b"") for ref in content_ids]
+    haystack = b"\n".join([page_body, b"\n".join(referenced)])
+    return re.search(rb"/Subtype\s*/Image\b|\bDo\b", haystack) is not None
+
+
+def _normalize_markdown(text: str, normalize_whitespace: bool) -> str:
+    """Tidy extracted page text for use as Markdown.
+
+    Line endings are unified to ``\\n``, runs of blank lines collapse to a
+    single blank line, and leading/trailing blank space is removed. With
+    *normalize_whitespace*, runs of spaces and tabs inside a line collapse to
+    one space and each line is stripped; otherwise lines are only
+    right-stripped.
+    """
+    unified = text.replace("\r\n", "\n").replace("\r", "\n")
+    kept: list[str] = []
+    previous_blank = False
+    for raw_line in unified.split("\n"):
+        if normalize_whitespace:
+            line = re.sub(r"[ \t]+", " ", raw_line).strip()
+        else:
+            line = raw_line.rstrip()
+        if line:
+            kept.append(line)
+            previous_blank = False
+            continue
+        # Keep at most one blank line, and never before the first content line.
+        if kept and not previous_blank:
+            kept.append("")
+        previous_blank = True
+    return "\n".join(kept).strip()
+
+
+def _document_id(asset: SourceAsset, metadata: SourceMetadata) -> str:
+    """Derive a ``source.pdf:``-prefixed document id.
+
+    Preference order: a slug of the title, the asset digest without its
+    ``sha256:`` prefix, then the asset name or URI.
+    """
+    title = metadata.title
+    if title:
+        # Anything outside [a-z0-9._-] becomes a single dash.
+        slug = re.sub(r"[^a-z0-9._-]+", "-", title.lower()).strip("-")
+        if slug:
+            return "source.pdf:" + slug
+    digest = asset.digest
+    if digest:
+        return "source.pdf:" + digest.removeprefix("sha256:")
+    fallback = asset.name or asset.uri
+    return f"source.pdf:{fallback}"
+
+
+def _adapter_info(options: dict[str, Any]) -> dict[str, Any]:
+    """Describe this adapter and the options it was invoked with.
+
+    The *options* mapping is referenced as-is, not copied.
+    """
+    info: dict[str, Any] = {"id": "source.pdf", "version": "1"}
+    info["options"] = options
+    return info
+
+
+def _confidence(package: PdfPackage, diagnostics: list[Diagnostic]) -> float:
+    """Score extraction confidence from the share of pages that have text.
+
+    Returns 0.0 when there is an error diagnostic or no pages; otherwise
+    ``0.75 * coverage`` with a floor of 0.1.
+    """
+    pages = package.pages
+    if has_error(diagnostics) or not pages:
+        return 0.0
+    pages_with_text = sum(1 for page in pages if page.text.strip())
+    coverage = pages_with_text / len(pages)
+    return max(0.1, 0.75 * coverage)
+
+
+def _warning(
+    asset: SourceAsset,
+    code: str,
+    message: str,
+    *,
+    details: dict[str, Any] | None = None,
+) -> Diagnostic:
+    """Build a warning-severity diagnostic for *asset*.
+
+    The source location is set only when the asset has a path; missing or
+    empty *details* become an empty dict.
+    """
+    location = None
+    if asset.path:
+        location = SourceLocation(path=asset.path)
+    payload = details if details else {}
+    return Diagnostic(
+        severity="warning",
+        code=code,
+        message=message,
+        source=location,
+        details=payload,
+    )
+
+
+def _pdf_error(
+    asset: SourceAsset,
+    code: str,
+    message: str,
+    *,
+    details: dict[str, Any] | None = None,
+) -> Diagnostic:
+    """Build an error-severity diagnostic for *asset*.
+
+    The source location is set only when the asset has a path; missing or
+    empty *details* become an empty dict.
+    """
+    location = None
+    if asset.path:
+        location = SourceLocation(path=asset.path)
+    payload = details if details else {}
+    return Diagnostic(
+        severity="error",
+        code=code,
+        message=message,
+        source=location,
+        details=payload,
+    )
+
+
+def _malformed(
+    asset: SourceAsset,
+    message: str,
+    details: dict[str, Any] | None = None,
+) -> Diagnostic:
+    """Build an error diagnostic with the ``source.malformed`` code."""
+    code = "source.malformed"
+    return _pdf_error(asset, code, message, details=details)
+
+
+def _warning_count(diagnostics: list[Diagnostic]) -> int:
+    """Count the diagnostics whose severity is ``"warning"``."""
+    warnings = [entry for entry in diagnostics if entry.severity == "warning"]
+    return len(warnings)
+
+
+def _clean_text(text: str) -> str:
+    """Collapse whitespace runs and remove whitespace before punctuation.
+
+    Every run of whitespace becomes one space, the result is stripped, and
+    whitespace directly before ``. , ; : ! ?`` is dropped.
+    """
+    single_spaced = re.sub(r"\s+", " ", text)
+    trimmed = single_spaced.strip()
+    return re.sub(r"\s+([.,;:!?])", r"\1", trimmed)