generated from coulomb/repo-seed
feat(source): add pdf read adapter
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
"""Concrete source-format adapters for Markitect."""
|
||||
|
||||
from markitect_filter.adapters import epub3_adapter_descriptor
|
||||
from markitect_filter.adapters import epub3_adapter_descriptor, pdf_adapter_descriptor
|
||||
|
||||
__all__ = ["epub3_adapter_descriptor"]
|
||||
__all__ = ["epub3_adapter_descriptor", "pdf_adapter_descriptor"]
|
||||
|
||||
@@ -49,3 +49,59 @@ def epub3_adapter_descriptor() -> SourceAdapterDescriptor:
|
||||
"dependency_profile": "stdlib",
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
def pdf_adapter_descriptor() -> SourceAdapterDescriptor:
    """Build the descriptor that advertises the lightweight PDF read adapter.

    The descriptor declares a single ``read`` operation for ``application/pdf``
    / ``.pdf`` sources, the JSON-schema-style option contract, and the
    safety / quality / metadata profiles of the adapter.
    """

    def factory():
        # Deferred import: the adapter class is only loaded when an adapter
        # instance is actually requested.
        from markitect_filter.pdf import PdfReadAdapter

        return PdfReadAdapter()

    # Options accepted by the adapter; unknown keys are rejected by the schema.
    option_properties = {
        "page_range": {
            "type": "string",
            "description": "Optional 1-based page range such as `1-3,5`.",
        },
        "include_page_breaks": {
            "type": "boolean",
            "default": False,
            "description": "Prefix each page segment with a Markdown comment page marker.",
        },
        "normalize_whitespace": {
            "type": "boolean",
            "default": True,
            "description": "Collapse repeated horizontal whitespace while preserving extracted line breaks.",
        },
    }
    option_schema = {
        "type": "object",
        "properties": option_properties,
        "additionalProperties": False,
    }

    # The adapter only reads local files: no writes, network or subprocesses.
    safety = {
        "reads_files": True,
        "writes_files": False,
        "network": False,
        "external_process": False,
    }

    quality_profile = {
        "text_extraction": "stdlib-pdf-text",
        "images": "diagnostic-only",
        "styles": "ignored",
        "tables": "plain-text-only",
    }

    metadata = {
        "format": "PDF",
        "dependency_profile": "stdlib",
    }

    return SourceAdapterDescriptor(
        id="source.pdf",
        version="1",
        name="PDF",
        summary="Read digitally-readable PDFs into canonical Markitect Markdown.",
        operations=["read"],
        media_types=["application/pdf"],
        extensions=[".pdf"],
        factory=factory,
        option_schema=option_schema,
        safety=safety,
        quality_profile=quality_profile,
        metadata=metadata,
    )
|
||||
|
||||
782
src/markitect_filter/pdf.py
Normal file
782
src/markitect_filter/pdf.py
Normal file
@@ -0,0 +1,782 @@
|
||||
"""PDF read adapter implementation."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import zlib
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from markitect_tool.diagnostics import Diagnostic, SourceLocation, has_error
|
||||
from markitect_tool.source import (
|
||||
NormalizationQuality,
|
||||
NormalizedMarkdownDocument,
|
||||
NormalizedMarkdownSegment,
|
||||
SourceAdapterMatch,
|
||||
SourceAdapterMatchRequest,
|
||||
SourceAsset,
|
||||
SourceInspectRequest,
|
||||
SourceInspectResult,
|
||||
SourceMetadata,
|
||||
SourceProvenance,
|
||||
SourceReadRequest,
|
||||
SourceReadResult,
|
||||
normalization_cache_key,
|
||||
)
|
||||
|
||||
from markitect_filter.adapters import pdf_adapter_descriptor
|
||||
|
||||
|
||||
# File signature, e.g. ``%PDF-1.7`` (single-digit major and minor version).
PDF_HEADER_RE = re.compile(rb"%PDF-\d\.\d")
# Indirect object ``N G obj ... endobj``; groups: object number, generation,
# and the raw body between ``obj`` and the first following ``endobj``.
OBJECT_RE = re.compile(rb"(\d+)\s+(\d+)\s+obj\b(.*?)\bendobj", re.DOTALL)
# Stream payload between ``stream<EOL>`` and ``<EOL>endstream``.
STREAM_RE = re.compile(rb"stream\r?\n(.*?)\r?\nendstream", re.DOTALL)
# ``/Type /Page`` -- the trailing ``\b`` keeps ``/Pages`` from matching.
PAGE_TYPE_RE = re.compile(rb"/Type\s*/Page\b")
# ``/Type /Pages`` (page-tree node).
PAGES_TYPE_RE = re.compile(rb"/Type\s*/Pages\b")
# Indirect reference ``N G R``; captures the object number only.
REF_RE = re.compile(rb"(\d+)\s+\d+\s+R")
# ``/Info N G R`` reference; captures the object number.
INFO_REF_RE = re.compile(rb"/Info\s+(\d+)\s+\d+\s+R")
# ``/Count N`` entry; captures the integer.
COUNT_RE = re.compile(rb"/Count\s+(\d+)")
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class PdfPage:
    """Text extracted from one PDF page plus the diagnostics raised for it."""

    # Page number as assigned by the loader (it enumerates pages from 1).
    number: int
    # Object number of the page object inside the PDF file.
    object_id: int
    # Extracted text of the page; empty when nothing could be extracted.
    text: str
    # Diagnostics collected while processing this page.
    diagnostics: list[Diagnostic]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class PdfPackage:
    """Result of loading a PDF: metadata, pages and document-level diagnostics."""

    # Document metadata extracted from the file.
    metadata: SourceMetadata
    # Page count reported for the document.
    page_count: int
    # True when the loader found an encryption marker in the file.
    encrypted: bool
    # Extracted pages; empty when loading failed or the file is encrypted.
    pages: list[PdfPage]
    # Document-level diagnostics (page-level ones live on each PdfPage).
    diagnostics: list[Diagnostic]
|
||||
|
||||
|
||||
class PdfReadAdapter:
    """Source adapter that converts text-bearing PDFs into normalized Markdown.

    The adapter offers three entry points: ``can_read`` (format matching),
    ``inspect`` (metadata and quality estimate without producing a document)
    and ``read`` (full conversion into a ``NormalizedMarkdownDocument``).
    """

    def __init__(self) -> None:
        self.descriptor = pdf_adapter_descriptor()

    def can_read(self, request: SourceAdapterMatchRequest) -> SourceAdapterMatch:
        """Report whether the asset looks like a PDF.

        A matching media type wins with confidence 100; a ``.pdf`` extension
        alone gives confidence 80; anything else is reported as unsupported.
        """
        asset = request.asset
        if asset.media_type == "application/pdf":
            verdict = (True, 100, "media_type")
        elif asset.extension == ".pdf":
            verdict = (True, 80, "extension")
        else:
            verdict = (False, 0, "unsupported")
        matched, confidence, reason = verdict
        return SourceAdapterMatch(
            adapter_id=self.descriptor.id,
            matched=matched,
            confidence=confidence,
            reason=reason,
        )

    def inspect(self, request: SourceInspectRequest) -> SourceInspectResult:
        """Load the PDF and summarize its metadata and expected extraction quality.

        Only document-level diagnostics are reported; the result is ``valid``
        when none of them is an error.
        """
        package = _load_pdf(request.asset)
        diagnostics = package.diagnostics
        failed = has_error(diagnostics)
        pages_with_text = len([page for page in package.pages if page.text.strip()])
        quality = NormalizationQuality(
            lossiness="unknown" if failed else "medium",
            confidence=_confidence(package, diagnostics),
            warnings=_warning_count(diagnostics),
            metadata={
                "extraction": "pdf-stdlib-text",
                "page_count": package.page_count,
                "pages_with_text": pages_with_text,
                "encrypted": package.encrypted,
            },
        )
        return SourceInspectResult(
            asset=request.asset,
            adapter=_adapter_info(request.options),
            metadata=package.metadata,
            capabilities=["read"],
            quality=quality,
            diagnostics=diagnostics,
            valid=not failed,
        )

    def read(self, request: SourceReadRequest) -> SourceReadResult:
        """Convert the PDF into one Markdown segment per page that yields text.

        Honors the ``page_range``, ``normalize_whitespace`` and
        ``include_page_breaks`` options. Returns an invalid result (with
        diagnostics and no document) when loading fails, the page range is
        invalid, or no selected page produces text.
        """
        asset = request.asset
        options = request.options

        package = _load_pdf(asset)
        if has_error(package.diagnostics):
            return SourceReadResult(diagnostics=package.diagnostics, valid=False)

        selected_pages, range_diagnostics = _select_pages(
            package.pages,
            options.get("page_range"),
            asset,
        )
        diagnostics = list(package.diagnostics)
        diagnostics.extend(range_diagnostics)
        if has_error(diagnostics):
            return SourceReadResult(diagnostics=diagnostics, valid=False)

        collapse_whitespace = bool(options.get("normalize_whitespace", True))
        mark_pages = bool(options.get("include_page_breaks", False))

        segments: list[NormalizedMarkdownSegment] = []
        skipped_pages = 0
        for page in selected_pages:
            body = _normalize_markdown(page.text, collapse_whitespace)
            # Page-level diagnostics are surfaced whether or not text was found.
            diagnostics.extend(page.diagnostics)
            if body:
                if mark_pages:
                    body = f"<!-- page: {page.number} -->\n\n{body}"
                page_provenance = SourceProvenance(
                    source_uri=asset.uri,
                    source_path=asset.path,
                    page=str(page.number),
                    digest=asset.digest,
                    metadata={"pdf_object": page.object_id},
                )
                segments.append(
                    NormalizedMarkdownSegment(
                        segment_id=f"page-{page.number:04d}",
                        order=len(segments),
                        markdown=body,
                        provenance=[page_provenance],
                        metadata={"page": page.number, "pdf_object": page.object_id},
                    )
                )
                continue
            skipped_pages += 1
            diagnostics.append(
                _warning(
                    asset,
                    "source.pdf.empty_page",
                    f"PDF page {page.number} did not produce extractable text.",
                    details={"page": page.number},
                )
            )

        if not segments:
            diagnostics.append(
                _pdf_error(
                    asset,
                    "source.pdf.no_extractable_text",
                    "PDF did not produce any extractable Markdown text.",
                    details={"page_count": package.page_count},
                )
            )
            return SourceReadResult(diagnostics=diagnostics, valid=False)

        markdown = "\n\n".join(segment.markdown.strip() for segment in segments)
        # Share of the selected pages that actually produced a segment.
        page_coverage = len(segments) / max(len(selected_pages), 1)
        warning_count = _warning_count(diagnostics)
        if has_error(diagnostics):
            confidence = 0.0
        else:
            confidence = max(0.1, 0.75 * page_coverage)
        quality = NormalizationQuality(
            lossiness="medium" if warning_count else "low",
            confidence=confidence,
            skipped_items=skipped_pages,
            warnings=warning_count,
            metadata={
                "extraction": "pdf-stdlib-text",
                "page_count": package.page_count,
                "selected_pages": [page.number for page in selected_pages],
                "pages_extracted": len(segments),
                "page_coverage": page_coverage,
            },
        )
        document_provenance = SourceProvenance(
            source_uri=asset.uri,
            source_path=asset.path,
            digest=asset.digest,
            metadata={"page_count": package.page_count},
        )
        cache_key = normalization_cache_key(
            asset=asset,
            adapter_id=self.descriptor.id,
            adapter_version=self.descriptor.version,
            options=options,
        )
        document = NormalizedMarkdownDocument(
            document_id=_document_id(asset, package.metadata),
            asset=asset,
            metadata=package.metadata,
            markdown=markdown,
            segments=segments,
            quality=quality,
            diagnostics=diagnostics,
            provenance=[document_provenance],
            adapter=_adapter_info(options),
            cache_key=cache_key,
        )
        # NOTE(review): on success the result-level diagnostics list is empty;
        # the collected warnings are attached to the document instead --
        # confirm callers read ``document.diagnostics``.
        return SourceReadResult(
            document=document,
            diagnostics=[],
            valid=not has_error(diagnostics),
        )
|
||||
|
||||
|
||||
def _load_pdf(asset: SourceAsset) -> PdfPackage:
    """Read the asset from disk and extract metadata and per-page text.

    Failures are reported as diagnostics on the returned package rather than
    raised: unreadable file, missing ``%PDF-`` header, encryption marker, or
    no page objects. Page-level problems are attached to each ``PdfPage``.
    """

    def rejected(
        diagnostic: Diagnostic,
        metadata: SourceMetadata | None = None,
        page_count: int = 0,
        encrypted: bool = False,
    ) -> PdfPackage:
        # A package with no pages that carries the single blocking diagnostic.
        return PdfPackage(
            metadata=SourceMetadata() if metadata is None else metadata,
            page_count=page_count,
            encrypted=encrypted,
            pages=[],
            diagnostics=[diagnostic],
        )

    try:
        data = Path(asset.path or asset.uri).read_bytes()
    except OSError as exc:
        return rejected(
            _pdf_error(
                asset,
                "source.pdf.unreadable",
                "PDF file could not be read.",
                details={"error": str(exc)},
            )
        )

    if PDF_HEADER_RE.match(data.lstrip()) is None:
        return rejected(_malformed(asset, "PDF does not start with a PDF header."))

    objects = _parse_objects(data)
    encrypted = bool(re.search(rb"/Encrypt\b", data))
    metadata = _extract_metadata(data, objects, encrypted)
    if encrypted:
        # Encrypted content cannot be decoded here; report and stop.
        return rejected(
            _pdf_error(
                asset,
                "source.pdf.encrypted",
                "PDF is encrypted or declares an encryption dictionary.",
            ),
            metadata=metadata,
            page_count=_page_count(objects),
            encrypted=True,
        )

    page_ids = _page_object_ids(objects)
    # Prefer the declared /Count; fall back to the number of page objects.
    page_count = _page_count(objects) or len(page_ids)

    diagnostics: list[Diagnostic] = []
    if not page_ids:
        diagnostics.append(_malformed(asset, "PDF does not declare any page objects."))

    pages: list[PdfPage] = []
    for page_number, object_id in enumerate(page_ids, start=1):
        page_body = objects[object_id]
        page_diagnostics: list[Diagnostic] = []
        content_ids = _content_refs(page_body)
        text_parts: list[str] = []

        if content_ids:
            for content_id in content_ids:
                content_body = objects.get(content_id)
                if content_body is None:
                    page_diagnostics.append(
                        _warning(
                            asset,
                            "source.pdf.missing_content_stream",
                            f"PDF page {page_number} references missing content object {content_id}.",
                            details={"page": page_number, "object_id": content_id},
                        )
                    )
                    continue
                stream = _stream_data(content_body, asset, page_diagnostics)
                if stream:
                    text_parts.append(_extract_stream_text(stream))
        elif STREAM_RE.search(page_body):
            # No /Contents reference, but the page object embeds a stream.
            stream = _stream_data(page_body, asset, page_diagnostics)
            if stream:
                text_parts.append(_extract_stream_text(stream))

        text = "\n".join(part for part in text_parts if part.strip()).strip()
        if not text and _page_may_be_image_only(page_body, objects, content_ids):
            page_diagnostics.append(
                _warning(
                    asset,
                    "source.pdf.image_only_page",
                    f"PDF page {page_number} appears to contain image content without extractable text.",
                    details={"page": page_number},
                )
            )
        pages.append(
            PdfPage(
                number=page_number,
                object_id=object_id,
                text=text,
                diagnostics=page_diagnostics,
            )
        )

    return PdfPackage(
        metadata=metadata,
        page_count=page_count,
        encrypted=False,
        pages=pages,
        diagnostics=diagnostics,
    )
|
||||
|
||||
|
||||
def _parse_objects(data: bytes) -> dict[int, bytes]:
    """Index every ``N G obj ... endobj`` body in *data* by object number.

    The generation number is ignored; when an object number occurs more than
    once, the last occurrence in the file wins.
    """
    bodies: dict[int, bytes] = {}
    for found in OBJECT_RE.finditer(data):
        object_number = int(found.group(1))
        bodies[object_number] = found.group(3)
    return bodies
|
||||
|
||||
|
||||
def _page_object_ids(objects: dict[int, bytes]) -> list[int]:
    """Return the object numbers of all page objects in document order.

    Page order in a PDF is defined by a depth-first walk of the page tree
    (``/Type /Pages`` nodes and their ``/Kids`` arrays), not by object
    numbers. The tree is walked starting at the root referenced by the
    catalog (when one is found) and then at any page-tree node that is not a
    kid of another node. Page objects that the walk does not reach (for
    example when ``/Kids`` cannot be parsed) are appended in ascending
    object-number order, so the result always contains every page object
    exactly once.

    Args:
        objects: Mapping of object number to raw object body.

    Returns:
        Object numbers of page objects, in reading order where the page tree
        allows it.
    """
    ordered_items = sorted(objects.items())
    leaf_ids = [
        object_id
        for object_id, body in ordered_items
        if PAGE_TYPE_RE.search(body) and not PAGES_TYPE_RE.search(body)
    ]
    leaves = set(leaf_ids)

    # Collect the kids of every page-tree node.
    kids_by_node: dict[int, list[int]] = {}
    referenced: set[int] = set()
    for object_id, body in ordered_items:
        if not PAGES_TYPE_RE.search(body):
            continue
        kids_match = re.search(rb"/Kids\s*\[(.*?)\]", body, re.DOTALL)
        kids = (
            [int(ref.group(1)) for ref in REF_RE.finditer(kids_match.group(1))]
            if kids_match
            else []
        )
        kids_by_node[object_id] = kids
        referenced.update(kids)

    # The catalog's /Pages entry names the authoritative root; other
    # unreferenced nodes are walked afterwards as a best effort.
    roots: list[int] = []
    for _, body in ordered_items:
        if re.search(rb"/Type\s*/Catalog\b", body):
            root_ref = re.search(rb"/Pages\s+(\d+)\s+\d+\s+R", body)
            if root_ref:
                roots.append(int(root_ref.group(1)))
    roots.extend(node_id for node_id in kids_by_node if node_id not in referenced)

    # Iterative depth-first walk; ``visited`` guards against cycles and
    # against pages listed more than once.
    ordered: list[int] = []
    visited: set[int] = set()
    stack = list(reversed(roots))
    while stack:
        current = stack.pop()
        if current in visited:
            continue
        visited.add(current)
        if current in leaves:
            ordered.append(current)
        elif current in kids_by_node:
            stack.extend(reversed(kids_by_node[current]))

    # Keep pages the tree walk could not reach, in the legacy order.
    ordered.extend(object_id for object_id in leaf_ids if object_id not in visited)
    return ordered
|
||||
|
||||
|
||||
def _page_count(objects: dict[int, bytes]) -> int:
    """Return the largest ``/Count`` declared by any ``/Type /Pages`` object.

    Returns 0 when no page-tree node declares a count.
    """
    highest = 0
    for body in objects.values():
        if not PAGES_TYPE_RE.search(body):
            continue
        declared = COUNT_RE.search(body)
        if declared is not None:
            highest = max(highest, int(declared.group(1)))
    return highest
|
||||
|
||||
|
||||
def _content_refs(page_body: bytes) -> list[int]:
|
||||
refs: list[int] = []
|
||||
array_match = re.search(rb"/Contents\s*\[(.*?)\]", page_body, re.DOTALL)
|
||||
if array_match:
|
||||
refs.extend(int(match.group(1)) for match in REF_RE.finditer(array_match.group(1)))
|
||||
direct_match = re.search(rb"/Contents\s+(\d+)\s+\d+\s+R", page_body)
|
||||
if direct_match:
|
||||
ref = int(direct_match.group(1))
|
||||
if ref not in refs:
|
||||
refs.append(ref)
|
||||
return refs
|
||||
|
||||
|
||||
def _stream_data(
    object_body: bytes,
    asset: SourceAsset,
    diagnostics: list[Diagnostic],
) -> bytes:
    """Return the (decompressed) stream payload of a PDF object.

    A warning is appended to *diagnostics* and ``b""`` is returned when the
    object has no stream or when a ``/FlateDecode`` stream fails to inflate.
    Streams without ``/FlateDecode`` are returned as stored.
    """
    located = STREAM_RE.search(object_body)
    if located is None:
        diagnostics.append(
            _warning(
                asset,
                "source.pdf.missing_stream",
                "PDF content object does not contain a readable stream.",
            )
        )
        return b""

    payload = located.group(1)
    if b"/FlateDecode" not in object_body:
        return payload

    try:
        return zlib.decompress(payload)
    except zlib.error as exc:
        diagnostics.append(
            _warning(
                asset,
                "source.pdf.flate_decode_failed",
                "PDF FlateDecode stream could not be decompressed.",
                details={"error": str(exc)},
            )
        )
        return b""
|
||||
|
||||
|
||||
def _extract_metadata(
    data: bytes,
    objects: dict[int, bytes],
    encrypted: bool,
) -> SourceMetadata:
    """Collect document metadata from the PDF information dictionary.

    The dictionary is located through the ``/Info`` reference; when it cannot
    be resolved (or resolves to an empty body) the whole file is searched
    instead. Empty and missing values are left out of ``raw``.
    """
    reference = INFO_REF_RE.search(data)
    info_body = objects.get(int(reference.group(1)), b"") if reference else b""
    if not info_body:
        # Fall back to scanning the entire file for the metadata keys.
        info_body = data

    def lookup(key: str) -> str | None:
        return _metadata_value(info_body, key)

    title = lookup("Title")
    author = lookup("Author")
    creation_date = lookup("CreationDate")
    modification_date = lookup("ModDate")

    candidates = {
        "subject": lookup("Subject"),
        "keywords": lookup("Keywords"),
        "producer": lookup("Producer"),
        "creator": lookup("Creator"),
        "creation_date": creation_date,
        "modification_date": modification_date,
        "encrypted": encrypted,
    }
    raw = {}
    for name, value in candidates.items():
        if value not in (None, ""):
            raw[name] = value

    creators = [author] if author else []
    return SourceMetadata(
        title=title,
        creators=creators,
        publication_date=creation_date,
        raw=raw,
    )
|
||||
|
||||
|
||||
def _metadata_value(body: bytes, key: str) -> str | None:
|
||||
key_bytes = f"/{key}".encode("ascii")
|
||||
index = body.find(key_bytes)
|
||||
if index < 0:
|
||||
return None
|
||||
index += len(key_bytes)
|
||||
while index < len(body) and body[index] in b" \t\r\n":
|
||||
index += 1
|
||||
if index >= len(body):
|
||||
return None
|
||||
if body[index:index + 1] == b"(":
|
||||
value, _ = _read_literal_string(body, index)
|
||||
return _clean_text(value)
|
||||
if body[index:index + 1] == b"<" and body[index:index + 2] != b"<<":
|
||||
end = body.find(b">", index + 1)
|
||||
if end > index:
|
||||
return _clean_text(_decode_hex(body[index + 1:end]))
|
||||
return None
|
||||
|
||||
|
||||
def _extract_stream_text(stream: bytes) -> str:
    """Extract visible text from a decoded PDF content stream.

    The stream is tokenized and the text-showing operators are replayed:
    ``Tj``, ``'`` and ``"`` emit the preceding string operand, ``TJ`` emits
    all strings of the preceding array, and ``T*``, ``TD``, ``'``, ``"`` and a
    ``Td`` with negative vertical offset start a new output line.
    """
    tokens = _pdf_tokens(stream)
    lines = [""]
    array_start: int | None = None

    for position, token in enumerate(tokens):
        if not isinstance(token, str):
            # String operands are consumed by the operator that follows them.
            continue
        if token == "[":
            array_start = position
        elif token == "Tj":
            _append_text(lines, _previous_text(tokens, position))
        elif token == "TJ":
            if array_start is not None:
                pieces = [
                    entry[1]
                    for entry in tokens[array_start:position]
                    if isinstance(entry, tuple) and entry[0] == "text"
                ]
                _append_text(lines, "".join(pieces))
            array_start = None
        elif token in ("'", '"'):
            # Both operators move to the next line before showing the string.
            _new_line(lines)
            _append_text(lines, _previous_text(tokens, position))
        elif token in ("T*", "TD"):
            _new_line(lines)
        elif token == "Td" and _td_moves_to_new_line(tokens, position):
            _new_line(lines)

    trimmed = [line.rstrip() for line in lines]
    return "\n".join(trimmed).strip()
|
||||
|
||||
|
||||
def _pdf_tokens(stream: bytes) -> list[str | tuple[str, str]]:
|
||||
tokens: list[str | tuple[str, str]] = []
|
||||
index = 0
|
||||
while index < len(stream):
|
||||
char = stream[index:index + 1]
|
||||
if char in b" \t\r\n\f\x00":
|
||||
index += 1
|
||||
continue
|
||||
if char == b"%":
|
||||
newline = stream.find(b"\n", index)
|
||||
index = len(stream) if newline < 0 else newline + 1
|
||||
continue
|
||||
if char == b"(":
|
||||
text, index = _read_literal_string(stream, index)
|
||||
tokens.append(("text", text))
|
||||
continue
|
||||
if char == b"<" and stream[index:index + 2] != b"<<":
|
||||
end = stream.find(b">", index + 1)
|
||||
if end < 0:
|
||||
break
|
||||
tokens.append(("text", _decode_hex(stream[index + 1:end])))
|
||||
index = end + 1
|
||||
continue
|
||||
if char in b"[]":
|
||||
tokens.append(char.decode("ascii"))
|
||||
index += 1
|
||||
continue
|
||||
end = index
|
||||
while end < len(stream) and stream[end:end + 1] not in b" \t\r\n\f\x00[]()<>/%":
|
||||
end += 1
|
||||
if end == index:
|
||||
index += 1
|
||||
continue
|
||||
tokens.append(stream[index:end].decode("latin-1", errors="ignore"))
|
||||
index = end
|
||||
return tokens
|
||||
|
||||
|
||||
def _read_literal_string(data: bytes, start: int) -> tuple[str, int]:
|
||||
depth = 1
|
||||
index = start + 1
|
||||
output = bytearray()
|
||||
while index < len(data) and depth > 0:
|
||||
char = data[index]
|
||||
if char == 0x5C:
|
||||
escaped, index = _read_escape(data, index + 1)
|
||||
output.extend(escaped)
|
||||
continue
|
||||
if char == 0x28:
|
||||
depth += 1
|
||||
output.append(char)
|
||||
elif char == 0x29:
|
||||
depth -= 1
|
||||
if depth:
|
||||
output.append(char)
|
||||
else:
|
||||
output.append(char)
|
||||
index += 1
|
||||
return output.decode("utf-8", errors="replace"), index
|
||||
|
||||
|
||||
def _read_escape(data: bytes, index: int) -> tuple[bytes, int]:
|
||||
if index >= len(data):
|
||||
return b"\\", index
|
||||
char = data[index]
|
||||
escapes = {
|
||||
ord("n"): b"\n",
|
||||
ord("r"): b"\r",
|
||||
ord("t"): b"\t",
|
||||
ord("b"): b"\b",
|
||||
ord("f"): b"\f",
|
||||
ord("("): b"(",
|
||||
ord(")"): b")",
|
||||
ord("\\"): b"\\",
|
||||
}
|
||||
if char in escapes:
|
||||
return escapes[char], index + 1
|
||||
if char in b"\r\n":
|
||||
if char == ord("\r") and index + 1 < len(data) and data[index + 1] == ord("\n"):
|
||||
return b"", index + 2
|
||||
return b"", index + 1
|
||||
if 0x30 <= char <= 0x37:
|
||||
end = index + 1
|
||||
while end < min(index + 3, len(data)) and 0x30 <= data[end] <= 0x37:
|
||||
end += 1
|
||||
return bytes([int(data[index:end], 8)]), end
|
||||
return bytes([char]), index + 1
|
||||
|
||||
|
||||
def _decode_hex(value: bytes) -> str:
|
||||
cleaned = re.sub(rb"\s+", b"", value)
|
||||
if len(cleaned) % 2:
|
||||
cleaned += b"0"
|
||||
try:
|
||||
return bytes.fromhex(cleaned.decode("ascii")).decode("utf-8", errors="replace")
|
||||
except ValueError:
|
||||
return ""
|
||||
|
||||
|
||||
def _previous_text(tokens: list[str | tuple[str, str]], index: int) -> str:
|
||||
cursor = index - 1
|
||||
while cursor >= 0:
|
||||
token = tokens[cursor]
|
||||
if isinstance(token, tuple) and token[0] == "text":
|
||||
return token[1]
|
||||
if token in {"Tj", "TJ", "'", '"', "T*", "Td", "TD"}:
|
||||
break
|
||||
cursor -= 1
|
||||
return ""
|
||||
|
||||
|
||||
def _td_moves_to_new_line(tokens: list[str | tuple[str, str]], index: int) -> bool:
|
||||
if index < 2:
|
||||
return False
|
||||
y = tokens[index - 1]
|
||||
try:
|
||||
return isinstance(y, str) and float(y) < 0
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
|
||||
def _append_text(lines: list[str], text: str) -> None:
|
||||
if text:
|
||||
lines[-1] += text
|
||||
|
||||
|
||||
def _new_line(lines: list[str]) -> None:
|
||||
if lines[-1]:
|
||||
lines.append("")
|
||||
|
||||
|
||||
def _select_pages(
|
||||
pages: list[PdfPage],
|
||||
page_range: Any,
|
||||
asset: SourceAsset,
|
||||
) -> tuple[list[PdfPage], list[Diagnostic]]:
|
||||
if page_range in (None, ""):
|
||||
return pages, []
|
||||
selected_numbers, diagnostics = _parse_page_range(page_range, len(pages), asset)
|
||||
selected = [page for page in pages if page.number in selected_numbers]
|
||||
return selected, diagnostics
|
||||
|
||||
|
||||
def _parse_page_range(
|
||||
value: Any,
|
||||
page_count: int,
|
||||
asset: SourceAsset,
|
||||
) -> tuple[set[int], list[Diagnostic]]:
|
||||
diagnostics: list[Diagnostic] = []
|
||||
selected: set[int] = set()
|
||||
if isinstance(value, int):
|
||||
value = str(value)
|
||||
if isinstance(value, (list, tuple)):
|
||||
value = ",".join(str(item) for item in value)
|
||||
if not isinstance(value, str):
|
||||
return set(), [
|
||||
_pdf_error(
|
||||
asset,
|
||||
"source.pdf.invalid_page_range",
|
||||
"PDF page_range option must be a string, integer, or list of integers.",
|
||||
)
|
||||
]
|
||||
for part in value.split(","):
|
||||
part = part.strip()
|
||||
if not part:
|
||||
continue
|
||||
if "-" in part:
|
||||
start_text, end_text = part.split("-", 1)
|
||||
if not start_text.isdigit() or not end_text.isdigit():
|
||||
diagnostics.append(_invalid_page_range(asset, value))
|
||||
continue
|
||||
start, end = int(start_text), int(end_text)
|
||||
if start > end:
|
||||
diagnostics.append(_invalid_page_range(asset, value))
|
||||
continue
|
||||
selected.update(range(start, end + 1))
|
||||
elif part.isdigit():
|
||||
selected.add(int(part))
|
||||
else:
|
||||
diagnostics.append(_invalid_page_range(asset, value))
|
||||
out_of_range = sorted(page for page in selected if page < 1 or page > page_count)
|
||||
if out_of_range:
|
||||
diagnostics.append(
|
||||
_pdf_error(
|
||||
asset,
|
||||
"source.pdf.page_range_out_of_bounds",
|
||||
"PDF page_range selects pages outside the document.",
|
||||
details={"page_range": value, "page_count": page_count, "pages": out_of_range},
|
||||
)
|
||||
)
|
||||
selected = {page for page in selected if 1 <= page <= page_count}
|
||||
if not selected and not diagnostics:
|
||||
diagnostics.append(_invalid_page_range(asset, value))
|
||||
return selected, diagnostics
|
||||
|
||||
|
||||
def _invalid_page_range(asset: SourceAsset, value: str) -> Diagnostic:
    """Build the error diagnostic for a malformed ``page_range`` option."""
    offending = {"page_range": value}
    return _pdf_error(
        asset,
        "source.pdf.invalid_page_range",
        "PDF page_range option is invalid.",
        details=offending,
    )
|
||||
|
||||
|
||||
def _page_may_be_image_only(
|
||||
page_body: bytes,
|
||||
objects: dict[int, bytes],
|
||||
content_ids: list[int],
|
||||
) -> bool:
|
||||
haystack = page_body + b"\n" + b"\n".join(objects.get(ref, b"") for ref in content_ids)
|
||||
return bool(re.search(rb"/Subtype\s*/Image\b|\bDo\b", haystack))
|
||||
|
||||
|
||||
def _normalize_markdown(text: str, normalize_whitespace: bool) -> str:
|
||||
lines = text.replace("\r\n", "\n").replace("\r", "\n").split("\n")
|
||||
if normalize_whitespace:
|
||||
lines = [re.sub(r"[ \t]+", " ", line).strip() for line in lines]
|
||||
else:
|
||||
lines = [line.rstrip() for line in lines]
|
||||
cleaned: list[str] = []
|
||||
blank = False
|
||||
for line in lines:
|
||||
if not line:
|
||||
if not blank and cleaned:
|
||||
cleaned.append("")
|
||||
blank = True
|
||||
continue
|
||||
cleaned.append(line)
|
||||
blank = False
|
||||
return "\n".join(cleaned).strip()
|
||||
|
||||
|
||||
def _document_id(asset: SourceAsset, metadata: SourceMetadata) -> str:
|
||||
title = metadata.title
|
||||
if title:
|
||||
slug = re.sub(r"[^a-z0-9._-]+", "-", title.lower()).strip("-")
|
||||
if slug:
|
||||
return f"source.pdf:{slug}"
|
||||
if asset.digest:
|
||||
return f"source.pdf:{asset.digest.removeprefix('sha256:')}"
|
||||
return f"source.pdf:{asset.name or asset.uri}"
|
||||
|
||||
|
||||
def _adapter_info(options: dict[str, Any]) -> dict[str, Any]:
|
||||
return {
|
||||
"id": "source.pdf",
|
||||
"version": "1",
|
||||
"options": options,
|
||||
}
|
||||
|
||||
|
||||
def _confidence(package: PdfPackage, diagnostics: list[Diagnostic]) -> float:
    """Estimate extraction confidence from the share of pages with text.

    Returns 0.0 when there is an error diagnostic or no pages at all;
    otherwise 0.75 times the text-page ratio, with a floor of 0.1.
    """
    if has_error(diagnostics) or not package.pages:
        return 0.0
    pages_with_text = [page for page in package.pages if page.text.strip()]
    coverage = len(pages_with_text) / len(package.pages)
    return max(0.1, 0.75 * coverage)
|
||||
|
||||
|
||||
def _warning(
    asset: SourceAsset,
    code: str,
    message: str,
    *,
    details: dict[str, Any] | None = None,
) -> Diagnostic:
    """Create a warning diagnostic, located at the asset path when it has one."""
    location = SourceLocation(path=asset.path) if asset.path else None
    return Diagnostic(
        severity="warning",
        code=code,
        message=message,
        source=location,
        details=details or {},
    )
|
||||
|
||||
|
||||
def _pdf_error(
    asset: SourceAsset,
    code: str,
    message: str,
    *,
    details: dict[str, Any] | None = None,
) -> Diagnostic:
    """Create an error diagnostic, located at the asset path when it has one."""
    location = SourceLocation(path=asset.path) if asset.path else None
    return Diagnostic(
        severity="error",
        code=code,
        message=message,
        source=location,
        details=details or {},
    )
|
||||
|
||||
|
||||
def _malformed(
    asset: SourceAsset,
    message: str,
    details: dict[str, Any] | None = None,
) -> Diagnostic:
    """Create a ``source.malformed`` error diagnostic for a broken PDF."""
    code = "source.malformed"
    return _pdf_error(asset, code, message, details=details)
|
||||
|
||||
|
||||
def _warning_count(diagnostics: list[Diagnostic]) -> int:
|
||||
return sum(1 for diagnostic in diagnostics if diagnostic.severity == "warning")
|
||||
|
||||
|
||||
def _clean_text(text: str) -> str:
|
||||
cleaned = re.sub(r"\s+", " ", text).strip()
|
||||
return re.sub(r"\s+([.,;:!?])", r"\1", cleaned)
|
||||
Reference in New Issue
Block a user