default source-location identity and opt-in content-digest identity for file move/rename reconciliation, PDF/DOCX-style placeholder ingestion

This commit is contained in:
2026-05-06 13:04:36 +02:00
parent 48dffedc09
commit a4a4759ac4
13 changed files with 724 additions and 39 deletions

View File

@@ -1,5 +1,7 @@
"""Built-in baseline format extractors."""
from .datasets import CsvDatasetExtractor
from .documents import DocumentPlaceholderExtractor
from .text import PlainTextExtractor
# NOTE(review): __all__ is assigned twice in a row; the second assignment
# overwrites the first, so only the three-name list below takes effect.
# Presumably the single-name line is the pre-change version — confirm.
__all__ = ["PlainTextExtractor"]
__all__ = ["CsvDatasetExtractor", "DocumentPlaceholderExtractor", "PlainTextExtractor"]

View File

@@ -0,0 +1,79 @@
"""Structured dataset baseline extractors."""
from __future__ import annotations
import csv
import io
from typing import Any
from kontextual_engine.core import ExtractionResult, ExtractorCapability, NormalizedDocument, SourcePayload
class CsvDatasetExtractor:
    """Baseline extractor that parses a CSV/TSV payload into one table.

    The first row is used as the header (``csv.DictReader`` default); every
    following row becomes a ``dict`` keyed by column name.
    """

    name = "csv-dataset"
    media_types = ("text/csv", "application/csv", "text/tab-separated-values")

    def capabilities(self) -> ExtractorCapability:
        """Return the capability descriptor advertised for this extractor."""
        return ExtractorCapability(
            extractor_name=self.name,
            media_types=self.media_types,
            extraction_depth="structure",
            produces_structure=True,
            metadata={"formats": ["csv", "tsv"]},
        )

    def supports(self, media_type: str) -> bool:
        """Return True when *media_type* is one this extractor handles.

        Besides the exact entries in ``media_types``, any value starting with
        ``"text/csv"`` is accepted (e.g. ``"text/csv; charset=utf-8"``).
        """
        return media_type in self.media_types or media_type.startswith("text/csv")

    def extract(self, payload: SourcePayload) -> ExtractionResult:
        """Parse *payload* as delimited text and return the extraction result.

        The result carries the raw text, a single table (columns, rows,
        row count) and summary metadata about the dataset.
        """
        # "utf-8-sig" is handed to payload.read_text as-is — presumably the
        # decoding codec (which would drop a leading BOM); confirm against
        # SourcePayload.
        text = payload.read_text("utf-8-sig")
        delimiter = _delimiter_for(payload)

        # The csv module expects its input opened with newline="" so it can do
        # its own line-ending handling. StringIO's default (newline="\n")
        # splits only on "\n", which makes input using bare "\r" line endings
        # fail with csv.Error.
        reader = csv.DictReader(io.StringIO(text, newline=""), delimiter=delimiter)
        columns = list(reader.fieldnames or [])  # fieldnames is None for empty input
        rows = [dict(row) for row in reader]
        row_count = len(rows)
        dataset_format = "tsv" if delimiter == "\t" else "csv"

        table = {
            "name": payload.title,
            "columns": columns,
            "rows": rows,
            "row_count": row_count,
        }
        metadata: dict[str, Any] = {
            "extractor": self.name,
            "dataset_format": dataset_format,
            "columns": columns,
            "column_count": len(columns),
            "row_count": row_count,
            "table_count": 1,
            "source_digest": payload.content_digest,
            "source_size_bytes": payload.size_bytes,
        }
        normalized = NormalizedDocument(
            title=payload.title,
            text=text,
            structure={
                "kind": "dataset",
                "format": dataset_format,
                "columns": columns,
                "row_count": row_count,
            },
            tables=[table],
            fields={
                "columns": columns,
                "column_count": len(columns),
                "row_count": row_count,
                "dataset_format": dataset_format,
            },
            confidence=0.95,
            extractor_metadata={
                "extractor": self.name,
                "source_media_type": payload.media_type,
            },
        )
        return ExtractionResult(normalized=normalized, metadata=metadata)
def _delimiter_for(payload: SourcePayload) -> str:
filename = str(payload.metadata.get("filename", "")).lower()
if payload.media_type == "text/tab-separated-values" or filename.endswith(".tsv"):
return "\t"
return ","

View File

@@ -0,0 +1,89 @@
"""Metadata-only document placeholder extractors."""
from __future__ import annotations
from kontextual_engine.core import (
ExtractionResult,
ExtractorCapability,
IngestionFailure,
NormalizedDocument,
SourcePayload,
)
class DocumentPlaceholderExtractor:
    """Metadata-only stand-in for binary document formats.

    Used until optional deep extractors exist: it yields a document with
    empty text, records what could not be extracted, and attaches a
    non-retriable diagnostic.
    """

    name = "document-placeholder"
    media_types = (
        "application/pdf",
        "application/msword",
        "application/rtf",
        "application/vnd.ms-excel",
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )

    def capabilities(self) -> ExtractorCapability:
        """Return the capability descriptor, flagged as a placeholder."""
        placeholder_flags = {
            "placeholder": True,
            "requires_optional_deep_extractor": True,
        }
        return ExtractorCapability(
            extractor_name=self.name,
            media_types=self.media_types,
            extraction_depth="metadata_only",
            produces_structure=False,
            metadata=placeholder_flags,
        )

    def supports(self, media_type: str) -> bool:
        """Return True only for an exact match against ``media_types``."""
        return any(media_type == known for known in self.media_types)

    def extract(self, payload: SourcePayload) -> ExtractionResult:
        """Build the metadata-only result and its diagnostic for *payload*."""
        source_type = payload.media_type
        if source_type == "application/pdf":
            kind = "pdf"
        else:
            kind = "office_document"

        # One description of the gap, referenced from both the result
        # metadata and the normalized document.
        gap = {
            "kind": kind,
            "media_type": source_type,
            "reason": "deep_extraction_not_available",
        }

        failure_details = {
            "extractor": self.name,
            "media_type": source_type,
            "supported_depth": "metadata_only",
        }
        notice = IngestionFailure(
            code="extraction.depth_unsupported",
            message="Deep extraction for this document format requires an optional adapter",
            retriable=False,
            details=failure_details,
        )

        result_metadata = {
            "extractor": self.name,
            "document_kind": kind,
            "extraction_depth": "metadata_only",
            "unsupported_elements": [gap],
            "source_digest": payload.content_digest,
            "source_size_bytes": payload.size_bytes,
        }

        document = NormalizedDocument(
            title=payload.title,
            text="",
            structure={"kind": kind, "extraction_depth": "metadata_only"},
            fields={
                "document_kind": kind,
                "source_media_type": source_type,
                "source_size_bytes": payload.size_bytes,
            },
            confidence=0.0,
            unsupported_elements=[gap],
            extractor_metadata={
                "extractor": self.name,
                "source_media_type": source_type,
                "extraction_depth": "metadata_only",
            },
        )
        return ExtractionResult(
            normalized=document,
            metadata=result_metadata,
            diagnostics=(notice,),
        )

View File

@@ -65,6 +65,24 @@ def _guess_media_type(path: Path) -> str:
return "text/markdown"
if suffix in {".txt", ".text", ".log"}:
return "text/plain"
if suffix == ".csv":
return "text/csv"
if suffix == ".tsv":
return "text/tab-separated-values"
if suffix == ".pdf":
return "application/pdf"
if suffix == ".doc":
return "application/msword"
if suffix == ".docx":
return "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
if suffix == ".xls":
return "application/vnd.ms-excel"
if suffix == ".xlsx":
return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
if suffix == ".ppt":
return "application/vnd.ms-powerpoint"
if suffix == ".pptx":
return "application/vnd.openxmlformats-officedocument.presentationml.presentation"
guessed, _ = mimetypes.guess_type(path.name)
return guessed or "application/octet-stream"