generated from coulomb/repo-seed
default source-location identity and opt-in content-digest identity for file move/rename reconciliation, PDF/DOCX-style placeholder ingestion
This commit is contained in:
@@ -1,5 +1,7 @@
|
||||
"""Built-in baseline format extractors."""
|
||||
|
||||
from .datasets import CsvDatasetExtractor
|
||||
from .documents import DocumentPlaceholderExtractor
|
||||
from .text import PlainTextExtractor
|
||||
|
||||
__all__ = ["PlainTextExtractor"]
|
||||
__all__ = ["CsvDatasetExtractor", "DocumentPlaceholderExtractor", "PlainTextExtractor"]
|
||||
|
||||
@@ -0,0 +1,79 @@
|
||||
"""Structured dataset baseline extractors."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import io
|
||||
from typing import Any
|
||||
|
||||
from kontextual_engine.core import ExtractionResult, ExtractorCapability, NormalizedDocument, SourcePayload
|
||||
|
||||
|
||||
class CsvDatasetExtractor:
    """Baseline extractor that turns a CSV/TSV payload into a single table.

    The payload text is parsed with :class:`csv.DictReader`; the first record
    supplies the column names and every following record becomes one row
    dict. The raw text is kept on the normalized document alongside the table.
    """

    name = "csv-dataset"
    media_types = ("text/csv", "application/csv", "text/tab-separated-values")

    def capabilities(self) -> ExtractorCapability:
        """Describe this extractor: structure-depth output for CSV and TSV."""
        return ExtractorCapability(
            extractor_name=self.name,
            media_types=self.media_types,
            extraction_depth="structure",
            produces_structure=True,
            metadata={"formats": ["csv", "tsv"]},
        )

    def supports(self, media_type: str) -> bool:
        """Return True for a listed media type or any value starting with ``text/csv``.

        The prefix test lets parameterised values such as
        ``text/csv; charset=utf-8`` through.
        """
        return media_type in self.media_types or media_type.startswith("text/csv")

    def extract(self, payload: SourcePayload) -> ExtractionResult:
        """Parse *payload* as delimited text and return the normalized dataset.

        Args:
            payload: Source to read. Its text is requested with the
                ``utf-8-sig`` encoding (presumably so a leading BOM does not
                end up in the first column name -- confirm against
                ``SourcePayload.read_text``).

        Returns:
            An ``ExtractionResult`` whose normalized document holds the raw
            text, one table (columns, rows, row count) and summary fields.

        Raises:
            csv.Error: If the csv module cannot parse the text.
        """
        text = payload.read_text("utf-8-sig")
        delimiter = _delimiter_for(payload)

        # newline="" hands line endings to the csv parser untranslated while
        # still splitting on "\r", "\n" and "\r\n". With StringIO's default
        # (newline="\n") a file that uses bare "\r" line endings arrives as one
        # physical line and the csv module rejects it.
        buffer = io.StringIO(text, newline="")
        reader = csv.DictReader(buffer, delimiter=delimiter)
        columns = list(reader.fieldnames or [])
        # NOTE: DictReader pads short records with None values and stores
        # surplus cells as a list under the key None (its default restkey).
        rows = [dict(row) for row in reader]

        row_count = len(rows)
        column_count = len(columns)
        dataset_format = "tsv" if delimiter == "\t" else "csv"

        table = {
            "name": payload.title,
            "columns": columns,
            "rows": rows,
            "row_count": row_count,
        }
        metadata: dict[str, Any] = {
            "extractor": self.name,
            "dataset_format": dataset_format,
            "columns": columns,
            "column_count": column_count,
            "row_count": row_count,
            "table_count": 1,
            "source_digest": payload.content_digest,
            "source_size_bytes": payload.size_bytes,
        }
        normalized = NormalizedDocument(
            title=payload.title,
            text=text,
            structure={
                "kind": "dataset",
                "format": dataset_format,
                "columns": columns,
                "row_count": row_count,
            },
            tables=[table],
            fields={
                "columns": columns,
                "column_count": column_count,
                "row_count": row_count,
                "dataset_format": dataset_format,
            },
            confidence=0.95,
            extractor_metadata={
                "extractor": self.name,
                "source_media_type": payload.media_type,
            },
        )
        return ExtractionResult(normalized=normalized, metadata=metadata)
|
||||
|
||||
|
||||
def _delimiter_for(payload: SourcePayload) -> str:
|
||||
filename = str(payload.metadata.get("filename", "")).lower()
|
||||
if payload.media_type == "text/tab-separated-values" or filename.endswith(".tsv"):
|
||||
return "\t"
|
||||
return ","
|
||||
@@ -0,0 +1,89 @@
|
||||
"""Metadata-only document placeholder extractors."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from kontextual_engine.core import (
|
||||
ExtractionResult,
|
||||
ExtractorCapability,
|
||||
IngestionFailure,
|
||||
NormalizedDocument,
|
||||
SourcePayload,
|
||||
)
|
||||
|
||||
|
||||
class DocumentPlaceholderExtractor:
    """Stand-in for binary document formats that have no deep extractor yet.

    Extraction yields an empty-text, metadata-only document together with a
    non-retriable diagnostic stating that an optional adapter is required.
    """

    name = "document-placeholder"
    media_types = (
        "application/pdf",
        "application/msword",
        "application/rtf",
        "application/vnd.ms-excel",
        "application/vnd.ms-powerpoint",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )

    def capabilities(self) -> ExtractorCapability:
        """Advertise metadata-only depth and flag this extractor as a placeholder."""
        placeholder_flags = {
            "placeholder": True,
            "requires_optional_deep_extractor": True,
        }
        return ExtractorCapability(
            extractor_name=self.name,
            media_types=self.media_types,
            extraction_depth="metadata_only",
            produces_structure=False,
            metadata=placeholder_flags,
        )

    def supports(self, media_type: str) -> bool:
        """Return True only when *media_type* is exactly one of ``media_types``."""
        is_listed = media_type in self.media_types
        return is_listed

    def extract(self, payload: SourcePayload) -> ExtractionResult:
        """Build the metadata-only result for *payload* without reading its body.

        The document kind is ``"pdf"`` for ``application/pdf`` and
        ``"office_document"`` for every other media type. The same
        unsupported-element record is attached to both the result metadata
        and the normalized document.
        """
        if payload.media_type == "application/pdf":
            kind = "pdf"
        else:
            kind = "office_document"

        # One record describing what could not be extracted; shared by the
        # metadata and the normalized document below.
        gap = {
            "kind": kind,
            "media_type": payload.media_type,
            "reason": "deep_extraction_not_available",
        }

        failure_details = {
            "extractor": self.name,
            "media_type": payload.media_type,
            "supported_depth": "metadata_only",
        }
        notice = IngestionFailure(
            code="extraction.depth_unsupported",
            message="Deep extraction for this document format requires an optional adapter",
            retriable=False,
            details=failure_details,
        )

        result_metadata = {
            "extractor": self.name,
            "document_kind": kind,
            "extraction_depth": "metadata_only",
            "unsupported_elements": [gap],
            "source_digest": payload.content_digest,
            "source_size_bytes": payload.size_bytes,
        }

        document_structure = {
            "kind": kind,
            "extraction_depth": "metadata_only",
        }
        document_fields = {
            "document_kind": kind,
            "source_media_type": payload.media_type,
            "source_size_bytes": payload.size_bytes,
        }
        provenance = {
            "extractor": self.name,
            "source_media_type": payload.media_type,
            "extraction_depth": "metadata_only",
        }
        document = NormalizedDocument(
            title=payload.title,
            text="",
            structure=document_structure,
            fields=document_fields,
            confidence=0.0,
            unsupported_elements=[gap],
            extractor_metadata=provenance,
        )

        return ExtractionResult(
            normalized=document,
            metadata=result_metadata,
            diagnostics=(notice,),
        )
|
||||
@@ -65,6 +65,24 @@ def _guess_media_type(path: Path) -> str:
|
||||
return "text/markdown"
|
||||
if suffix in {".txt", ".text", ".log"}:
|
||||
return "text/plain"
|
||||
if suffix == ".csv":
|
||||
return "text/csv"
|
||||
if suffix == ".tsv":
|
||||
return "text/tab-separated-values"
|
||||
if suffix == ".pdf":
|
||||
return "application/pdf"
|
||||
if suffix == ".doc":
|
||||
return "application/msword"
|
||||
if suffix == ".docx":
|
||||
return "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
if suffix == ".xls":
|
||||
return "application/vnd.ms-excel"
|
||||
if suffix == ".xlsx":
|
||||
return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||
if suffix == ".ppt":
|
||||
return "application/vnd.ms-powerpoint"
|
||||
if suffix == ".pptx":
|
||||
return "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
||||
guessed, _ = mimetypes.guess_type(path.name)
|
||||
return guessed or "application/octet-stream"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user