default source-location identity and opt-in content-digest identity for file move/rename reconciliation, PDF/DOCX-style placeholder ingestion

This commit is contained in:
2026-05-06 13:04:36 +02:00
parent 48dffedc09
commit a4a4759ac4
13 changed files with 724 additions and 39 deletions

View File

@@ -32,6 +32,7 @@ from .core import (
IdempotencyRecord,
IdempotencyStatus,
IngestionFailure,
IngestionIdentityPolicy,
IngestionJob,
IngestionJobStatus,
KnowledgeAsset,
@@ -137,6 +138,7 @@ __all__ = [
"IngestionResult",
"IngestionService",
"IngestionFailure",
"IngestionIdentityPolicy",
"IngestionJob",
"IngestionJobStatus",
"InputBundle",

View File

@@ -1,5 +1,7 @@
"""Built-in baseline format extractors."""
from .datasets import CsvDatasetExtractor
from .documents import DocumentPlaceholderExtractor
from .text import PlainTextExtractor
__all__ = ["PlainTextExtractor"]
__all__ = ["CsvDatasetExtractor", "DocumentPlaceholderExtractor", "PlainTextExtractor"]

View File

@@ -0,0 +1,79 @@
"""Structured dataset baseline extractors."""
from __future__ import annotations
import csv
import io
from typing import Any
from kontextual_engine.core import ExtractionResult, ExtractorCapability, NormalizedDocument, SourcePayload
class CsvDatasetExtractor:
name = "csv-dataset"
media_types = ("text/csv", "application/csv", "text/tab-separated-values")
def capabilities(self) -> ExtractorCapability:
return ExtractorCapability(
extractor_name=self.name,
media_types=self.media_types,
extraction_depth="structure",
produces_structure=True,
metadata={"formats": ["csv", "tsv"]},
)
def supports(self, media_type: str) -> bool:
return media_type in self.media_types or media_type.startswith("text/csv")
def extract(self, payload: SourcePayload) -> ExtractionResult:
text = payload.read_text("utf-8-sig")
delimiter = _delimiter_for(payload)
reader = csv.DictReader(io.StringIO(text), delimiter=delimiter)
columns = list(reader.fieldnames or [])
rows = [dict(row) for row in reader]
table = {
"name": payload.title,
"columns": columns,
"rows": rows,
"row_count": len(rows),
}
metadata: dict[str, Any] = {
"extractor": self.name,
"dataset_format": "tsv" if delimiter == "\t" else "csv",
"columns": columns,
"column_count": len(columns),
"row_count": len(rows),
"table_count": 1,
"source_digest": payload.content_digest,
"source_size_bytes": payload.size_bytes,
}
normalized = NormalizedDocument(
title=payload.title,
text=text,
structure={
"kind": "dataset",
"format": metadata["dataset_format"],
"columns": columns,
"row_count": len(rows),
},
tables=[table],
fields={
"columns": columns,
"column_count": len(columns),
"row_count": len(rows),
"dataset_format": metadata["dataset_format"],
},
confidence=0.95,
extractor_metadata={
"extractor": self.name,
"source_media_type": payload.media_type,
},
)
return ExtractionResult(normalized=normalized, metadata=metadata)
def _delimiter_for(payload: SourcePayload) -> str:
filename = str(payload.metadata.get("filename", "")).lower()
if payload.media_type == "text/tab-separated-values" or filename.endswith(".tsv"):
return "\t"
return ","

View File

@@ -0,0 +1,89 @@
"""Metadata-only document placeholder extractors."""
from __future__ import annotations
from kontextual_engine.core import (
ExtractionResult,
ExtractorCapability,
IngestionFailure,
NormalizedDocument,
SourcePayload,
)
class DocumentPlaceholderExtractor:
"""Represent binary document formats until optional deep extractors exist."""
name = "document-placeholder"
media_types = (
"application/pdf",
"application/msword",
"application/rtf",
"application/vnd.ms-excel",
"application/vnd.ms-powerpoint",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
)
def capabilities(self) -> ExtractorCapability:
return ExtractorCapability(
extractor_name=self.name,
media_types=self.media_types,
extraction_depth="metadata_only",
produces_structure=False,
metadata={
"placeholder": True,
"requires_optional_deep_extractor": True,
},
)
def supports(self, media_type: str) -> bool:
return media_type in self.media_types
def extract(self, payload: SourcePayload) -> ExtractionResult:
document_kind = "pdf" if payload.media_type == "application/pdf" else "office_document"
unsupported = {
"kind": document_kind,
"media_type": payload.media_type,
"reason": "deep_extraction_not_available",
}
diagnostic = IngestionFailure(
code="extraction.depth_unsupported",
message="Deep extraction for this document format requires an optional adapter",
retriable=False,
details={
"extractor": self.name,
"media_type": payload.media_type,
"supported_depth": "metadata_only",
},
)
metadata = {
"extractor": self.name,
"document_kind": document_kind,
"extraction_depth": "metadata_only",
"unsupported_elements": [unsupported],
"source_digest": payload.content_digest,
"source_size_bytes": payload.size_bytes,
}
normalized = NormalizedDocument(
title=payload.title,
text="",
structure={
"kind": document_kind,
"extraction_depth": "metadata_only",
},
fields={
"document_kind": document_kind,
"source_media_type": payload.media_type,
"source_size_bytes": payload.size_bytes,
},
confidence=0.0,
unsupported_elements=[unsupported],
extractor_metadata={
"extractor": self.name,
"source_media_type": payload.media_type,
"extraction_depth": "metadata_only",
},
)
return ExtractionResult(normalized=normalized, metadata=metadata, diagnostics=(diagnostic,))

View File

@@ -65,6 +65,24 @@ def _guess_media_type(path: Path) -> str:
return "text/markdown"
if suffix in {".txt", ".text", ".log"}:
return "text/plain"
if suffix == ".csv":
return "text/csv"
if suffix == ".tsv":
return "text/tab-separated-values"
if suffix == ".pdf":
return "application/pdf"
if suffix == ".doc":
return "application/msword"
if suffix == ".docx":
return "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
if suffix == ".xls":
return "application/vnd.ms-excel"
if suffix == ".xlsx":
return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
if suffix == ".ppt":
return "application/vnd.ms-powerpoint"
if suffix == ".pptx":
return "application/vnd.openxmlformats-officedocument.presentationml.presentation"
guessed, _ = mimetypes.guess_type(path.name)
return guessed or "application/octet-stream"

View File

@@ -9,6 +9,7 @@ from .ingestion import (
ExtractionResult,
ExtractorCapability,
IngestionFailure,
IngestionIdentityPolicy,
IngestionJob,
IngestionJobStatus,
NormalizedDocument,
@@ -58,6 +59,7 @@ __all__ = [
"IdempotencyRecord",
"IdempotencyStatus",
"IngestionFailure",
"IngestionIdentityPolicy",
"IngestionJob",
"IngestionJobStatus",
"KnowledgeAsset",

View File

@@ -21,6 +21,11 @@ class IngestionJobStatus(str, Enum):
CANCELED = "canceled"
class IngestionIdentityPolicy(str, Enum):
SOURCE_LOCATION = "source_location"
CONTENT_DIGEST = "content_digest"
@dataclass(frozen=True)
class IngestionFailure:
code: str

View File

@@ -345,6 +345,79 @@ class AssetRegistryService:
def list_metadata_schema_assignments(self) -> list[MetadataSchemaAssignment]:
return self.repository.list_metadata_schema_assignments()
def record_ingestion_update(
self,
asset_id: str,
source_ref: SourceReference,
representations: list[AssetRepresentation] | tuple[AssetRepresentation, ...],
metadata_records: list[MetadataRecord] | tuple[MetadataRecord, ...],
context: OperationContext,
*,
expected_current_version_id: str | None = None,
) -> AssetChangeResult:
asset = self.repository.get_asset(asset_id)
self._assert_expected_current_version(
asset,
expected_current_version_id,
operation="asset.ingest.update",
)
decision = self._authorize(
context,
"asset.ingest.update",
f"asset:{asset.id}",
resource_metadata={
"source_system": source_ref.source_system,
"source_path": source_ref.path or "",
"checksum": source_ref.checksum or "",
"representation_count": str(len(representations)),
"metadata_record_count": str(len(metadata_records)),
},
)
self._validate_metadata_records(
asset.classification,
self.repository.list_metadata_records(asset.id) + list(metadata_records),
)
updated = asset
if not _has_source_reference(updated, source_ref):
updated = updated.with_source_reference(source_ref)
alias = _source_alias(source_ref)
if alias:
updated = updated.with_alias(alias)
representation_ids: list[str] = []
for representation in representations:
if representation.asset_id != asset.id:
representation = replace(representation, asset_id=asset.id)
self.repository.save_representation(representation)
representation_ids.append(representation.representation_id)
for record in metadata_records:
self.repository.save_metadata_record(asset.id, record)
version = AssetVersion(
asset_id=asset.id,
sequence=self._next_sequence(asset.id),
change_type=VersionChangeType.CONTENT_CHANGED,
representation_ids=tuple(representation_ids),
actor_id=context.actor.id,
parent_version_id=asset.current_version_id,
metadata_delta={record.key: record.value for record in metadata_records},
lifecycle=updated.lifecycle.value,
)
updated = updated.with_current_version(version.version_id)
self.repository.save_asset(updated)
self.repository.save_version(version)
event = self._audit(
"asset.ingest.update",
f"asset:{asset.id}",
AuditOutcome.SUCCESS,
context,
decision,
details={
"source_ref_id": source_ref.id,
"version_id": version.version_id,
"representation_ids": tuple(representation_ids),
},
)
return AssetChangeResult(updated, version, event, decision)
def add_representation(
self,
asset_id: str,
@@ -881,3 +954,19 @@ def _remediation_for_error(error: KontextualError) -> str | None:
if isinstance(error, AuthorizationError):
return "Request policy approval or rerun with an actor that is authorized for this operation."
return None
def _has_source_reference(asset: KnowledgeAsset, source_ref: SourceReference) -> bool:
return any(
existing.identity_key == source_ref.identity_key
or (
existing.connector_ref is not None
and existing.connector_ref == source_ref.connector_ref
and existing.checksum == source_ref.checksum
)
for existing in asset.source_refs
)
def _source_alias(source_ref: SourceReference) -> str | None:
return source_ref.connector_ref or source_ref.path or source_ref.uri or source_ref.external_id

View File

@@ -6,13 +6,18 @@ from dataclasses import dataclass
from pathlib import Path
from typing import Iterable
from kontextual_engine.adapters.builtin_extractors import PlainTextExtractor
from kontextual_engine.adapters.builtin_extractors import (
CsvDatasetExtractor,
DocumentPlaceholderExtractor,
PlainTextExtractor,
)
from kontextual_engine.adapters.local_files import LocalFileConnector
from kontextual_engine.adapters.markitect_tool import MarkitectMarkdownExtractor
from kontextual_engine.core import (
AssetRepresentation,
Classification,
IngestionFailure,
IngestionIdentityPolicy,
IngestionJob,
IngestionJobStatus,
KnowledgeAsset,
@@ -34,6 +39,7 @@ class AssetIngestionResult:
job: IngestionJob
asset: KnowledgeAsset | None = None
asset_change: AssetChangeResult | None = None
action: str = "failed"
class AssetIngestionService:
@@ -48,7 +54,15 @@ class AssetIngestionService:
self.repository = repository
self.asset_service = asset_service or AssetRegistryService(repository)
self.connectors = {connector.name: connector for connector in (connectors or [LocalFileConnector()])}
self.extractors = list(extractors or [PlainTextExtractor(), MarkitectMarkdownExtractor()])
self.extractors = list(
extractors
or [
PlainTextExtractor(),
CsvDatasetExtractor(),
DocumentPlaceholderExtractor(),
MarkitectMarkdownExtractor(),
]
)
def connector_capabilities(self) -> list[dict]:
return [connector.capabilities().to_dict() for connector in self.connectors.values()]
@@ -65,11 +79,20 @@ class AssetIngestionService:
title: str | None = None,
classification: Classification | None = None,
idempotency_key: str | None = None,
identity_policy: IngestionIdentityPolicy | str = IngestionIdentityPolicy.SOURCE_LOCATION,
skip_unchanged: bool = True,
) -> AssetIngestionResult:
identity_policy = IngestionIdentityPolicy(identity_policy)
self.repository.save_actor(context.actor)
connector = self._connector("local_file")
job = IngestionJob.create(
input={"connector": connector.name, "source_uri": str(path), "mode": "file"},
input={
"connector": connector.name,
"source_uri": str(path),
"mode": "file",
"identity_policy": identity_policy.value,
"skip_unchanged": skip_unchanged,
},
actor_id=context.actor.id,
correlation_id=context.correlation_id,
)
@@ -84,6 +107,8 @@ class AssetIngestionService:
title=title,
classification=classification,
idempotency_key=idempotency_key,
identity_policy=identity_policy,
skip_unchanged=skip_unchanged,
)
except Exception as exc:
failed = job.failed(_failure_from_exception(exc))
@@ -97,7 +122,10 @@ class AssetIngestionService:
*,
recursive: bool = True,
classification: Classification | None = None,
identity_policy: IngestionIdentityPolicy | str = IngestionIdentityPolicy.SOURCE_LOCATION,
skip_unchanged: bool = True,
) -> IngestionJob:
identity_policy = IngestionIdentityPolicy(identity_policy)
self.repository.save_actor(context.actor)
connector = self._directory_connector("local_file")
job = IngestionJob.create(
@@ -106,6 +134,8 @@ class AssetIngestionService:
"source_uri": str(path),
"mode": "directory",
"recursive": recursive,
"identity_policy": identity_policy.value,
"skip_unchanged": skip_unchanged,
},
actor_id=context.actor.id,
correlation_id=context.correlation_id,
@@ -118,11 +148,18 @@ class AssetIngestionService:
item_results: list[dict] = []
files = connector.iter_files(str(path), recursive=recursive)
for source_uri in files:
result = self.ingest_file(source_uri, context, classification=classification)
result = self.ingest_file(
source_uri,
context,
classification=classification,
identity_policy=identity_policy,
skip_unchanged=skip_unchanged,
)
item = {
"source_uri": source_uri,
"job_id": result.job.job_id,
"status": result.job.status.value,
"status": result.action if result.action == "skipped" else result.job.status.value,
"action": result.action,
}
if result.asset is not None:
output_asset_ids.append(result.asset.id)
@@ -130,6 +167,9 @@ class AssetIngestionService:
if result.job.failures:
failures.extend(result.job.failures)
item["failures"] = [failure.to_dict() for failure in result.job.failures]
item["retry_state"] = (
"retriable" if any(failure.retriable for failure in result.job.failures) else "not_retriable"
)
item_results.append(item)
partial_results = {
@@ -137,7 +177,7 @@ class AssetIngestionService:
"succeeded": sum(1 for item in item_results if item["status"] == IngestionJobStatus.COMPLETED.value),
"failed": sum(1 for item in item_results if item["status"] == IngestionJobStatus.FAILED.value),
"quarantined": sum(1 for item in item_results if item["status"] == IngestionJobStatus.QUARANTINED.value),
"skipped": 0,
"skipped": sum(1 for item in item_results if item["status"] == "skipped"),
"items": item_results,
}
if failures and output_asset_ids:
@@ -177,12 +217,31 @@ class AssetIngestionService:
title: str | None,
classification: Classification | None,
idempotency_key: str | None,
identity_policy: IngestionIdentityPolicy,
skip_unchanged: bool,
) -> AssetIngestionResult:
job = job.running(source_ref=payload.source_ref)
self.repository.save_ingestion_job(job)
extractor = self._extractor(payload.media_type)
extraction = extractor.extract(payload)
resolved_asset_id = asset_id or _stable_asset_id(payload)
resolved_asset_id = asset_id or _stable_asset_id(payload, identity_policy)
existing_asset = _get_asset_or_none(self.repository, resolved_asset_id)
if existing_asset and skip_unchanged and _asset_has_source_reference(existing_asset, payload.source_ref):
completed = job.completed(
output_asset_ids=(existing_asset.id,),
partial_results={
"action": "skipped",
"reason": "unchanged_source",
"asset_id": existing_asset.id,
"identity_policy": identity_policy.value,
"connector": payload.connector_name,
"extractor": extractor.name,
"source_digest": payload.content_digest,
"diagnostics": [diagnostic.to_dict() for diagnostic in extraction.diagnostics],
},
)
self.repository.save_ingestion_job(completed)
return AssetIngestionResult(completed, existing_asset, action="skipped")
source_representation = AssetRepresentation.from_content(
resolved_asset_id,
RepresentationKind.SOURCE,
@@ -211,19 +270,33 @@ class AssetIngestionService:
**extraction.metadata,
},
)
asset_change = self.asset_service.create_asset(
title or payload.title,
classification or Classification(asset_type="document", sensitivity=Sensitivity.INTERNAL),
context,
asset_id=resolved_asset_id,
source_refs=[payload.source_ref],
representations=[source_representation, normalized_representation],
metadata_records=_metadata_records(payload, extractor.name, extraction.metadata),
idempotency_key=idempotency_key,
)
metadata_records = _metadata_records(payload, extractor.name, extraction.metadata)
if existing_asset:
asset_change = self.asset_service.record_ingestion_update(
resolved_asset_id,
payload.source_ref,
(source_representation, normalized_representation),
metadata_records,
context,
)
action = "updated"
else:
asset_change = self.asset_service.create_asset(
title or payload.title,
classification or Classification(asset_type="document", sensitivity=Sensitivity.INTERNAL),
context,
asset_id=resolved_asset_id,
source_refs=[payload.source_ref],
representations=[source_representation, normalized_representation],
metadata_records=metadata_records,
idempotency_key=idempotency_key,
)
action = "created"
completed = job.completed(
output_asset_ids=(asset_change.asset.id,),
partial_results={
"action": action,
"identity_policy": identity_policy.value,
"connector": payload.connector_name,
"extractor": extractor.name,
"source_digest": payload.content_digest,
@@ -235,7 +308,7 @@ class AssetIngestionService:
},
)
self.repository.save_ingestion_job(completed)
return AssetIngestionResult(completed, asset_change.asset, asset_change)
return AssetIngestionResult(completed, asset_change.asset, asset_change, action=action)
def _connector(self, name: str) -> SourceConnector:
try:
@@ -262,19 +335,46 @@ class AssetIngestionService:
)
def _stable_asset_id(payload: SourcePayload) -> str:
digest = mapping_digest(
{
"source_system": payload.source_ref.source_system,
"path": payload.source_ref.path,
"uri": payload.source_ref.uri,
"external_id": payload.source_ref.external_id,
"connector_ref": payload.source_ref.connector_ref,
}
)
def _stable_asset_id(payload: SourcePayload, identity_policy: IngestionIdentityPolicy) -> str:
identity_data = {
"source_system": payload.source_ref.source_system,
}
if identity_policy == IngestionIdentityPolicy.CONTENT_DIGEST:
identity_data["checksum"] = payload.content_digest
else:
identity_data.update(
{
"path": payload.source_ref.path,
"uri": payload.source_ref.uri,
"external_id": payload.source_ref.external_id,
"connector_ref": payload.source_ref.connector_ref,
}
)
digest = mapping_digest(identity_data)
return f"asset-{digest.removeprefix('sha256:')[:20]}"
def _get_asset_or_none(repository: AssetRegistryRepository, asset_id: str) -> KnowledgeAsset | None:
try:
return repository.get_asset(asset_id)
except KontextualError as exc:
if exc.code == "kontextual.not_found":
return None
raise
def _asset_has_source_reference(asset: KnowledgeAsset, source_ref) -> bool:
return any(
existing.identity_key == source_ref.identity_key
or (
existing.connector_ref is not None
and existing.connector_ref == source_ref.connector_ref
and existing.checksum == source_ref.checksum
)
for existing in asset.source_refs
)
def _metadata_records(
payload: SourcePayload,
extractor_name: str,