generated from coulomb/repo-seed
default source-location identity and opt-in content-digest identity for file move/rename reconciliation, PDF/DOCX-style placeholder ingestion
This commit is contained in:
@@ -345,6 +345,79 @@ class AssetRegistryService:
|
||||
def list_metadata_schema_assignments(self) -> list[MetadataSchemaAssignment]:
|
||||
return self.repository.list_metadata_schema_assignments()
|
||||
|
||||
def record_ingestion_update(
|
||||
self,
|
||||
asset_id: str,
|
||||
source_ref: SourceReference,
|
||||
representations: list[AssetRepresentation] | tuple[AssetRepresentation, ...],
|
||||
metadata_records: list[MetadataRecord] | tuple[MetadataRecord, ...],
|
||||
context: OperationContext,
|
||||
*,
|
||||
expected_current_version_id: str | None = None,
|
||||
) -> AssetChangeResult:
|
||||
asset = self.repository.get_asset(asset_id)
|
||||
self._assert_expected_current_version(
|
||||
asset,
|
||||
expected_current_version_id,
|
||||
operation="asset.ingest.update",
|
||||
)
|
||||
decision = self._authorize(
|
||||
context,
|
||||
"asset.ingest.update",
|
||||
f"asset:{asset.id}",
|
||||
resource_metadata={
|
||||
"source_system": source_ref.source_system,
|
||||
"source_path": source_ref.path or "",
|
||||
"checksum": source_ref.checksum or "",
|
||||
"representation_count": str(len(representations)),
|
||||
"metadata_record_count": str(len(metadata_records)),
|
||||
},
|
||||
)
|
||||
self._validate_metadata_records(
|
||||
asset.classification,
|
||||
self.repository.list_metadata_records(asset.id) + list(metadata_records),
|
||||
)
|
||||
updated = asset
|
||||
if not _has_source_reference(updated, source_ref):
|
||||
updated = updated.with_source_reference(source_ref)
|
||||
alias = _source_alias(source_ref)
|
||||
if alias:
|
||||
updated = updated.with_alias(alias)
|
||||
representation_ids: list[str] = []
|
||||
for representation in representations:
|
||||
if representation.asset_id != asset.id:
|
||||
representation = replace(representation, asset_id=asset.id)
|
||||
self.repository.save_representation(representation)
|
||||
representation_ids.append(representation.representation_id)
|
||||
for record in metadata_records:
|
||||
self.repository.save_metadata_record(asset.id, record)
|
||||
version = AssetVersion(
|
||||
asset_id=asset.id,
|
||||
sequence=self._next_sequence(asset.id),
|
||||
change_type=VersionChangeType.CONTENT_CHANGED,
|
||||
representation_ids=tuple(representation_ids),
|
||||
actor_id=context.actor.id,
|
||||
parent_version_id=asset.current_version_id,
|
||||
metadata_delta={record.key: record.value for record in metadata_records},
|
||||
lifecycle=updated.lifecycle.value,
|
||||
)
|
||||
updated = updated.with_current_version(version.version_id)
|
||||
self.repository.save_asset(updated)
|
||||
self.repository.save_version(version)
|
||||
event = self._audit(
|
||||
"asset.ingest.update",
|
||||
f"asset:{asset.id}",
|
||||
AuditOutcome.SUCCESS,
|
||||
context,
|
||||
decision,
|
||||
details={
|
||||
"source_ref_id": source_ref.id,
|
||||
"version_id": version.version_id,
|
||||
"representation_ids": tuple(representation_ids),
|
||||
},
|
||||
)
|
||||
return AssetChangeResult(updated, version, event, decision)
|
||||
|
||||
def add_representation(
|
||||
self,
|
||||
asset_id: str,
|
||||
@@ -881,3 +954,19 @@ def _remediation_for_error(error: KontextualError) -> str | None:
|
||||
if isinstance(error, AuthorizationError):
|
||||
return "Request policy approval or rerun with an actor that is authorized for this operation."
|
||||
return None
|
||||
|
||||
|
||||
def _has_source_reference(asset: KnowledgeAsset, source_ref: SourceReference) -> bool:
|
||||
return any(
|
||||
existing.identity_key == source_ref.identity_key
|
||||
or (
|
||||
existing.connector_ref is not None
|
||||
and existing.connector_ref == source_ref.connector_ref
|
||||
and existing.checksum == source_ref.checksum
|
||||
)
|
||||
for existing in asset.source_refs
|
||||
)
|
||||
|
||||
|
||||
def _source_alias(source_ref: SourceReference) -> str | None:
|
||||
return source_ref.connector_ref or source_ref.path or source_ref.uri or source_ref.external_id
|
||||
|
||||
@@ -6,13 +6,18 @@ from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
from kontextual_engine.adapters.builtin_extractors import PlainTextExtractor
|
||||
from kontextual_engine.adapters.builtin_extractors import (
|
||||
CsvDatasetExtractor,
|
||||
DocumentPlaceholderExtractor,
|
||||
PlainTextExtractor,
|
||||
)
|
||||
from kontextual_engine.adapters.local_files import LocalFileConnector
|
||||
from kontextual_engine.adapters.markitect_tool import MarkitectMarkdownExtractor
|
||||
from kontextual_engine.core import (
|
||||
AssetRepresentation,
|
||||
Classification,
|
||||
IngestionFailure,
|
||||
IngestionIdentityPolicy,
|
||||
IngestionJob,
|
||||
IngestionJobStatus,
|
||||
KnowledgeAsset,
|
||||
@@ -34,6 +39,7 @@ class AssetIngestionResult:
|
||||
job: IngestionJob
|
||||
asset: KnowledgeAsset | None = None
|
||||
asset_change: AssetChangeResult | None = None
|
||||
action: str = "failed"
|
||||
|
||||
|
||||
class AssetIngestionService:
|
||||
@@ -48,7 +54,15 @@ class AssetIngestionService:
|
||||
self.repository = repository
|
||||
self.asset_service = asset_service or AssetRegistryService(repository)
|
||||
self.connectors = {connector.name: connector for connector in (connectors or [LocalFileConnector()])}
|
||||
self.extractors = list(extractors or [PlainTextExtractor(), MarkitectMarkdownExtractor()])
|
||||
self.extractors = list(
|
||||
extractors
|
||||
or [
|
||||
PlainTextExtractor(),
|
||||
CsvDatasetExtractor(),
|
||||
DocumentPlaceholderExtractor(),
|
||||
MarkitectMarkdownExtractor(),
|
||||
]
|
||||
)
|
||||
|
||||
def connector_capabilities(self) -> list[dict]:
|
||||
return [connector.capabilities().to_dict() for connector in self.connectors.values()]
|
||||
@@ -65,11 +79,20 @@ class AssetIngestionService:
|
||||
title: str | None = None,
|
||||
classification: Classification | None = None,
|
||||
idempotency_key: str | None = None,
|
||||
identity_policy: IngestionIdentityPolicy | str = IngestionIdentityPolicy.SOURCE_LOCATION,
|
||||
skip_unchanged: bool = True,
|
||||
) -> AssetIngestionResult:
|
||||
identity_policy = IngestionIdentityPolicy(identity_policy)
|
||||
self.repository.save_actor(context.actor)
|
||||
connector = self._connector("local_file")
|
||||
job = IngestionJob.create(
|
||||
input={"connector": connector.name, "source_uri": str(path), "mode": "file"},
|
||||
input={
|
||||
"connector": connector.name,
|
||||
"source_uri": str(path),
|
||||
"mode": "file",
|
||||
"identity_policy": identity_policy.value,
|
||||
"skip_unchanged": skip_unchanged,
|
||||
},
|
||||
actor_id=context.actor.id,
|
||||
correlation_id=context.correlation_id,
|
||||
)
|
||||
@@ -84,6 +107,8 @@ class AssetIngestionService:
|
||||
title=title,
|
||||
classification=classification,
|
||||
idempotency_key=idempotency_key,
|
||||
identity_policy=identity_policy,
|
||||
skip_unchanged=skip_unchanged,
|
||||
)
|
||||
except Exception as exc:
|
||||
failed = job.failed(_failure_from_exception(exc))
|
||||
@@ -97,7 +122,10 @@ class AssetIngestionService:
|
||||
*,
|
||||
recursive: bool = True,
|
||||
classification: Classification | None = None,
|
||||
identity_policy: IngestionIdentityPolicy | str = IngestionIdentityPolicy.SOURCE_LOCATION,
|
||||
skip_unchanged: bool = True,
|
||||
) -> IngestionJob:
|
||||
identity_policy = IngestionIdentityPolicy(identity_policy)
|
||||
self.repository.save_actor(context.actor)
|
||||
connector = self._directory_connector("local_file")
|
||||
job = IngestionJob.create(
|
||||
@@ -106,6 +134,8 @@ class AssetIngestionService:
|
||||
"source_uri": str(path),
|
||||
"mode": "directory",
|
||||
"recursive": recursive,
|
||||
"identity_policy": identity_policy.value,
|
||||
"skip_unchanged": skip_unchanged,
|
||||
},
|
||||
actor_id=context.actor.id,
|
||||
correlation_id=context.correlation_id,
|
||||
@@ -118,11 +148,18 @@ class AssetIngestionService:
|
||||
item_results: list[dict] = []
|
||||
files = connector.iter_files(str(path), recursive=recursive)
|
||||
for source_uri in files:
|
||||
result = self.ingest_file(source_uri, context, classification=classification)
|
||||
result = self.ingest_file(
|
||||
source_uri,
|
||||
context,
|
||||
classification=classification,
|
||||
identity_policy=identity_policy,
|
||||
skip_unchanged=skip_unchanged,
|
||||
)
|
||||
item = {
|
||||
"source_uri": source_uri,
|
||||
"job_id": result.job.job_id,
|
||||
"status": result.job.status.value,
|
||||
"status": result.action if result.action == "skipped" else result.job.status.value,
|
||||
"action": result.action,
|
||||
}
|
||||
if result.asset is not None:
|
||||
output_asset_ids.append(result.asset.id)
|
||||
@@ -130,6 +167,9 @@ class AssetIngestionService:
|
||||
if result.job.failures:
|
||||
failures.extend(result.job.failures)
|
||||
item["failures"] = [failure.to_dict() for failure in result.job.failures]
|
||||
item["retry_state"] = (
|
||||
"retriable" if any(failure.retriable for failure in result.job.failures) else "not_retriable"
|
||||
)
|
||||
item_results.append(item)
|
||||
|
||||
partial_results = {
|
||||
@@ -137,7 +177,7 @@ class AssetIngestionService:
|
||||
"succeeded": sum(1 for item in item_results if item["status"] == IngestionJobStatus.COMPLETED.value),
|
||||
"failed": sum(1 for item in item_results if item["status"] == IngestionJobStatus.FAILED.value),
|
||||
"quarantined": sum(1 for item in item_results if item["status"] == IngestionJobStatus.QUARANTINED.value),
|
||||
"skipped": 0,
|
||||
"skipped": sum(1 for item in item_results if item["status"] == "skipped"),
|
||||
"items": item_results,
|
||||
}
|
||||
if failures and output_asset_ids:
|
||||
@@ -177,12 +217,31 @@ class AssetIngestionService:
|
||||
title: str | None,
|
||||
classification: Classification | None,
|
||||
idempotency_key: str | None,
|
||||
identity_policy: IngestionIdentityPolicy,
|
||||
skip_unchanged: bool,
|
||||
) -> AssetIngestionResult:
|
||||
job = job.running(source_ref=payload.source_ref)
|
||||
self.repository.save_ingestion_job(job)
|
||||
extractor = self._extractor(payload.media_type)
|
||||
extraction = extractor.extract(payload)
|
||||
resolved_asset_id = asset_id or _stable_asset_id(payload)
|
||||
resolved_asset_id = asset_id or _stable_asset_id(payload, identity_policy)
|
||||
existing_asset = _get_asset_or_none(self.repository, resolved_asset_id)
|
||||
if existing_asset and skip_unchanged and _asset_has_source_reference(existing_asset, payload.source_ref):
|
||||
completed = job.completed(
|
||||
output_asset_ids=(existing_asset.id,),
|
||||
partial_results={
|
||||
"action": "skipped",
|
||||
"reason": "unchanged_source",
|
||||
"asset_id": existing_asset.id,
|
||||
"identity_policy": identity_policy.value,
|
||||
"connector": payload.connector_name,
|
||||
"extractor": extractor.name,
|
||||
"source_digest": payload.content_digest,
|
||||
"diagnostics": [diagnostic.to_dict() for diagnostic in extraction.diagnostics],
|
||||
},
|
||||
)
|
||||
self.repository.save_ingestion_job(completed)
|
||||
return AssetIngestionResult(completed, existing_asset, action="skipped")
|
||||
source_representation = AssetRepresentation.from_content(
|
||||
resolved_asset_id,
|
||||
RepresentationKind.SOURCE,
|
||||
@@ -211,19 +270,33 @@ class AssetIngestionService:
|
||||
**extraction.metadata,
|
||||
},
|
||||
)
|
||||
asset_change = self.asset_service.create_asset(
|
||||
title or payload.title,
|
||||
classification or Classification(asset_type="document", sensitivity=Sensitivity.INTERNAL),
|
||||
context,
|
||||
asset_id=resolved_asset_id,
|
||||
source_refs=[payload.source_ref],
|
||||
representations=[source_representation, normalized_representation],
|
||||
metadata_records=_metadata_records(payload, extractor.name, extraction.metadata),
|
||||
idempotency_key=idempotency_key,
|
||||
)
|
||||
metadata_records = _metadata_records(payload, extractor.name, extraction.metadata)
|
||||
if existing_asset:
|
||||
asset_change = self.asset_service.record_ingestion_update(
|
||||
resolved_asset_id,
|
||||
payload.source_ref,
|
||||
(source_representation, normalized_representation),
|
||||
metadata_records,
|
||||
context,
|
||||
)
|
||||
action = "updated"
|
||||
else:
|
||||
asset_change = self.asset_service.create_asset(
|
||||
title or payload.title,
|
||||
classification or Classification(asset_type="document", sensitivity=Sensitivity.INTERNAL),
|
||||
context,
|
||||
asset_id=resolved_asset_id,
|
||||
source_refs=[payload.source_ref],
|
||||
representations=[source_representation, normalized_representation],
|
||||
metadata_records=metadata_records,
|
||||
idempotency_key=idempotency_key,
|
||||
)
|
||||
action = "created"
|
||||
completed = job.completed(
|
||||
output_asset_ids=(asset_change.asset.id,),
|
||||
partial_results={
|
||||
"action": action,
|
||||
"identity_policy": identity_policy.value,
|
||||
"connector": payload.connector_name,
|
||||
"extractor": extractor.name,
|
||||
"source_digest": payload.content_digest,
|
||||
@@ -235,7 +308,7 @@ class AssetIngestionService:
|
||||
},
|
||||
)
|
||||
self.repository.save_ingestion_job(completed)
|
||||
return AssetIngestionResult(completed, asset_change.asset, asset_change)
|
||||
return AssetIngestionResult(completed, asset_change.asset, asset_change, action=action)
|
||||
|
||||
def _connector(self, name: str) -> SourceConnector:
|
||||
try:
|
||||
@@ -262,19 +335,46 @@ class AssetIngestionService:
|
||||
)
|
||||
|
||||
|
||||
def _stable_asset_id(payload: SourcePayload) -> str:
|
||||
digest = mapping_digest(
|
||||
{
|
||||
"source_system": payload.source_ref.source_system,
|
||||
"path": payload.source_ref.path,
|
||||
"uri": payload.source_ref.uri,
|
||||
"external_id": payload.source_ref.external_id,
|
||||
"connector_ref": payload.source_ref.connector_ref,
|
||||
}
|
||||
)
|
||||
def _stable_asset_id(payload: SourcePayload, identity_policy: IngestionIdentityPolicy) -> str:
|
||||
identity_data = {
|
||||
"source_system": payload.source_ref.source_system,
|
||||
}
|
||||
if identity_policy == IngestionIdentityPolicy.CONTENT_DIGEST:
|
||||
identity_data["checksum"] = payload.content_digest
|
||||
else:
|
||||
identity_data.update(
|
||||
{
|
||||
"path": payload.source_ref.path,
|
||||
"uri": payload.source_ref.uri,
|
||||
"external_id": payload.source_ref.external_id,
|
||||
"connector_ref": payload.source_ref.connector_ref,
|
||||
}
|
||||
)
|
||||
digest = mapping_digest(identity_data)
|
||||
return f"asset-{digest.removeprefix('sha256:')[:20]}"
|
||||
|
||||
|
||||
def _get_asset_or_none(repository: AssetRegistryRepository, asset_id: str) -> KnowledgeAsset | None:
|
||||
try:
|
||||
return repository.get_asset(asset_id)
|
||||
except KontextualError as exc:
|
||||
if exc.code == "kontextual.not_found":
|
||||
return None
|
||||
raise
|
||||
|
||||
|
||||
def _asset_has_source_reference(asset: KnowledgeAsset, source_ref) -> bool:
|
||||
return any(
|
||||
existing.identity_key == source_ref.identity_key
|
||||
or (
|
||||
existing.connector_ref is not None
|
||||
and existing.connector_ref == source_ref.connector_ref
|
||||
and existing.checksum == source_ref.checksum
|
||||
)
|
||||
for existing in asset.source_refs
|
||||
)
|
||||
|
||||
|
||||
def _metadata_records(
|
||||
payload: SourcePayload,
|
||||
extractor_name: str,
|
||||
|
||||
Reference in New Issue
Block a user