generated from coulomb/repo-seed
390 lines
15 KiB
Python
390 lines
15 KiB
Python
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from kontextual_engine import (
|
|
Actor,
|
|
ActorType,
|
|
AssetIngestionService,
|
|
Classification,
|
|
ConnectorCapability,
|
|
ExtractionResult,
|
|
ExtractorCapability,
|
|
IngestionIdentityPolicy,
|
|
IngestionJobStatus,
|
|
InMemoryAssetRegistryRepository,
|
|
LifecycleState,
|
|
NormalizedDocument,
|
|
OperationContext,
|
|
RepresentationKind,
|
|
Sensitivity,
|
|
SourcePayload,
|
|
SourceReference,
|
|
SQLiteAssetRegistryRepository,
|
|
content_digest,
|
|
)
|
|
|
|
|
|
def test_asset_ingestion_service_ingests_plain_text_file_as_governed_asset(tmp_path: Path) -> None:
    """Ingest a two-line text file and verify the job, asset, representations and audit entry."""
    note_file = tmp_path / "note.txt"
    note_file.write_text("hello\nworld\n", encoding="utf-8")
    registry = InMemoryAssetRegistryRepository()
    ingestion = AssetIngestionService(registry)
    context = operation_context()
    note_classification = Classification(asset_type="note", sensitivity=Sensitivity.INTERNAL)

    outcome = ingestion.ingest_file(
        note_file,
        context,
        asset_id="asset-note",
        classification=note_classification,
    )

    # Job bookkeeping returned to the caller.
    assert outcome.job.status == IngestionJobStatus.COMPLETED
    assert outcome.job.correlation_id == "corr-ingest"
    assert outcome.job.output_asset_ids == ("asset-note",)

    # The created asset points back at the local file it came from.
    assert outcome.asset is not None
    primary_ref = outcome.asset.source_refs[0]
    assert primary_ref.source_system == "local_file"
    assert primary_ref.path == str(note_file)

    # The same job is retrievable from the repository.
    persisted_job = registry.get_ingestion_job(outcome.job.job_id)
    assert persisted_job.status == IngestionJobStatus.COMPLETED

    # Both a source and a normalized representation are stored.
    stored_kinds = {rep.kind for rep in registry.list_representations(asset_id="asset-note")}
    assert stored_kinds == {RepresentationKind.SOURCE, RepresentationKind.NORMALIZED}

    normalized = registry.list_representations(
        asset_id="asset-note",
        kind=RepresentationKind.NORMALIZED,
    )[0]
    assert normalized.media_type == "application/vnd.kontextual.normalized+json"
    assert normalized.metadata["extractor"] == "plain-text"
    assert normalized.metadata["line_count"] == 2
    assert normalized.metadata["paragraph_count"] == 1
    assert normalized.metadata["link_count"] == 0

    first_event = registry.list_audit_events(target="asset:asset-note")[0]
    assert first_event.operation == "asset.create"
|
|
|
|
|
|
def test_ingestion_failure_records_job_without_trusting_unsupported_asset(tmp_path: Path) -> None:
    """Ingest an unsupported binary file and expect a FAILED job with no asset stored."""
    blob_file = tmp_path / "blob.bin"
    blob_file.write_bytes(b"\x00\x01\x02")
    registry = InMemoryAssetRegistryRepository()
    ingestion = AssetIngestionService(registry)

    outcome = ingestion.ingest_file(blob_file, operation_context(), asset_id="asset-blob")

    assert outcome.asset is None
    assert outcome.job.status == IngestionJobStatus.FAILED
    # The first recorded failure names the missing adapter and the detected media type.
    assert outcome.job.failures[0].code == "kontextual.adapter_unavailable"
    assert outcome.job.failures[0].details["media_type"] == "application/octet-stream"
    assert registry.list_assets() == []
|
|
|
|
|
|
def test_directory_ingestion_reports_partial_results(tmp_path: Path) -> None:
    """Ingest a directory holding one text and one binary file and check the partial tallies."""
    text_file = tmp_path / "one.txt"
    text_file.write_text("one", encoding="utf-8")
    binary_file = tmp_path / "two.bin"
    binary_file.write_bytes(b"\x00\x01")
    registry = InMemoryAssetRegistryRepository()
    ingestion = AssetIngestionService(registry)

    directory_job = ingestion.ingest_directory(tmp_path, operation_context(), recursive=False)

    assert directory_job.status == IngestionJobStatus.PARTIALLY_COMPLETED
    summary = directory_job.partial_results
    assert summary["files_total"] == 2
    assert summary["succeeded"] == 1
    assert summary["failed"] == 1
    assert len(directory_job.output_asset_ids) == 1
    assert len(directory_job.failures) == 1
|
|
|
|
|
|
def test_ingestion_content_digest_identity_preserves_asset_across_file_move(tmp_path: Path) -> None:
    """Re-ingest a renamed file under content-digest identity and expect a single asset.

    The file is ingested, renamed, ingested from its new path, then ingested a
    third time unchanged. The assertions expect the actions ``created``,
    ``updated`` and ``skipped`` respectively, all against the same asset id.
    """
    original_location = tmp_path / "original.txt"
    renamed_location = tmp_path / "renamed.txt"
    original_location.write_text("same durable content\n", encoding="utf-8")
    registry = InMemoryAssetRegistryRepository()
    ingestion = AssetIngestionService(registry)
    shared_context = operation_context()

    def ingest_by_digest(location: Path):
        # Every pass shares one context and the content-digest identity policy.
        return ingestion.ingest_file(
            location,
            shared_context,
            identity_policy=IngestionIdentityPolicy.CONTENT_DIGEST,
        )

    first = ingest_by_digest(original_location)
    original_location.rename(renamed_location)
    moved = ingest_by_digest(renamed_location)
    repeated = ingest_by_digest(renamed_location)

    assert first.asset is not None
    assert moved.asset is not None
    assert repeated.asset is not None

    assert first.action == "created"
    assert moved.action == "updated"
    assert repeated.action == "skipped"

    asset_id = first.asset.id
    assert moved.asset.id == asset_id
    assert repeated.asset.id == asset_id
    assert len(registry.list_assets()) == 1

    # Both the old and the new path are kept as source references, in ingestion order.
    recorded_paths = [ref.path for ref in registry.get_asset(asset_id).source_refs]
    assert recorded_paths == [str(original_location), str(renamed_location)]
    assert repeated.job.partial_results["reason"] == "unchanged_source"

    # The skipped pass adds neither a version nor an audit event.
    version_sequences = [version.sequence for version in registry.list_versions(asset_id)]
    assert version_sequences == [1, 2]
    audit_operations = [
        event.operation for event in registry.list_audit_events(target=f"asset:{asset_id}")
    ]
    assert audit_operations == ["asset.create", "asset.ingest.update"]
|
|
|
|
|
|
def test_directory_ingestion_reports_skipped_and_retry_state(tmp_path: Path) -> None:
    """Directory ingestion after a single-file pass reports one skipped and one retriable item."""
    known_file = tmp_path / "seen.txt"
    binary_file = tmp_path / "unsupported.bin"
    known_file.write_text("skip me on the directory pass", encoding="utf-8")
    binary_file.write_bytes(b"\x00\x01")
    registry = InMemoryAssetRegistryRepository()
    ingestion = AssetIngestionService(registry)
    shared_context = operation_context()

    # Ingest the text file on its own first so the directory pass sees it as unchanged.
    ingestion.ingest_file(known_file, shared_context)
    directory_job = ingestion.ingest_directory(tmp_path, shared_context, recursive=False)

    # Index the per-file entries by bare file name for readable lookups below.
    by_filename = {}
    for entry in directory_job.partial_results["items"]:
        by_filename[Path(entry["source_uri"]).name] = entry

    assert directory_job.status == IngestionJobStatus.PARTIALLY_COMPLETED
    summary = directory_job.partial_results
    assert summary["succeeded"] == 0
    assert summary["skipped"] == 1
    assert summary["failed"] == 1

    seen_entry = by_filename["seen.txt"]
    assert seen_entry["status"] == "skipped"
    assert seen_entry["action"] == "skipped"

    unsupported_entry = by_filename["unsupported.bin"]
    assert unsupported_entry["status"] == IngestionJobStatus.FAILED.value
    assert unsupported_entry["retry_state"] == "retriable"
    assert unsupported_entry["failures"][0]["code"] == "kontextual.adapter_unavailable"
|
|
|
|
|
|
def test_asset_ingestion_service_ingests_csv_dataset_with_structured_table(tmp_path: Path) -> None:
    """Ingest a small CSV file and verify the dataset metadata on its normalized form.

    The CSV has a header row (``name,score``) and two data rows. The test
    expects the ``csv-dataset`` extractor to be reported on the job and in the
    asset's metadata records, along with column names, row count and table
    count on the normalized representation.
    """
    source = tmp_path / "metrics.csv"
    source.write_text("name,score\nalpha,0.82\nbeta,0.91\n", encoding="utf-8")
    repo = InMemoryAssetRegistryRepository()
    service = AssetIngestionService(repo)

    result = service.ingest_file(
        source,
        operation_context(),
        asset_id="asset-metrics",
        classification=Classification(asset_type="dataset", sensitivity=Sensitivity.INTERNAL),
    )

    # Check the job outcome before indexing into the stored representations:
    # if ingestion did not complete, the failure should show the job status
    # rather than an IndexError from an empty representation list.
    assert result.job.status == IngestionJobStatus.COMPLETED
    assert result.job.partial_results["extractor"] == "csv-dataset"

    normalized = repo.list_representations(asset_id="asset-metrics", kind=RepresentationKind.NORMALIZED)[0]

    assert normalized.metadata["dataset_format"] == "csv"
    assert normalized.metadata["columns"] == ["name", "score"]
    assert normalized.metadata["row_count"] == 2
    assert normalized.metadata["table_count"] == 1
    assert [record.value for record in repo.list_metadata_records("asset-metrics") if record.key == "extractor"] == [
        "csv-dataset"
    ]
|
|
|
|
|
|
@pytest.mark.parametrize(
    ("filename", "content", "media_type", "document_kind"),
    [
        ("source.pdf", b"%PDF-1.7\n", "application/pdf", "pdf"),
        (
            "source.docx",
            b"PK\x03\x04docx-placeholder",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            "office_document",
        ),
    ],
)
def test_document_placeholder_formats_create_asset_with_unsupported_depth_diagnostic(
    tmp_path: Path,
    filename: str,
    content: bytes,
    media_type: str,
    document_kind: str,
) -> None:
    """Ingest a PDF/DOCX stub and expect a metadata-only asset plus a depth diagnostic.

    Args:
        tmp_path: pytest-provided temporary directory for the stub file.
        filename: Name of the stub file to write.
        content: Raw bytes written to the stub file.
        media_type: Media type expected in the diagnostic's details.
        document_kind: Value expected under ``document_kind`` in the normalized metadata.
    """
    source = tmp_path / filename
    source.write_bytes(content)
    repo = InMemoryAssetRegistryRepository()
    service = AssetIngestionService(repo)
    # Single definition of the asset id used for both ingestion and lookup.
    asset_id = f"asset-{source.stem}"

    result = service.ingest_file(
        source,
        operation_context(),
        asset_id=asset_id,
        classification=Classification(asset_type="document", sensitivity=Sensitivity.INTERNAL),
    )

    # Check the job outcome before indexing into the stored representations so
    # a failed ingestion is reported by its status, not by an IndexError.
    assert result.job.status == IngestionJobStatus.COMPLETED
    assert result.asset is not None

    normalized = repo.list_representations(asset_id=asset_id, kind=RepresentationKind.NORMALIZED)[0]

    assert result.job.partial_results["diagnostics"][0]["code"] == "extraction.depth_unsupported"
    assert result.job.partial_results["diagnostics"][0]["details"]["media_type"] == media_type
    assert normalized.producer == "document-placeholder"
    assert normalized.metadata["document_kind"] == document_kind
    assert normalized.metadata["extraction_depth"] == "metadata_only"
    assert normalized.metadata["unsupported_elements"][0]["reason"] == "deep_extraction_not_available"
|
|
|
|
|
|
def test_ingestion_quarantines_empty_normalized_output_without_asset(tmp_path: Path) -> None:
    """An extractor that returns empty text leads to a QUARANTINED job and no asset."""
    text_file = tmp_path / "emptyish.txt"
    text_file.write_text("content that the bad extractor drops", encoding="utf-8")
    registry = InMemoryAssetRegistryRepository()
    # EmptyTextExtractor always yields an empty normalized document for text/plain.
    ingestion = AssetIngestionService(registry, extractors=[EmptyTextExtractor()])

    outcome = ingestion.ingest_file(text_file, operation_context(), asset_id="asset-emptyish")

    assert outcome.asset is None
    assert outcome.action == "quarantined"
    quarantined_job = outcome.job
    assert quarantined_job.status == IngestionJobStatus.QUARANTINED
    assert quarantined_job.failures[0].code == "ingestion.normalized_empty"
    assert quarantined_job.partial_results["action"] == "quarantined"
    assert registry.list_assets() == []
|
|
|
|
|
|
def test_ingestion_preserves_source_permission_context_on_representations() -> None:
    """Check that the connector's permission context is carried onto stored records.

    ``PermissionedConnector`` supplies a payload whose permission context allows
    ingestion and carries the ``engineering`` label. The test expects that
    context on the source representation, on the normalized representation, and
    in a ``source_permission_context`` metadata record.
    """
    repo = InMemoryAssetRegistryRepository()
    service = AssetIngestionService(repo, connectors=[PermissionedConnector()])

    result = service.ingest_file("permissioned.txt", operation_context(), asset_id="asset-permissioned")

    # Check the job outcome before indexing into the stored representations so
    # a failed ingestion is reported by its status, not by an IndexError.
    assert result.job.status == IngestionJobStatus.COMPLETED

    source = repo.list_representations(asset_id="asset-permissioned", kind=RepresentationKind.SOURCE)[0]
    normalized = repo.list_representations(asset_id="asset-permissioned", kind=RepresentationKind.NORMALIZED)[0]
    permission_records = [
        record
        for record in repo.list_metadata_records("asset-permissioned")
        if record.key == "source_permission_context"
    ]

    assert source.metadata["permission_context"] == {
        "ingest_allowed": True,
        "labels": ["engineering"],
    }
    assert normalized.metadata["permission_context"]["labels"] == ["engineering"]
    # Guard the index below: a missing record should fail as an assertion.
    assert permission_records
    assert permission_records[0].value["ingest_allowed"] is True
|
|
|
|
|
|
def test_ingestion_quarantines_permission_denied_source_without_asset() -> None:
    """A payload whose permission context forbids ingestion is quarantined without an asset."""
    registry = InMemoryAssetRegistryRepository()
    denying_connector = PermissionedConnector(ingest_allowed=False)
    ingestion = AssetIngestionService(registry, connectors=[denying_connector])

    outcome = ingestion.ingest_file("denied.txt", operation_context(), asset_id="asset-denied")

    assert outcome.asset is None
    assert outcome.job.status == IngestionJobStatus.QUARANTINED
    first_failure = outcome.job.failures[0]
    assert first_failure.code == "ingestion.permission_denied"
    assert registry.list_assets() == []
|
|
|
|
|
|
def test_directory_ingestion_reports_quarantined_failed_and_retriable_counts(tmp_path: Path) -> None:
    """With nothing ingested successfully, the directory job is FAILED and tallies each outcome."""
    dropped_text_file = tmp_path / "bad-normalized.txt"
    dropped_text_file.write_text("dropped", encoding="utf-8")
    binary_file = tmp_path / "unsupported.bin"
    binary_file.write_bytes(b"\x00\x01")
    registry = InMemoryAssetRegistryRepository()
    # EmptyTextExtractor turns the text file into an empty normalized document.
    ingestion = AssetIngestionService(registry, extractors=[EmptyTextExtractor()])

    directory_job = ingestion.ingest_directory(tmp_path, operation_context(), recursive=False)

    # Index the per-file entries by bare file name for readable lookups below.
    by_filename = {}
    for entry in directory_job.partial_results["items"]:
        by_filename[Path(entry["source_uri"]).name] = entry

    assert directory_job.status == IngestionJobStatus.FAILED
    summary = directory_job.partial_results
    assert summary["succeeded"] == 0
    assert summary["failed"] == 1
    assert summary["quarantined"] == 1
    assert summary["retriable"] == 1

    quarantined_entry = by_filename["bad-normalized.txt"]
    assert quarantined_entry["status"] == IngestionJobStatus.QUARANTINED.value
    assert quarantined_entry["failures"][0]["code"] == "ingestion.normalized_empty"
    assert by_filename["unsupported.bin"]["retry_state"] == "retriable"
|
|
|
|
|
|
def test_sqlite_ingestion_jobs_survive_reinstantiation(tmp_path: Path) -> None:
    """Data written through one SQLite repository is readable from a second one on the same file."""
    policy_file = tmp_path / "policy.txt"
    policy_file.write_text("governed ingestion", encoding="utf-8")
    database_file = tmp_path / "registry.sqlite"
    writer_repo = SQLiteAssetRegistryRepository(database_file)
    ingestion = AssetIngestionService(writer_repo)

    outcome = ingestion.ingest_file(policy_file, operation_context(), asset_id="asset-policy")

    # A second repository object opened on the same database path.
    reader_repo = SQLiteAssetRegistryRepository(database_file)
    reloaded_job = reader_repo.get_ingestion_job(outcome.job.job_id)

    assert reloaded_job.status == IngestionJobStatus.COMPLETED
    assert reloaded_job.output_asset_ids == ("asset-policy",)
    reloaded_asset = reader_repo.get_asset("asset-policy")
    assert reloaded_asset.lifecycle == LifecycleState.ACTIVE
    stored_representations = reader_repo.list_representations(asset_id="asset-policy")
    assert len(stored_representations) == 2
|
|
|
|
|
|
def operation_context() -> OperationContext:
    """Build the operation context used by the ingestion tests.

    Returns:
        A context created for a human actor (``user-ingest``, member of the
        ``engineering`` group) with correlation id ``corr-ingest``.
    """
    actor_fields = {
        "actor_id": "user-ingest",
        "display_name": "Ingestion Tester",
        "groups": ["engineering"],
    }
    tester = Actor.create(ActorType.HUMAN, **actor_fields)
    return OperationContext.create(tester, correlation_id="corr-ingest")
|
|
|
|
|
|
class EmptyTextExtractor:
    """Test double: an extractor for ``text/plain`` that always returns empty text."""

    name = "empty-text"
    # The only media type this extractor advertises and accepts.
    _SUPPORTED_MEDIA_TYPE = "text/plain"

    def capabilities(self) -> ExtractorCapability:
        """Describe this extractor: its name, its single media type and ``text`` depth."""
        return ExtractorCapability(
            extractor_name=self.name,
            media_types=(self._SUPPORTED_MEDIA_TYPE,),
            extraction_depth="text",
        )

    def supports(self, media_type: str) -> bool:
        """Return True only when *media_type* is exactly ``text/plain``."""
        return media_type == self._SUPPORTED_MEDIA_TYPE

    def extract(self, payload: SourcePayload) -> ExtractionResult:
        """Return an extraction whose normalized text is empty, ignoring the payload content.

        The metadata still records the extractor name and the payload's digest
        and size.
        """
        empty_document = NormalizedDocument(text="", confidence=1.0)
        extraction_metadata = {
            "extractor": self.name,
            "source_digest": payload.content_digest,
            "source_size_bytes": payload.size_bytes,
        }
        return ExtractionResult(normalized=empty_document, metadata=extraction_metadata)
|
|
|
|
|
|
class PermissionedConnector:
    """Test double: a connector returning fixed text content with a permission context."""

    name = "local_file"

    def __init__(self, *, ingest_allowed: bool = True) -> None:
        """Store the ``ingest_allowed`` flag reported in every fetched payload."""
        self.ingest_allowed = ingest_allowed

    def capabilities(self) -> ConnectorCapability:
        """Describe this connector: file sources only, no directory support."""
        return ConnectorCapability(
            connector_name=self.name,
            source_types=("file",),
            supports_directories=False,
        )

    def fetch(self, source_uri: str) -> SourcePayload:
        """Return a canned ``text/plain`` payload for *source_uri* without touching the filesystem.

        The payload's permission context carries this connector's
        ``ingest_allowed`` flag and the ``engineering`` label.
        """
        raw = b"permissioned content"
        reference = SourceReference(
            source_system=self.name,
            path=source_uri,
            checksum=content_digest(raw),
            connector_ref=f"{self.name}:{source_uri}",
        )
        # Title and filename are both derived from the URI treated as a path.
        location = Path(source_uri)
        permissions = {
            "ingest_allowed": self.ingest_allowed,
            "labels": ["engineering"],
        }
        return SourcePayload(
            connector_name=self.name,
            source_uri=source_uri,
            source_ref=reference,
            media_type="text/plain",
            content=raw,
            title=location.stem,
            metadata={"filename": location.name},
            permission_context=permissions,
        )
|