"""Tests for ``AssetIngestionService`` file and directory ingestion."""

from pathlib import Path

import pytest

from kontextual_engine import (
    Actor,
    ActorType,
    AssetIngestionService,
    Classification,
    ConnectorCapability,
    ExtractionResult,
    ExtractorCapability,
    IngestionIdentityPolicy,
    IngestionJobStatus,
    InMemoryAssetRegistryRepository,
    LifecycleState,
    NormalizedDocument,
    OperationContext,
    RepresentationKind,
    Sensitivity,
    SourcePayload,
    SourceReference,
    SQLiteAssetRegistryRepository,
    content_digest,
)


def test_asset_ingestion_service_ingests_plain_text_file_as_governed_asset(tmp_path: Path) -> None:
    """Ingest a two-line text file and check the job, asset, representations and first audit event."""
    note_path = tmp_path / "note.txt"
    note_path.write_text("hello\nworld\n", encoding="utf-8")
    registry = InMemoryAssetRegistryRepository()
    ingestion = AssetIngestionService(registry)

    context = operation_context()
    note_classification = Classification(asset_type="note", sensitivity=Sensitivity.INTERNAL)
    outcome = ingestion.ingest_file(
        note_path,
        context,
        asset_id="asset-note",
        classification=note_classification,
    )

    job = outcome.job
    assert job.status == IngestionJobStatus.COMPLETED
    assert job.correlation_id == "corr-ingest"
    assert job.output_asset_ids == ("asset-note",)

    asset = outcome.asset
    assert asset is not None
    first_ref = asset.source_refs[0]
    assert first_ref.source_system == "local_file"
    assert first_ref.path == str(note_path)
    assert registry.get_ingestion_job(job.job_id).status == IngestionJobStatus.COMPLETED

    stored_kinds = {rep.kind for rep in registry.list_representations(asset_id="asset-note")}
    assert stored_kinds == {RepresentationKind.SOURCE, RepresentationKind.NORMALIZED}

    normalized = registry.list_representations(asset_id="asset-note", kind=RepresentationKind.NORMALIZED)[0]
    assert normalized.media_type == "application/vnd.kontextual.normalized+json"
    metadata = normalized.metadata
    assert metadata["extractor"] == "plain-text"
    assert metadata["line_count"] == 2
    assert metadata["paragraph_count"] == 1
    assert metadata["link_count"] == 0

    audit_events = registry.list_audit_events(target="asset:asset-note")
    assert audit_events[0].operation == "asset.create"


def test_ingestion_failure_records_job_without_trusting_unsupported_asset(tmp_path: Path) -> None:
    """Ingest a ``.bin`` file and expect a FAILED job, no asset, and an adapter-unavailable failure."""
    blob_path = tmp_path / "blob.bin"
    blob_path.write_bytes(b"\x00\x01\x02")
    registry = InMemoryAssetRegistryRepository()
    ingestion = AssetIngestionService(registry)

    outcome = ingestion.ingest_file(blob_path, operation_context(), asset_id="asset-blob")

    assert outcome.asset is None
    job = outcome.job
    assert job.status == IngestionJobStatus.FAILED
    first_failure = job.failures[0]
    assert first_failure.code == "kontextual.adapter_unavailable"
    assert job.failures[0].details["media_type"] == "application/octet-stream"
    assert registry.list_assets() == []


def test_directory_ingestion_reports_partial_results(tmp_path: Path) -> None:
    """Ingest a directory holding one ``.txt`` and one ``.bin`` file and check the partial counts."""
    text_file = tmp_path / "one.txt"
    text_file.write_text("one", encoding="utf-8")
    binary_file = tmp_path / "two.bin"
    binary_file.write_bytes(b"\x00\x01")
    registry = InMemoryAssetRegistryRepository()
    ingestion = AssetIngestionService(registry)

    job = ingestion.ingest_directory(tmp_path, operation_context(), recursive=False)

    assert job.status == IngestionJobStatus.PARTIALLY_COMPLETED
    summary = job.partial_results
    assert summary["files_total"] == 2
    assert summary["succeeded"] == 1
    assert summary["failed"] == 1
    assert len(job.output_asset_ids) == 1
    assert len(job.failures) == 1


def test_ingestion_content_digest_identity_preserves_asset_across_file_move(tmp_path: Path) -> None:
    """Ingest a file, rename it, then ingest the new path twice under the content-digest policy.

    The three results are expected to report "created", "updated" and "skipped"
    against a single asset id.
    """
    original_path = tmp_path / "original.txt"
    renamed_path = tmp_path / "renamed.txt"
    original_path.write_text("same durable content\n", encoding="utf-8")
    registry = InMemoryAssetRegistryRepository()
    ingestion = AssetIngestionService(registry)
    context = operation_context()

    def ingest_by_digest(path: Path):
        # Every ingest in this test shares the same context and identity policy.
        return ingestion.ingest_file(
            path,
            context,
            identity_policy=IngestionIdentityPolicy.CONTENT_DIGEST,
        )

    first = ingest_by_digest(original_path)
    original_path.rename(renamed_path)
    moved = ingest_by_digest(renamed_path)
    repeated = ingest_by_digest(renamed_path)

    assert first.asset is not None
    assert moved.asset is not None
    assert repeated.asset is not None
    assert first.action == "created"
    assert moved.action == "updated"
    assert repeated.action == "skipped"

    asset_id = first.asset.id
    assert moved.asset.id == asset_id
    assert repeated.asset.id == asset_id
    assert len(registry.list_assets()) == 1

    recorded_paths = [ref.path for ref in registry.get_asset(asset_id).source_refs]
    assert recorded_paths == [
        str(original_path),
        str(renamed_path),
    ]
    assert repeated.job.partial_results["reason"] == "unchanged_source"

    version_sequences = [version.sequence for version in registry.list_versions(asset_id)]
    assert version_sequences == [1, 2]

    audit_operations = [event.operation for event in registry.list_audit_events(target=f"asset:{asset_id}")]
    assert audit_operations == [
        "asset.create",
        "asset.ingest.update",
    ]


def test_directory_ingestion_reports_skipped_and_retry_state(tmp_path: Path) -> None:
    """Pre-ingest one file, then ingest its directory and check skipped/failed item reporting."""
    seen_path = tmp_path / "seen.txt"
    unsupported_path = tmp_path / "unsupported.bin"
    seen_path.write_text("skip me on the directory pass", encoding="utf-8")
    unsupported_path.write_bytes(b"\x00\x01")
    registry = InMemoryAssetRegistryRepository()
    ingestion = AssetIngestionService(registry)
    context = operation_context()

    ingestion.ingest_file(seen_path, context)
    job = ingestion.ingest_directory(tmp_path, context, recursive=False)
    items = _items_by_filename(job)

    assert job.status == IngestionJobStatus.PARTIALLY_COMPLETED
    summary = job.partial_results
    assert summary["succeeded"] == 0
    assert summary["skipped"] == 1
    assert summary["failed"] == 1

    seen_item = items["seen.txt"]
    assert seen_item["status"] == "skipped"
    assert seen_item["action"] == "skipped"

    unsupported_item = items["unsupported.bin"]
    assert unsupported_item["status"] == IngestionJobStatus.FAILED.value
    assert unsupported_item["retry_state"] == "retriable"
    assert unsupported_item["failures"][0]["code"] == "kontextual.adapter_unavailable"


def test_asset_ingestion_service_ingests_csv_dataset_with_structured_table(tmp_path: Path) -> None:
    """Ingest a three-line CSV and check the dataset metadata on the normalized representation."""
    csv_path = tmp_path / "metrics.csv"
    csv_path.write_text("name,score\nalpha,0.82\nbeta,0.91\n", encoding="utf-8")
    registry = InMemoryAssetRegistryRepository()
    ingestion = AssetIngestionService(registry)

    context = operation_context()
    dataset_classification = Classification(asset_type="dataset", sensitivity=Sensitivity.INTERNAL)
    outcome = ingestion.ingest_file(
        csv_path,
        context,
        asset_id="asset-metrics",
        classification=dataset_classification,
    )
    normalized = registry.list_representations(asset_id="asset-metrics", kind=RepresentationKind.NORMALIZED)[0]

    job = outcome.job
    assert job.status == IngestionJobStatus.COMPLETED
    assert job.partial_results["extractor"] == "csv-dataset"

    metadata = normalized.metadata
    assert metadata["dataset_format"] == "csv"
    assert metadata["columns"] == ["name", "score"]
    assert metadata["row_count"] == 2
    assert metadata["table_count"] == 1

    extractor_values = [
        entry.value for entry in registry.list_metadata_records("asset-metrics") if entry.key == "extractor"
    ]
    assert extractor_values == ["csv-dataset"]


@pytest.mark.parametrize(
    ("filename", "content", "media_type", "document_kind"),
    [
        ("source.pdf", b"%PDF-1.7\n", "application/pdf", "pdf"),
        (
            "source.docx",
            b"PK\x03\x04docx-placeholder",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            "office_document",
        ),
    ],
)
def test_document_placeholder_formats_create_asset_with_unsupported_depth_diagnostic(
    tmp_path: Path,
    filename: str,
    content: bytes,
    media_type: str,
    document_kind: str,
) -> None:
    """Ingest a PDF/DOCX stub and check the depth-unsupported diagnostic and placeholder metadata."""
    document_path = tmp_path / filename
    document_path.write_bytes(content)
    registry = InMemoryAssetRegistryRepository()
    ingestion = AssetIngestionService(registry)

    context = operation_context()
    outcome = ingestion.ingest_file(
        document_path,
        context,
        asset_id=f"asset-{document_path.stem}",
        classification=Classification(asset_type="document", sensitivity=Sensitivity.INTERNAL),
    )
    normalized = registry.list_representations(
        asset_id=f"asset-{document_path.stem}",
        kind=RepresentationKind.NORMALIZED,
    )[0]

    assert outcome.job.status == IngestionJobStatus.COMPLETED
    assert outcome.asset is not None

    first_diagnostic = outcome.job.partial_results["diagnostics"][0]
    assert first_diagnostic["code"] == "extraction.depth_unsupported"
    assert first_diagnostic["details"]["media_type"] == media_type

    assert normalized.producer == "document-placeholder"
    metadata = normalized.metadata
    assert metadata["document_kind"] == document_kind
    assert metadata["extraction_depth"] == "metadata_only"
    assert metadata["unsupported_elements"][0]["reason"] == "deep_extraction_not_available"


def test_ingestion_quarantines_empty_normalized_output_without_asset(tmp_path: Path) -> None:
    """Ingest with ``EmptyTextExtractor`` and expect a QUARANTINED job with no asset stored."""
    text_path = tmp_path / "emptyish.txt"
    text_path.write_text("content that the bad extractor drops", encoding="utf-8")
    registry = InMemoryAssetRegistryRepository()
    ingestion = AssetIngestionService(registry, extractors=[EmptyTextExtractor()])

    outcome = ingestion.ingest_file(text_path, operation_context(), asset_id="asset-emptyish")

    assert outcome.asset is None
    assert outcome.action == "quarantined"
    job = outcome.job
    assert job.status == IngestionJobStatus.QUARANTINED
    assert job.failures[0].code == "ingestion.normalized_empty"
    assert job.partial_results["action"] == "quarantined"
    assert registry.list_assets() == []


def test_ingestion_preserves_source_permission_context_on_representations() -> None:
    """Ingest through ``PermissionedConnector`` and check the permission context is stored."""
    registry = InMemoryAssetRegistryRepository()
    ingestion = AssetIngestionService(registry, connectors=[PermissionedConnector()])

    outcome = ingestion.ingest_file("permissioned.txt", operation_context(), asset_id="asset-permissioned")

    source_rep = registry.list_representations(asset_id="asset-permissioned", kind=RepresentationKind.SOURCE)[0]
    normalized_rep = registry.list_representations(
        asset_id="asset-permissioned",
        kind=RepresentationKind.NORMALIZED,
    )[0]
    permission_records = [
        entry
        for entry in registry.list_metadata_records("asset-permissioned")
        if entry.key == "source_permission_context"
    ]

    assert outcome.job.status == IngestionJobStatus.COMPLETED
    expected_context = {
        "ingest_allowed": True,
        "labels": ["engineering"],
    }
    assert source_rep.metadata["permission_context"] == expected_context
    assert normalized_rep.metadata["permission_context"]["labels"] == ["engineering"]
    assert permission_records[0].value["ingest_allowed"] is True


def test_ingestion_quarantines_permission_denied_source_without_asset() -> None:
    """Ingest through a connector with ``ingest_allowed=False`` and expect quarantine, no asset."""
    registry = InMemoryAssetRegistryRepository()
    denying_connector = PermissionedConnector(ingest_allowed=False)
    ingestion = AssetIngestionService(registry, connectors=[denying_connector])

    outcome = ingestion.ingest_file("denied.txt", operation_context(), asset_id="asset-denied")

    assert outcome.asset is None
    job = outcome.job
    assert job.status == IngestionJobStatus.QUARANTINED
    assert job.failures[0].code == "ingestion.permission_denied"
    assert registry.list_assets() == []


def test_directory_ingestion_reports_quarantined_failed_and_retriable_counts(tmp_path: Path) -> None:
    """Ingest a directory with ``EmptyTextExtractor`` and check quarantined/failed/retriable counts."""
    quarantine_candidate = tmp_path / "bad-normalized.txt"
    quarantine_candidate.write_text("dropped", encoding="utf-8")
    unsupported_path = tmp_path / "unsupported.bin"
    unsupported_path.write_bytes(b"\x00\x01")
    registry = InMemoryAssetRegistryRepository()
    ingestion = AssetIngestionService(registry, extractors=[EmptyTextExtractor()])

    job = ingestion.ingest_directory(tmp_path, operation_context(), recursive=False)
    items = _items_by_filename(job)

    assert job.status == IngestionJobStatus.FAILED
    summary = job.partial_results
    assert summary["succeeded"] == 0
    assert summary["failed"] == 1
    assert summary["quarantined"] == 1
    assert summary["retriable"] == 1

    quarantined_item = items["bad-normalized.txt"]
    assert quarantined_item["status"] == IngestionJobStatus.QUARANTINED.value
    assert quarantined_item["failures"][0]["code"] == "ingestion.normalized_empty"
    assert items["unsupported.bin"]["retry_state"] == "retriable"


def test_sqlite_ingestion_jobs_survive_reinstantiation(tmp_path: Path) -> None:
    """Ingest into a SQLite repository, reopen the same database file, and read the job back."""
    policy_path = tmp_path / "policy.txt"
    policy_path.write_text("governed ingestion", encoding="utf-8")
    db_path = tmp_path / "registry.sqlite"
    registry = SQLiteAssetRegistryRepository(db_path)
    ingestion = AssetIngestionService(registry)

    outcome = ingestion.ingest_file(
        policy_path,
        operation_context(),
        asset_id="asset-policy",
    )

    # A second repository instance over the same file stands in for a restart.
    reopened = SQLiteAssetRegistryRepository(db_path)
    stored_job = reopened.get_ingestion_job(outcome.job.job_id)

    assert stored_job.status == IngestionJobStatus.COMPLETED
    assert stored_job.output_asset_ids == ("asset-policy",)
    assert reopened.get_asset("asset-policy").lifecycle == LifecycleState.ACTIVE
    assert len(reopened.list_representations(asset_id="asset-policy")) == 2


def operation_context() -> OperationContext:
    """Build the operation context shared by these tests (human actor, ``corr-ingest`` correlation id)."""
    tester = Actor.create(
        ActorType.HUMAN,
        actor_id="user-ingest",
        display_name="Ingestion Tester",
        groups=["engineering"],
    )
    return OperationContext.create(tester, correlation_id="corr-ingest")


def _items_by_filename(job) -> dict:
    """Index ``job.partial_results["items"]`` by the file name of each item's ``source_uri``."""
    indexed = {}
    for entry in job.partial_results["items"]:
        indexed[Path(entry["source_uri"]).name] = entry
    return indexed


class EmptyTextExtractor:
    """Extractor stub for ``text/plain`` whose normalized document always has empty text."""

    name = "empty-text"

    def capabilities(self) -> ExtractorCapability:
        """Describe this extractor as a ``text``-depth handler for ``text/plain``."""
        return ExtractorCapability(
            extractor_name=self.name,
            media_types=("text/plain",),
            extraction_depth="text",
        )

    def supports(self, media_type: str) -> bool:
        """Return True only for ``text/plain``."""
        return media_type == "text/plain"

    def extract(self, payload: SourcePayload) -> ExtractionResult:
        """Return an empty normalized document plus metadata copied from ``payload``."""
        empty_document = NormalizedDocument(text="", confidence=1.0)
        extraction_metadata = {
            "extractor": self.name,
            "source_digest": payload.content_digest,
            "source_size_bytes": payload.size_bytes,
        }
        return ExtractionResult(normalized=empty_document, metadata=extraction_metadata)


class PermissionedConnector:
    """Connector stub that returns fixed bytes and attaches a permission context to the payload."""

    name = "local_file"

    def __init__(self, *, ingest_allowed: bool = True) -> None:
        """Store the ``ingest_allowed`` flag reported in each payload's permission context."""
        self.ingest_allowed = ingest_allowed

    def capabilities(self) -> ConnectorCapability:
        """Describe this connector as a ``file`` source without directory support."""
        return ConnectorCapability(
            connector_name=self.name,
            source_types=("file",),
            supports_directories=False,
        )

    def fetch(self, source_uri: str) -> SourcePayload:
        """Build a ``text/plain`` payload for ``source_uri`` without reading anything from disk."""
        content = b"permissioned content"
        uri_path = Path(source_uri)
        reference = SourceReference(
            source_system=self.name,
            path=source_uri,
            checksum=content_digest(content),
            connector_ref=f"{self.name}:{source_uri}",
        )
        permissions = {
            "ingest_allowed": self.ingest_allowed,
            "labels": ["engineering"],
        }
        return SourcePayload(
            connector_name=self.name,
            source_uri=source_uri,
            source_ref=reference,
            media_type="text/plain",
            content=content,
            title=uri_path.stem,
            metadata={"filename": uri_path.name},
            permission_context=permissions,
        )