# kontextual-engine/tests/test_asset_ingestion_service.py

from pathlib import Path

import pytest

from kontextual_engine import (
    Actor,
    ActorType,
    AssetIngestionService,
    Classification,
    IngestionIdentityPolicy,
    IngestionJobStatus,
    InMemoryAssetRegistryRepository,
    LifecycleState,
    OperationContext,
    RepresentationKind,
    Sensitivity,
    SQLiteAssetRegistryRepository,
)


def test_asset_ingestion_service_ingests_plain_text_file_as_governed_asset(tmp_path: Path) -> None:
    """Ingest a UTF-8 text file and check the job, asset, representations and audit trail."""
    note_file = tmp_path / "note.txt"
    note_file.write_text("hello\nworld\n", encoding="utf-8")
    registry = InMemoryAssetRegistryRepository()
    ingestion = AssetIngestionService(registry)

    outcome = ingestion.ingest_file(
        note_file,
        operation_context(),
        asset_id="asset-note",
        classification=Classification(asset_type="note", sensitivity=Sensitivity.INTERNAL),
    )

    # The job completes, carries the expected correlation id, and names the requested asset.
    job = outcome.job
    assert job.status == IngestionJobStatus.COMPLETED
    assert job.correlation_id == "corr-ingest"
    assert job.output_asset_ids == ("asset-note",)

    # The asset's first source reference points back at the local file.
    assert outcome.asset is not None
    origin = outcome.asset.source_refs[0]
    assert origin.source_system == "local_file"
    assert origin.path == str(note_file)

    # The job can be looked up in the repository by its id.
    persisted_job = registry.get_ingestion_job(job.job_id)
    assert persisted_job.status == IngestionJobStatus.COMPLETED

    # The stored representation kinds are exactly SOURCE and NORMALIZED.
    stored_kinds = {rep.kind for rep in registry.list_representations(asset_id="asset-note")}
    assert stored_kinds == {RepresentationKind.SOURCE, RepresentationKind.NORMALIZED}

    normalized_reps = registry.list_representations(asset_id="asset-note", kind=RepresentationKind.NORMALIZED)
    normalized = normalized_reps[0]
    assert normalized.media_type == "application/vnd.kontextual.normalized+json"
    assert normalized.metadata["extractor"] == "plain-text"

    # The first audit event listed for the asset is its creation.
    audit_trail = registry.list_audit_events(target="asset:asset-note")
    assert audit_trail[0].operation == "asset.create"


def test_ingestion_failure_records_job_without_trusting_unsupported_asset(tmp_path: Path) -> None:
    """Ingesting raw binary bytes must fail the job and leave the registry without assets."""
    blob_file = tmp_path / "blob.bin"
    blob_file.write_bytes(b"\x00\x01\x02")
    registry = InMemoryAssetRegistryRepository()
    ingestion = AssetIngestionService(registry)

    outcome = ingestion.ingest_file(blob_file, operation_context(), asset_id="asset-blob")

    # No asset is returned and the job is marked as failed.
    assert outcome.asset is None
    assert outcome.job.status == IngestionJobStatus.FAILED

    # The first recorded failure identifies the missing adapter and the detected media type.
    first_failure = outcome.job.failures[0]
    assert first_failure.code == "kontextual.adapter_unavailable"
    assert first_failure.details["media_type"] == "application/octet-stream"

    # Nothing was registered for the unsupported file.
    assert registry.list_assets() == []


def test_directory_ingestion_reports_partial_results(tmp_path: Path) -> None:
    """A directory holding one text file and one binary file yields a partially completed job."""
    text_file = tmp_path / "one.txt"
    text_file.write_text("one", encoding="utf-8")
    binary_file = tmp_path / "two.bin"
    binary_file.write_bytes(b"\x00\x01")
    registry = InMemoryAssetRegistryRepository()
    ingestion = AssetIngestionService(registry)

    job = ingestion.ingest_directory(tmp_path, operation_context(), recursive=False)

    assert job.status == IngestionJobStatus.PARTIALLY_COMPLETED

    # Per-file counters reported in the job's partial results.
    counts = job.partial_results
    assert counts["files_total"] == 2
    assert counts["succeeded"] == 1
    assert counts["failed"] == 1

    # One asset was produced and one failure was recorded.
    assert len(job.output_asset_ids) == 1
    assert len(job.failures) == 1


def test_ingestion_content_digest_identity_preserves_asset_across_file_move(tmp_path: Path) -> None:
    """With content-digest identity, a renamed file maps onto the same asset.

    The file is ingested, renamed, ingested again from its new path, and then
    ingested a third time unchanged; the expected actions are ``created``,
    ``updated`` and ``skipped`` respectively.
    """
    first_path = tmp_path / "original.txt"
    moved_path = tmp_path / "renamed.txt"
    first_path.write_text("same durable content\n", encoding="utf-8")
    repo = InMemoryAssetRegistryRepository()
    service = AssetIngestionService(repo)
    context = operation_context()

    def ingest_by_digest(path: Path):
        # Every ingestion in this test shares the context and the identity policy.
        return service.ingest_file(
            path,
            context,
            identity_policy=IngestionIdentityPolicy.CONTENT_DIGEST,
        )

    first = ingest_by_digest(first_path)
    first_path.rename(moved_path)
    moved = ingest_by_digest(moved_path)
    repeated = ingest_by_digest(moved_path)

    assert first.asset is not None
    assert moved.asset is not None
    assert repeated.asset is not None

    # Action reported for each of the three ingestions.
    assert first.action == "created"
    assert moved.action == "updated"
    assert repeated.action == "skipped"

    # All three results refer to a single asset in the registry.
    assert moved.asset.id == first.asset.id
    assert repeated.asset.id == first.asset.id
    assert len(repo.list_assets()) == 1

    # Both the original and the renamed path are kept as source references, in order.
    recorded_paths = [ref.path for ref in repo.get_asset(first.asset.id).source_refs]
    assert recorded_paths == [str(first_path), str(moved_path)]

    assert repeated.job.partial_results["reason"] == "unchanged_source"

    # Two versions and two audit events exist; the skipped pass added neither.
    version_sequences = [version.sequence for version in repo.list_versions(first.asset.id)]
    assert version_sequences == [1, 2]
    audited_operations = [event.operation for event in repo.list_audit_events(target=f"asset:{first.asset.id}")]
    assert audited_operations == ["asset.create", "asset.ingest.update"]


def test_directory_ingestion_reports_skipped_and_retry_state(tmp_path: Path) -> None:
    """A directory pass reports an already-ingested file as skipped and an unsupported file as retriable."""
    seen_file = tmp_path / "seen.txt"
    seen_file.write_text("skip me on the directory pass", encoding="utf-8")
    binary_file = tmp_path / "unsupported.bin"
    binary_file.write_bytes(b"\x00\x01")
    registry = InMemoryAssetRegistryRepository()
    ingestion = AssetIngestionService(registry)
    context = operation_context()

    # Ingest the text file on its own first so the directory pass meets it a second time.
    ingestion.ingest_file(seen_file, context)
    job = ingestion.ingest_directory(tmp_path, context, recursive=False)

    # Index the per-file entries by file name for direct lookup.
    by_name = {}
    for entry in job.partial_results["items"]:
        by_name[Path(entry["source_uri"]).name] = entry

    assert job.status == IngestionJobStatus.PARTIALLY_COMPLETED
    assert job.partial_results["succeeded"] == 0
    assert job.partial_results["skipped"] == 1
    assert job.partial_results["failed"] == 1

    seen_entry = by_name["seen.txt"]
    assert seen_entry["status"] == "skipped"
    assert seen_entry["action"] == "skipped"

    unsupported_entry = by_name["unsupported.bin"]
    assert unsupported_entry["status"] == IngestionJobStatus.FAILED.value
    assert unsupported_entry["retry_state"] == "retriable"
    assert unsupported_entry["failures"][0]["code"] == "kontextual.adapter_unavailable"


def test_asset_ingestion_service_ingests_csv_dataset_with_structured_table(tmp_path: Path) -> None:
    """Ingest a two-row CSV file and check the dataset metadata on its normalized representation."""
    csv_file = tmp_path / "metrics.csv"
    csv_file.write_text("name,score\nalpha,0.82\nbeta,0.91\n", encoding="utf-8")
    registry = InMemoryAssetRegistryRepository()
    ingestion = AssetIngestionService(registry)

    outcome = ingestion.ingest_file(
        csv_file,
        operation_context(),
        asset_id="asset-metrics",
        classification=Classification(asset_type="dataset", sensitivity=Sensitivity.INTERNAL),
    )

    normalized_reps = registry.list_representations(asset_id="asset-metrics", kind=RepresentationKind.NORMALIZED)
    table_metadata = normalized_reps[0].metadata

    assert outcome.job.status == IngestionJobStatus.COMPLETED
    assert outcome.job.partial_results["extractor"] == "csv-dataset"

    # Header row gives the columns; the two remaining lines are the data rows.
    assert table_metadata["dataset_format"] == "csv"
    assert table_metadata["columns"] == ["name", "score"]
    assert table_metadata["row_count"] == 2
    assert table_metadata["table_count"] == 1

    # Exactly one "extractor" metadata record is stored for the asset.
    extractor_values = [
        record.value
        for record in registry.list_metadata_records("asset-metrics")
        if record.key == "extractor"
    ]
    assert extractor_values == ["csv-dataset"]


@pytest.mark.parametrize(
    ("filename", "content", "media_type", "document_kind"),
    [
        ("source.pdf", b"%PDF-1.7\n", "application/pdf", "pdf"),
        (
            "source.docx",
            b"PK\x03\x04docx-placeholder",
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
            "office_document",
        ),
    ],
)
def test_document_placeholder_formats_create_asset_with_unsupported_depth_diagnostic(
    tmp_path: Path,
    filename: str,
    content: bytes,
    media_type: str,
    document_kind: str,
) -> None:
    """PDF and DOCX stubs are ingested as metadata-only assets with a depth diagnostic.

    Each parametrized case writes ``content`` to ``filename``, ingests it, and
    checks that the job completes while reporting that deep extraction is not
    available for ``media_type``.
    """
    source = tmp_path / filename
    source.write_bytes(content)
    registry = InMemoryAssetRegistryRepository()
    ingestion = AssetIngestionService(registry)
    # The asset id is derived from the file stem ("source" for both cases).
    asset_id = f"asset-{source.stem}"

    outcome = ingestion.ingest_file(
        source,
        operation_context(),
        asset_id=asset_id,
        classification=Classification(asset_type="document", sensitivity=Sensitivity.INTERNAL),
    )

    normalized_reps = registry.list_representations(asset_id=asset_id, kind=RepresentationKind.NORMALIZED)
    normalized = normalized_reps[0]

    assert outcome.job.status == IngestionJobStatus.COMPLETED
    assert outcome.asset is not None

    # The first diagnostic flags the unsupported extraction depth for this media type.
    assert outcome.job.partial_results["diagnostics"][0]["code"] == "extraction.depth_unsupported"
    assert outcome.job.partial_results["diagnostics"][0]["details"]["media_type"] == media_type

    # The normalized representation comes from the placeholder producer.
    assert normalized.producer == "document-placeholder"
    placeholder_metadata = normalized.metadata
    assert placeholder_metadata["document_kind"] == document_kind
    assert placeholder_metadata["extraction_depth"] == "metadata_only"
    assert placeholder_metadata["unsupported_elements"][0]["reason"] == "deep_extraction_not_available"

def test_sqlite_ingestion_jobs_survive_reinstantiation(tmp_path: Path) -> None:
    """State written through one SQLite repository is readable from a second one on the same file."""
    policy_file = tmp_path / "policy.txt"
    policy_file.write_text("governed ingestion", encoding="utf-8")
    database_file = tmp_path / "registry.sqlite"
    writer_repo = SQLiteAssetRegistryRepository(database_file)
    ingestion = AssetIngestionService(writer_repo)

    outcome = ingestion.ingest_file(policy_file, operation_context(), asset_id="asset-policy")

    # Open a second repository instance on the same database file and read everything back.
    reader_repo = SQLiteAssetRegistryRepository(database_file)
    persisted_job = reader_repo.get_ingestion_job(outcome.job.job_id)

    assert persisted_job.status == IngestionJobStatus.COMPLETED
    assert persisted_job.output_asset_ids == ("asset-policy",)

    persisted_asset = reader_repo.get_asset("asset-policy")
    assert persisted_asset.lifecycle == LifecycleState.ACTIVE

    persisted_reps = reader_repo.list_representations(asset_id="asset-policy")
    assert len(persisted_reps) == 2


def operation_context() -> OperationContext:
    """Build the operation context used by the tests in this module.

    The context is created for a human actor with id ``user-ingest`` in the
    ``engineering`` group, and always carries the correlation id
    ``corr-ingest``.
    """
    return OperationContext.create(
        Actor.create(
            ActorType.HUMAN,
            actor_id="user-ingest",
            display_name="Ingestion Tester",
            groups=["engineering"],
        ),
        correlation_id="corr-ingest",
    )