default source-location identity and opt-in content-digest identity for file move/rename reconciliation, PDF/DOCX-style placeholder ingestion

2026-05-06 13:04:36 +02:00
parent 48dffedc09
commit a4a4759ac4
13 changed files with 724 additions and 39 deletions
--- a/tests/test_asset_ingestion_service.py
+++ b/tests/test_asset_ingestion_service.py
@@ -1,10 +1,13 @@
 from pathlib import Path

+import pytest
+
 from kontextual_engine import (
    Actor,
    ActorType,
    AssetIngestionService,
    Classification,
+    IngestionIdentityPolicy,
    IngestionJobStatus,
    InMemoryAssetRegistryRepository,
    LifecycleState,
@@ -76,6 +79,146 @@ def test_directory_ingestion_reports_partial_results(tmp_path: Path) -> None:
    assert len(job.failures) == 1


+def test_ingestion_content_digest_identity_preserves_asset_across_file_move(tmp_path: Path) -> None:
+    first_path = tmp_path / "original.txt"  # scenario: ingest, rename on disk, then ingest twice more
+    moved_path = tmp_path / "renamed.txt"
+    first_path.write_text("same durable content\n", encoding="utf-8")  # bytes never change in this test
+    repo = InMemoryAssetRegistryRepository()
+    service = AssetIngestionService(repo)
+    context = operation_context()
+
+    first = service.ingest_file(
+        first_path,
+        context,
+        identity_policy=IngestionIdentityPolicy.CONTENT_DIGEST,
+    )
+    first_path.rename(moved_path)  # same bytes, new path
+    moved = service.ingest_file(  # first ingest from the new path
+        moved_path,
+        context,
+        identity_policy=IngestionIdentityPolicy.CONTENT_DIGEST,
+    )
+    repeated = service.ingest_file(  # second ingest from the new path; nothing changed in between
+        moved_path,
+        context,
+        identity_policy=IngestionIdentityPolicy.CONTENT_DIGEST,
+    )
+
+    assert first.asset is not None
+    assert moved.asset is not None
+    assert repeated.asset is not None
+    assert first.action == "created"
+    assert moved.action == "updated"  # the new path updates the existing asset (ids compared below)
+    assert repeated.action == "skipped"
+    assert moved.asset.id == first.asset.id
+    assert repeated.asset.id == first.asset.id
+    assert len(repo.list_assets()) == 1  # all three ingests resolve to one asset
+    assert [source.path for source in repo.get_asset(first.asset.id).source_refs] == [
+        str(first_path),  # location at the first ingest
+        str(moved_path),  # location after the rename
+    ]
+    assert repeated.job.partial_results["reason"] == "unchanged_source"  # why the third ingest was skipped
+    assert [version.sequence for version in repo.list_versions(first.asset.id)] == [1, 2]  # skip adds no version
+    assert [event.operation for event in repo.list_audit_events(target=f"asset:{first.asset.id}")] == [
+        "asset.create",  # first ingest
+        "asset.ingest.update",  # ingest after the rename; no third event is expected for the skip
+    ]
+
+
+def test_directory_ingestion_reports_skipped_and_retry_state(tmp_path: Path) -> None:
+    already_seen = tmp_path / "seen.txt"  # ingested once before the directory pass
+    unsupported = tmp_path / "unsupported.bin"  # expected to fail during the directory pass
+    already_seen.write_text("skip me on the directory pass", encoding="utf-8")
+    unsupported.write_bytes(b"\x00\x01")
+    repo = InMemoryAssetRegistryRepository()
+    service = AssetIngestionService(repo)
+    context = operation_context()
+
+    service.ingest_file(already_seen, context)  # pre-ingest so the directory pass finds it unchanged
+    job = service.ingest_directory(tmp_path, context, recursive=False)  # sees both files
+
+    items = {Path(item["source_uri"]).name: item for item in job.partial_results["items"]}  # keyed by file name
+
+    assert job.status == IngestionJobStatus.PARTIALLY_COMPLETED  # one item skipped, one failed
+    assert job.partial_results["succeeded"] == 0
+    assert job.partial_results["skipped"] == 1
+    assert job.partial_results["failed"] == 1
+    assert items["seen.txt"]["status"] == "skipped"
+    assert items["seen.txt"]["action"] == "skipped"
+    assert items["unsupported.bin"]["status"] == IngestionJobStatus.FAILED.value
+    assert items["unsupported.bin"]["retry_state"] == "retriable"
+    assert items["unsupported.bin"]["failures"][0]["code"] == "kontextual.adapter_unavailable"
+
+
+def test_asset_ingestion_service_ingests_csv_dataset_with_structured_table(tmp_path: Path) -> None:
+    source = tmp_path / "metrics.csv"
+    source.write_text("name,score\nalpha,0.82\nbeta,0.91\n", encoding="utf-8")  # header plus two data rows
+    repo = InMemoryAssetRegistryRepository()
+    service = AssetIngestionService(repo)
+
+    result = service.ingest_file(
+        source,
+        operation_context(),
+        asset_id="asset-metrics",  # fixed id so the lookups below can reference it
+        classification=Classification(asset_type="dataset", sensitivity=Sensitivity.INTERNAL),
+    )
+
+    normalized = repo.list_representations(asset_id="asset-metrics", kind=RepresentationKind.NORMALIZED)[0]
+
+    assert result.job.status == IngestionJobStatus.COMPLETED
+    assert result.job.partial_results["extractor"] == "csv-dataset"
+    assert normalized.metadata["dataset_format"] == "csv"
+    assert normalized.metadata["columns"] == ["name", "score"]  # taken from the header row
+    assert normalized.metadata["row_count"] == 2  # the header row is not counted
+    assert normalized.metadata["table_count"] == 1
+    assert [record.value for record in repo.list_metadata_records("asset-metrics") if record.key == "extractor"] == [
+        "csv-dataset"  # extractor name is also stored as a metadata record, exactly once
+    ]
+
+
+@pytest.mark.parametrize(
+    ("filename", "content", "media_type", "document_kind"),
+    [
+        ("source.pdf", b"%PDF-1.7\n", "application/pdf", "pdf"),  # header line only, no document body
+        (
+            "source.docx",
+            b"PK\x03\x04docx-placeholder",  # zip signature plus filler bytes, not a real DOCX archive
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            "office_document",
+        ),
+    ],
+)
+def test_document_placeholder_formats_create_asset_with_unsupported_depth_diagnostic(
+    tmp_path: Path,
+    filename: str,
+    content: bytes,
+    media_type: str,  # media type expected in the diagnostic details
+    document_kind: str,  # value expected in the normalized metadata
+) -> None:
+    source = tmp_path / filename
+    source.write_bytes(content)
+    repo = InMemoryAssetRegistryRepository()  # fresh repository per parametrized case
+    service = AssetIngestionService(repo)
+
+    result = service.ingest_file(
+        source,
+        operation_context(),
+        asset_id=f"asset-{source.stem}",  # "asset-source" for both parametrized cases
+        classification=Classification(asset_type="document", sensitivity=Sensitivity.INTERNAL),
+    )
+
+    normalized = repo.list_representations(asset_id=f"asset-{source.stem}", kind=RepresentationKind.NORMALIZED)[0]
+
+    assert result.job.status == IngestionJobStatus.COMPLETED  # the job completes despite the diagnostic
+    assert result.asset is not None
+    assert result.job.partial_results["diagnostics"][0]["code"] == "extraction.depth_unsupported"
+    assert result.job.partial_results["diagnostics"][0]["details"]["media_type"] == media_type
+    assert normalized.producer == "document-placeholder"
+    assert normalized.metadata["document_kind"] == document_kind
+    assert normalized.metadata["extraction_depth"] == "metadata_only"
+    assert normalized.metadata["unsupported_elements"][0]["reason"] == "deep_extraction_not_available"
+
+
 def test_sqlite_ingestion_jobs_survive_reinstantiation(tmp_path: Path) -> None:
    source = tmp_path / "policy.txt"
    source.write_text("governed ingestion", encoding="utf-8")
--- a/tests/test_markitect_ingestion_adapter.py
+++ b/tests/test_markitect_ingestion_adapter.py
@@ -0,0 +1,97 @@
+import sys
+from pathlib import Path
+from types import SimpleNamespace
+
+import pytest
+
+from kontextual_engine import SourcePayload, SourceReference, content_digest
+from kontextual_engine.adapters.markitect_tool import MarkitectMarkdownExtractor
+from kontextual_engine.errors import AdapterUnavailableError
+
+
+def test_markitect_markdown_extractor_missing_dependency_is_structured(monkeypatch: pytest.MonkeyPatch) -> None:
+    monkeypatch.setitem(sys.modules, "markitect_tool", None)  # None entry makes the import raise ImportError
+    extractor = MarkitectMarkdownExtractor()
+    payload = markdown_payload("# Missing Adapter\n")  # in-memory payload, no backing file
+
+    with pytest.raises(AdapterUnavailableError) as exc_info:
+        extractor.extract(payload)
+
+    assert exc_info.value.details == {  # exact match: no extra detail keys are accepted
+        "adapter": "markitect-tool",
+        "media_type": "text/markdown",
+    }
+
+
+def test_markitect_markdown_extractor_delegates_to_markitect_tool(
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Path,
+) -> None:
+    source = tmp_path / "decision.md"
+    source.write_text("# Decision\n\nUse Markitect.\n", encoding="utf-8")
+    calls: list[tuple[str, str]] = []  # (function name, argument summary) in call order
+
+    def parse_markdown_file(path: Path) -> SimpleNamespace:  # fake; installed under sys.modules below
+        calls.append(("parse_markdown_file", str(path)))
+        return SimpleNamespace(
+            to_dict=lambda: {
+                "frontmatter": {"status": "accepted"},
+                "headings": [{"level": 1, "text": "Decision", "line": 1}],
+                "sections": [
+                    {
+                        "heading": {"level": 1, "text": "Decision", "line": 1},
+                        "blocks": [{"type": "paragraph", "text": "Use Markitect.", "line_start": 3}],
+                    }
+                ],
+            }
+        )
+
+    def snapshot_identity_for_file(path: Path, *, parse_options: dict) -> SimpleNamespace:  # second fake
+        calls.append(("snapshot_identity_for_file", f"{path}:{parse_options['profile']}"))
+        return SimpleNamespace(
+            to_dict=lambda: {
+                "snapshot_id": "snapshot:decision",
+                "content_hash": "sha256:decision",
+                "parser": "markdown-it-py/commonmark",
+            }
+        )
+
+    monkeypatch.setitem(  # install a fake markitect_tool module for the duration of this test
+        sys.modules,
+        "markitect_tool",
+        SimpleNamespace(
+            parse_markdown_file=parse_markdown_file,
+            parse_markdown=lambda text, source_path=None: None,  # stub; does not record calls
+            snapshot_identity_for_file=snapshot_identity_for_file,
+        ),
+    )
+
+    result = MarkitectMarkdownExtractor().extract(markdown_payload(source.read_text(encoding="utf-8"), source))
+
+    assert calls == [  # file-based parse first, then the snapshot lookup with profile "default"
+        ("parse_markdown_file", str(source)),
+        ("snapshot_identity_for_file", f"{source}:default"),
+    ]
+    assert result.normalized.structure["frontmatter"] == {"status": "accepted"}
+    assert result.normalized.fields["heading_count"] == 1  # the fake parse result has one heading
+    assert result.normalized.fields["section_count"] == 1  # and one section
+    assert result.metadata["snapshot"]["snapshot_id"] == "snapshot:decision"
+    assert result.normalized.extractor_metadata["snapshot"]["parser"] == "markdown-it-py/commonmark"
+
+
+def markdown_payload(markdown: str, path: Path | None = None) -> SourcePayload:  # shared test helper
+    data = markdown.encode("utf-8")  # the checksum and the payload content both use these bytes
+    source_ref = SourceReference(
+        source_system="local_file",
+        path=str(path) if path else None,  # no path when the payload is not backed by a file
+        checksum=content_digest(data),
+        connector_ref=f"local_file:{path}" if path else None,
+    )
+    return SourcePayload(
+        connector_name="local_file",
+        source_uri=str(path) if path else "memory://markdown",  # placeholder URI when no path is given
+        source_ref=source_ref,
+        media_type="text/markdown",
+        content=data,
+        title=path.stem if path else "Markdown",  # file stem as title, or a fixed fallback
+    )