Add richer normalized structure and preserve source permission context

2026-05-06 13:43:16 +02:00
parent a4a4759ac4
commit 24cb3c5b6a
10 changed files with 636 additions and 15 deletions
--- a/tests/test_asset_ingestion_service.py
+++ b/tests/test_asset_ingestion_service.py
@@ -7,14 +7,21 @@ from kontextual_engine import (
    ActorType,
    AssetIngestionService,
    Classification,
+    ConnectorCapability,
+    ExtractionResult,
+    ExtractorCapability,
    IngestionIdentityPolicy,
    IngestionJobStatus,
    InMemoryAssetRegistryRepository,
    LifecycleState,
+    NormalizedDocument,
    OperationContext,
    RepresentationKind,
    Sensitivity,
+    SourcePayload,
+    SourceReference,
    SQLiteAssetRegistryRepository,
+    content_digest,
 )


@@ -45,6 +52,9 @@ def test_asset_ingestion_service_ingests_plain_text_file_as_governed_asset(tmp_p
    normalized = repo.list_representations(asset_id="asset-note", kind=RepresentationKind.NORMALIZED)[0]
    assert normalized.media_type == "application/vnd.kontextual.normalized+json"
    assert normalized.metadata["extractor"] == "plain-text"
+    assert normalized.metadata["line_count"] == 2
+    assert normalized.metadata["paragraph_count"] == 1
+    assert normalized.metadata["link_count"] == 0
    assert repo.list_audit_events(target="asset:asset-note")[0].operation == "asset.create"


@@ -219,6 +229,75 @@ def test_document_placeholder_formats_create_asset_with_unsupported_depth_diagno
    assert normalized.metadata["unsupported_elements"][0]["reason"] == "deep_extraction_not_available"


+def test_ingestion_quarantines_empty_normalized_output_without_asset(tmp_path: Path) -> None:
+    source = tmp_path / "emptyish.txt"
+    source.write_text("content that the bad extractor drops", encoding="utf-8")
+    repo = InMemoryAssetRegistryRepository()
+    service = AssetIngestionService(repo, extractors=[EmptyTextExtractor()])
+
+    result = service.ingest_file(source, operation_context(), asset_id="asset-emptyish")
+
+    assert result.asset is None
+    assert result.action == "quarantined"
+    assert result.job.status == IngestionJobStatus.QUARANTINED
+    assert result.job.failures[0].code == "ingestion.normalized_empty"
+    assert result.job.partial_results["action"] == "quarantined"
+    assert repo.list_assets() == []
+
+
+def test_ingestion_preserves_source_permission_context_on_representations() -> None:
+    repo = InMemoryAssetRegistryRepository()
+    service = AssetIngestionService(repo, connectors=[PermissionedConnector()])
+
+    result = service.ingest_file("permissioned.txt", operation_context(), asset_id="asset-permissioned")
+
+    source = repo.list_representations(asset_id="asset-permissioned", kind=RepresentationKind.SOURCE)[0]
+    normalized = repo.list_representations(asset_id="asset-permissioned", kind=RepresentationKind.NORMALIZED)[0]
+    permission_records = [
+        record for record in repo.list_metadata_records("asset-permissioned")
+        if record.key == "source_permission_context"
+    ]
+
+    assert result.job.status == IngestionJobStatus.COMPLETED
+    assert source.metadata["permission_context"] == {
+        "ingest_allowed": True,
+        "labels": ["engineering"],
+    }
+    assert normalized.metadata["permission_context"]["labels"] == ["engineering"]
+    assert permission_records[0].value["ingest_allowed"] is True
+
+
+def test_ingestion_quarantines_permission_denied_source_without_asset() -> None:
+    repo = InMemoryAssetRegistryRepository()
+    service = AssetIngestionService(repo, connectors=[PermissionedConnector(ingest_allowed=False)])
+
+    result = service.ingest_file("denied.txt", operation_context(), asset_id="asset-denied")
+
+    assert result.asset is None
+    assert result.job.status == IngestionJobStatus.QUARANTINED
+    assert result.job.failures[0].code == "ingestion.permission_denied"
+    assert repo.list_assets() == []
+
+
+def test_directory_ingestion_reports_quarantined_failed_and_retriable_counts(tmp_path: Path) -> None:
+    (tmp_path / "bad-normalized.txt").write_text("dropped", encoding="utf-8")
+    (tmp_path / "unsupported.bin").write_bytes(b"\x00\x01")
+    repo = InMemoryAssetRegistryRepository()
+    service = AssetIngestionService(repo, extractors=[EmptyTextExtractor()])
+
+    job = service.ingest_directory(tmp_path, operation_context(), recursive=False)
+    items = {Path(item["source_uri"]).name: item for item in job.partial_results["items"]}
+
+    assert job.status == IngestionJobStatus.FAILED
+    assert job.partial_results["succeeded"] == 0
+    assert job.partial_results["failed"] == 1
+    assert job.partial_results["quarantined"] == 1
+    assert job.partial_results["retriable"] == 1
+    assert items["bad-normalized.txt"]["status"] == IngestionJobStatus.QUARANTINED.value
+    assert items["bad-normalized.txt"]["failures"][0]["code"] == "ingestion.normalized_empty"
+    assert items["unsupported.bin"]["retry_state"] == "retriable"
+
+
 def test_sqlite_ingestion_jobs_survive_reinstantiation(tmp_path: Path) -> None:
    source = tmp_path / "policy.txt"
    source.write_text("governed ingestion", encoding="utf-8")
@@ -249,3 +328,62 @@ def operation_context() -> OperationContext:
        groups=["engineering"],
    )
    return OperationContext.create(actor, correlation_id="corr-ingest")
+
+
+class EmptyTextExtractor:
+    name = "empty-text"
+
+    def capabilities(self) -> ExtractorCapability:
+        return ExtractorCapability(
+            extractor_name=self.name,
+            media_types=("text/plain",),
+            extraction_depth="text",
+        )
+
+    def supports(self, media_type: str) -> bool:
+        return media_type == "text/plain"
+
+    def extract(self, payload: SourcePayload) -> ExtractionResult:
+        return ExtractionResult(
+            normalized=NormalizedDocument(text="", confidence=1.0),
+            metadata={
+                "extractor": self.name,
+                "source_digest": payload.content_digest,
+                "source_size_bytes": payload.size_bytes,
+            },
+        )
+
+
+class PermissionedConnector:
+    name = "local_file"
+
+    def __init__(self, *, ingest_allowed: bool = True) -> None:
+        self.ingest_allowed = ingest_allowed
+
+    def capabilities(self) -> ConnectorCapability:
+        return ConnectorCapability(
+            connector_name=self.name,
+            source_types=("file",),
+            supports_directories=False,
+        )
+
+    def fetch(self, source_uri: str) -> SourcePayload:
+        content = b"permissioned content"
+        return SourcePayload(
+            connector_name=self.name,
+            source_uri=source_uri,
+            source_ref=SourceReference(
+                source_system=self.name,
+                path=source_uri,
+                checksum=content_digest(content),
+                connector_ref=f"{self.name}:{source_uri}",
+            ),
+            media_type="text/plain",
+            content=content,
+            title=Path(source_uri).stem,
+            metadata={"filename": Path(source_uri).name},
+            permission_context={
+                "ingest_allowed": self.ingest_allowed,
+                "labels": ["engineering"],
+            },
+        )
--- a/tests/test_markitect_ingestion_adapter.py
+++ b/tests/test_markitect_ingestion_adapter.py
@@ -36,6 +36,11 @@ def test_markitect_markdown_extractor_delegates_to_markitect_tool(
        return SimpleNamespace(
            to_dict=lambda: {
                "frontmatter": {"status": "accepted"},
+                "blocks": [
+                    {"type": "heading", "text": "Decision", "line_start": 1, "heading_level": 1},
+                    {"type": "paragraph", "text": "Use Markitect.", "line_start": 3},
+                    {"type": "table", "text": "| A |\n| - |", "line_start": 5, "line_end": 6},
+                ],
                "headings": [{"level": 1, "text": "Decision", "line": 1}],
                "sections": [
                    {
@@ -43,6 +48,17 @@ def test_markitect_markdown_extractor_delegates_to_markitect_tool(
                        "blocks": [{"type": "paragraph", "text": "Use Markitect.", "line_start": 3}],
                    }
                ],
+                "tokens": [
+                    {
+                        "type": "inline",
+                        "children": [
+                            {
+                                "type": "link_open",
+                                "attrs": {"href": "https://example.test/decision"},
+                            }
+                        ],
+                    }
+                ],
            }
        )

@@ -73,8 +89,16 @@ def test_markitect_markdown_extractor_delegates_to_markitect_tool(
        ("snapshot_identity_for_file", f"{source}:default"),
    ]
    assert result.normalized.structure["frontmatter"] == {"status": "accepted"}
+    assert result.normalized.structure["blocks"][1]["type"] == "paragraph"
+    assert result.normalized.links == [
+        {"url": "https://example.test/decision", "kind": "markdown_link"}
+    ]
+    assert result.normalized.tables[0]["text"] == "| A |\n| - |"
+    assert result.normalized.fields["block_count"] == 3
    assert result.normalized.fields["heading_count"] == 1
    assert result.normalized.fields["section_count"] == 1
+    assert result.normalized.fields["link_count"] == 1
+    assert result.normalized.fields["table_count"] == 1
    assert result.metadata["snapshot"]["snapshot_id"] == "snapshot:decision"
    assert result.normalized.extractor_metadata["snapshot"]["parser"] == "markdown-it-py/commonmark"

--- a/tests/test_normalized_structure.py
+++ b/tests/test_normalized_structure.py
@@ -0,0 +1,91 @@
+from kontextual_engine import SourcePayload, SourceReference, content_digest
+from kontextual_engine.adapters.builtin_extractors import (
+    CsvDatasetExtractor,
+    DocumentPlaceholderExtractor,
+    PlainTextExtractor,
+)
+
+
+def test_plain_text_extractor_emits_structural_units_and_links() -> None:
+    payload = source_payload(
+        "Intro line\nwith https://example.test/ref\n\nSecond paragraph\n",
+        media_type="text/plain",
+    )
+
+    result = PlainTextExtractor().extract(payload)
+
+    assert result.normalized.structure["kind"] == "plain_text"
+    assert result.normalized.fields["line_count"] == 4
+    assert result.normalized.fields["paragraph_count"] == 2
+    assert result.normalized.links == [
+        {
+            "url": "https://example.test/ref",
+            "line": 2,
+            "start": 5,
+            "end": 29,
+        }
+    ]
+    assert result.metadata["link_count"] == 1
+
+
+def test_csv_dataset_extractor_emits_table_schema_samples_and_links() -> None:
+    payload = source_payload(
+        "name,source\nalpha,https://example.test/a\nbeta,\n",
+        media_type="text/csv",
+        filename="metrics.csv",
+    )
+
+    result = CsvDatasetExtractor().extract(payload)
+
+    assert result.normalized.structure["kind"] == "dataset"
+    assert result.normalized.structure["columns"] == [
+        {"name": "name", "index": 0},
+        {"name": "source", "index": 1},
+    ]
+    assert result.normalized.tables[0]["row_count"] == 2
+    assert result.normalized.tables[0]["rows"][0]["name"] == "alpha"
+    assert result.normalized.links == [
+        {
+            "url": "https://example.test/a",
+            "table": 0,
+            "row": 0,
+            "column": "source",
+        }
+    ]
+    assert result.metadata["sample_rows"][0]["source"] == "https://example.test/a"
+
+
+def test_document_placeholder_exposes_unsupported_structure() -> None:
+    payload = source_payload(b"%PDF-1.7\n", media_type="application/pdf", filename="brief.pdf")
+
+    result = DocumentPlaceholderExtractor().extract(payload)
+
+    assert result.normalized.structure["kind"] == "pdf"
+    assert result.normalized.fields["unsupported_count"] == 1
+    assert result.normalized.fields["link_count"] == 0
+    assert result.normalized.unsupported_elements[0]["reason"] == "deep_extraction_not_available"
+    assert result.diagnostics[0].code == "extraction.depth_unsupported"
+
+
+def source_payload(
+    content: str | bytes,
+    *,
+    media_type: str,
+    filename: str = "source.txt",
+) -> SourcePayload:
+    data = content.encode("utf-8") if isinstance(content, str) else content
+    source_ref = SourceReference(
+        source_system="test",
+        path=filename,
+        checksum=content_digest(data),
+        connector_ref=f"test:{filename}",
+    )
+    return SourcePayload(
+        connector_name="test",
+        source_uri=filename,
+        source_ref=source_ref,
+        media_type=media_type,
+        content=data,
+        title=filename.rsplit(".", maxsplit=1)[0],
+        metadata={"filename": filename},
+    )