richer normalized structure, permission context preservation

This commit is contained in:
2026-05-06 13:43:16 +02:00
parent a4a4759ac4
commit 24cb3c5b6a
10 changed files with 636 additions and 15 deletions

View File

@@ -7,14 +7,21 @@ from kontextual_engine import (
ActorType,
AssetIngestionService,
Classification,
ConnectorCapability,
ExtractionResult,
ExtractorCapability,
IngestionIdentityPolicy,
IngestionJobStatus,
InMemoryAssetRegistryRepository,
LifecycleState,
NormalizedDocument,
OperationContext,
RepresentationKind,
Sensitivity,
SourcePayload,
SourceReference,
SQLiteAssetRegistryRepository,
content_digest,
)
@@ -45,6 +52,9 @@ def test_asset_ingestion_service_ingests_plain_text_file_as_governed_asset(tmp_p
normalized = repo.list_representations(asset_id="asset-note", kind=RepresentationKind.NORMALIZED)[0]
assert normalized.media_type == "application/vnd.kontextual.normalized+json"
assert normalized.metadata["extractor"] == "plain-text"
assert normalized.metadata["line_count"] == 2
assert normalized.metadata["paragraph_count"] == 1
assert normalized.metadata["link_count"] == 0
assert repo.list_audit_events(target="asset:asset-note")[0].operation == "asset.create"
@@ -219,6 +229,75 @@ def test_document_placeholder_formats_create_asset_with_unsupported_depth_diagno
assert normalized.metadata["unsupported_elements"][0]["reason"] == "deep_extraction_not_available"
def test_ingestion_quarantines_empty_normalized_output_without_asset(tmp_path: Path) -> None:
    """An extractor that returns empty normalized text quarantines the job.

    The source file itself is non-empty; ``EmptyTextExtractor`` is what drops
    the content, so no asset may be registered for it.
    """
    input_file = tmp_path / "emptyish.txt"
    input_file.write_text("content that the bad extractor drops", encoding="utf-8")
    registry = InMemoryAssetRegistryRepository()
    ingestion = AssetIngestionService(registry, extractors=[EmptyTextExtractor()])

    outcome = ingestion.ingest_file(
        input_file,
        operation_context(),
        asset_id="asset-emptyish",
    )

    # Nothing is created; the outcome is reported as a quarantine.
    assert outcome.asset is None
    assert outcome.action == "quarantined"

    quarantined_job = outcome.job
    assert quarantined_job.status == IngestionJobStatus.QUARANTINED
    assert quarantined_job.failures[0].code == "ingestion.normalized_empty"
    assert quarantined_job.partial_results["action"] == "quarantined"

    # The registry must stay empty as well.
    assert registry.list_assets() == []
def test_ingestion_preserves_source_permission_context_on_representations() -> None:
    """The connector's permission context reaches representations and metadata records."""
    asset_id = "asset-permissioned"
    registry = InMemoryAssetRegistryRepository()
    ingestion = AssetIngestionService(registry, connectors=[PermissionedConnector()])

    outcome = ingestion.ingest_file("permissioned.txt", operation_context(), asset_id=asset_id)

    # First stored representation of each kind, source first, then normalized.
    source_repr, normalized_repr = (
        registry.list_representations(asset_id=asset_id, kind=kind)[0]
        for kind in (RepresentationKind.SOURCE, RepresentationKind.NORMALIZED)
    )
    all_records = registry.list_metadata_records(asset_id)
    permission_records = [
        entry for entry in all_records if entry.key == "source_permission_context"
    ]
    expected_context = {
        "ingest_allowed": True,
        "labels": ["engineering"],
    }

    assert outcome.job.status == IngestionJobStatus.COMPLETED
    assert source_repr.metadata["permission_context"] == expected_context
    assert normalized_repr.metadata["permission_context"]["labels"] == ["engineering"]
    assert permission_records[0].value["ingest_allowed"] is True
def test_ingestion_quarantines_permission_denied_source_without_asset() -> None:
    """A source whose permission context forbids ingestion is quarantined, not stored."""
    registry = InMemoryAssetRegistryRepository()
    denying_connector = PermissionedConnector(ingest_allowed=False)
    ingestion = AssetIngestionService(registry, connectors=[denying_connector])

    outcome = ingestion.ingest_file(
        "denied.txt",
        operation_context(),
        asset_id="asset-denied",
    )

    assert outcome.asset is None
    denied_job = outcome.job
    assert denied_job.status == IngestionJobStatus.QUARANTINED
    assert denied_job.failures[0].code == "ingestion.permission_denied"
    assert registry.list_assets() == []
def test_directory_ingestion_reports_quarantined_failed_and_retriable_counts(tmp_path: Path) -> None:
    """A directory job tallies quarantined, failed and retriable items separately.

    Two files are ingested: a text file whose content the extractor drops, and
    a binary file with an unsupported extension.
    """
    (tmp_path / "bad-normalized.txt").write_text("dropped", encoding="utf-8")
    (tmp_path / "unsupported.bin").write_bytes(b"\x00\x01")
    registry = InMemoryAssetRegistryRepository()
    ingestion = AssetIngestionService(registry, extractors=[EmptyTextExtractor()])

    job = ingestion.ingest_directory(tmp_path, operation_context(), recursive=False)

    # Index the per-item reports by file name so each can be inspected directly.
    items = {}
    for report in job.partial_results["items"]:
        items[Path(report["source_uri"]).name] = report

    assert job.status == IngestionJobStatus.FAILED

    expected_counts = {
        "succeeded": 0,
        "failed": 1,
        "quarantined": 1,
        "retriable": 1,
    }
    for counter, expected in expected_counts.items():
        assert job.partial_results[counter] == expected, counter

    quarantined_item = items["bad-normalized.txt"]
    assert quarantined_item["status"] == IngestionJobStatus.QUARANTINED.value
    assert quarantined_item["failures"][0]["code"] == "ingestion.normalized_empty"
    assert items["unsupported.bin"]["retry_state"] == "retriable"
def test_sqlite_ingestion_jobs_survive_reinstantiation(tmp_path: Path) -> None:
source = tmp_path / "policy.txt"
source.write_text("governed ingestion", encoding="utf-8")
@@ -249,3 +328,62 @@ def operation_context() -> OperationContext:
groups=["engineering"],
)
return OperationContext.create(actor, correlation_id="corr-ingest")
class EmptyTextExtractor:
    """Test double: a ``text/plain`` extractor that always yields empty normalized text."""

    name = "empty-text"

    def capabilities(self) -> ExtractorCapability:
        """Advertise ``text/plain`` support at ``text`` extraction depth."""
        capability = ExtractorCapability(
            extractor_name=self.name,
            media_types=("text/plain",),
            extraction_depth="text",
        )
        return capability

    def supports(self, media_type: str) -> bool:
        """Return whether ``media_type`` is exactly ``text/plain``."""
        is_plain_text = media_type == "text/plain"
        return is_plain_text

    def extract(self, payload: SourcePayload) -> ExtractionResult:
        """Discard the payload content and return an empty document.

        Only the payload's digest and size are carried over, as metadata.
        """
        empty_document = NormalizedDocument(text="", confidence=1.0)
        extraction_metadata = {
            "extractor": self.name,
            "source_digest": payload.content_digest,
            "source_size_bytes": payload.size_bytes,
        }
        return ExtractionResult(normalized=empty_document, metadata=extraction_metadata)
class PermissionedConnector:
    """Test double connector that stamps a permission context on every payload.

    It never reads from disk: ``fetch`` always returns the same fixed bytes for
    any ``source_uri``.
    """

    name = "local_file"

    def __init__(self, *, ingest_allowed: bool = True) -> None:
        """Remember the ``ingest_allowed`` flag that ``fetch`` will report."""
        self.ingest_allowed = ingest_allowed

    def capabilities(self) -> ConnectorCapability:
        """Advertise a file-only connector without directory support."""
        capability = ConnectorCapability(
            connector_name=self.name,
            source_types=("file",),
            supports_directories=False,
        )
        return capability

    def fetch(self, source_uri: str) -> SourcePayload:
        """Build a plain-text payload for ``source_uri`` carrying the permission context."""
        body = b"permissioned content"
        reference = SourceReference(
            source_system=self.name,
            path=source_uri,
            checksum=content_digest(body),
            connector_ref=f"{self.name}:{source_uri}",
        )
        # Title and filename are both derived from the URI's final path component.
        uri_path = Path(source_uri)
        permissions = {
            "ingest_allowed": self.ingest_allowed,
            "labels": ["engineering"],
        }
        return SourcePayload(
            connector_name=self.name,
            source_uri=source_uri,
            source_ref=reference,
            media_type="text/plain",
            content=body,
            title=uri_path.stem,
            metadata={"filename": uri_path.name},
            permission_context=permissions,
        )

View File

@@ -36,6 +36,11 @@ def test_markitect_markdown_extractor_delegates_to_markitect_tool(
return SimpleNamespace(
to_dict=lambda: {
"frontmatter": {"status": "accepted"},
"blocks": [
{"type": "heading", "text": "Decision", "line_start": 1, "heading_level": 1},
{"type": "paragraph", "text": "Use Markitect.", "line_start": 3},
{"type": "table", "text": "| A |\n| - |", "line_start": 5, "line_end": 6},
],
"headings": [{"level": 1, "text": "Decision", "line": 1}],
"sections": [
{
@@ -43,6 +48,17 @@ def test_markitect_markdown_extractor_delegates_to_markitect_tool(
"blocks": [{"type": "paragraph", "text": "Use Markitect.", "line_start": 3}],
}
],
"tokens": [
{
"type": "inline",
"children": [
{
"type": "link_open",
"attrs": {"href": "https://example.test/decision"},
}
],
}
],
}
)
@@ -73,8 +89,16 @@ def test_markitect_markdown_extractor_delegates_to_markitect_tool(
("snapshot_identity_for_file", f"{source}:default"),
]
assert result.normalized.structure["frontmatter"] == {"status": "accepted"}
assert result.normalized.structure["blocks"][1]["type"] == "paragraph"
assert result.normalized.links == [
{"url": "https://example.test/decision", "kind": "markdown_link"}
]
assert result.normalized.tables[0]["text"] == "| A |\n| - |"
assert result.normalized.fields["block_count"] == 3
assert result.normalized.fields["heading_count"] == 1
assert result.normalized.fields["section_count"] == 1
assert result.normalized.fields["link_count"] == 1
assert result.normalized.fields["table_count"] == 1
assert result.metadata["snapshot"]["snapshot_id"] == "snapshot:decision"
assert result.normalized.extractor_metadata["snapshot"]["parser"] == "markdown-it-py/commonmark"

View File

@@ -0,0 +1,91 @@
from kontextual_engine import SourcePayload, SourceReference, content_digest
from kontextual_engine.adapters.builtin_extractors import (
CsvDatasetExtractor,
DocumentPlaceholderExtractor,
PlainTextExtractor,
)
def test_plain_text_extractor_emits_structural_units_and_links() -> None:
    """Plain text yields line/paragraph counts and a positioned link entry."""
    text = "Intro line\nwith https://example.test/ref\n\nSecond paragraph\n"
    payload = source_payload(text, media_type="text/plain")

    extraction = PlainTextExtractor().extract(payload)

    document = extraction.normalized
    # The URL sits on the second line of the input, after the 5-character "with ".
    expected_link = {
        "url": "https://example.test/ref",
        "line": 2,
        "start": 5,
        "end": 29,
    }
    assert document.structure["kind"] == "plain_text"
    assert document.fields["line_count"] == 4
    assert document.fields["paragraph_count"] == 2
    assert document.links == [expected_link]
    assert extraction.metadata["link_count"] == 1
def test_csv_dataset_extractor_emits_table_schema_samples_and_links() -> None:
    """CSV input yields a column schema, a table with rows, cell links and sample rows."""
    csv_text = "name,source\nalpha,https://example.test/a\nbeta,\n"
    payload = source_payload(csv_text, media_type="text/csv", filename="metrics.csv")

    extraction = CsvDatasetExtractor().extract(payload)

    document = extraction.normalized
    expected_columns = [
        {"name": "name", "index": 0},
        {"name": "source", "index": 1},
    ]
    assert document.structure["kind"] == "dataset"
    assert document.structure["columns"] == expected_columns

    # The header line is not counted: two data rows follow it.
    first_table = document.tables[0]
    assert first_table["row_count"] == 2
    assert first_table["rows"][0]["name"] == "alpha"

    # Only the first data row has a URL, in its "source" column.
    expected_link = {
        "url": "https://example.test/a",
        "table": 0,
        "row": 0,
        "column": "source",
    }
    assert document.links == [expected_link]
    assert extraction.metadata["sample_rows"][0]["source"] == "https://example.test/a"
def test_document_placeholder_exposes_unsupported_structure() -> None:
    """A PDF payload is reported as unsupported-depth content with a diagnostic."""
    pdf_payload = source_payload(
        b"%PDF-1.7\n",
        media_type="application/pdf",
        filename="brief.pdf",
    )

    extraction = DocumentPlaceholderExtractor().extract(pdf_payload)

    document = extraction.normalized
    assert document.structure["kind"] == "pdf"
    assert document.fields["unsupported_count"] == 1
    assert document.fields["link_count"] == 0

    first_unsupported = document.unsupported_elements[0]
    assert first_unsupported["reason"] == "deep_extraction_not_available"
    assert extraction.diagnostics[0].code == "extraction.depth_unsupported"
def source_payload(
    content: str | bytes,
    *,
    media_type: str,
    filename: str = "source.txt",
) -> SourcePayload:
    """Build a ``SourcePayload`` for extractor tests.

    Args:
        content: Payload body; ``str`` input is encoded as UTF-8, ``bytes`` is
            used as-is.
        media_type: Media type recorded on the payload.
        filename: Used as the source URI, the reference path and the
            ``filename`` metadata entry; the title is the part before the
            last ``.``.

    Returns:
        A payload attributed to the ``test`` connector, with a checksum
        computed from the encoded content.
    """
    if isinstance(content, str):
        raw_bytes = content.encode("utf-8")
    else:
        raw_bytes = content

    return SourcePayload(
        connector_name="test",
        source_uri=filename,
        source_ref=SourceReference(
            source_system="test",
            path=filename,
            checksum=content_digest(raw_bytes),
            connector_ref=f"test:{filename}",
        ),
        media_type=media_type,
        content=raw_bytes,
        # Strip only the final extension; a name without a dot is kept whole.
        title=filename.rsplit(".", 1)[0],
        metadata={"filename": filename},
    )