richer normalized structure, permission context preservation

This commit is contained in:
2026-05-06 13:43:16 +02:00
parent a4a4759ac4
commit 24cb3c5b6a
10 changed files with 636 additions and 15 deletions

View File

@@ -4,6 +4,7 @@ from __future__ import annotations
import csv
import io
import re
from typing import Any
from kontextual_engine.core import ExtractionResult, ExtractorCapability, NormalizedDocument, SourcePayload
@@ -31,9 +32,10 @@ class CsvDatasetExtractor:
reader = csv.DictReader(io.StringIO(text), delimiter=delimiter)
columns = list(reader.fieldnames or [])
rows = [dict(row) for row in reader]
links = _links_from_rows(rows)
table = {
"name": payload.title,
"columns": columns,
"columns": [{"name": column, "index": index} for index, column in enumerate(columns)],
"rows": rows,
"row_count": len(rows),
}
@@ -44,6 +46,9 @@ class CsvDatasetExtractor:
"column_count": len(columns),
"row_count": len(rows),
"table_count": 1,
"link_count": len(links),
"links": links,
"sample_rows": rows[:5],
"source_digest": payload.content_digest,
"source_size_bytes": payload.size_bytes,
}
@@ -53,15 +58,19 @@ class CsvDatasetExtractor:
structure={
"kind": "dataset",
"format": metadata["dataset_format"],
"columns": columns,
"columns": [{"name": column, "index": index} for index, column in enumerate(columns)],
"row_count": len(rows),
"table_count": 1,
"sample_rows": rows[:5],
},
tables=[table],
links=links,
fields={
"columns": columns,
"column_count": len(columns),
"row_count": len(rows),
"dataset_format": metadata["dataset_format"],
"link_count": len(links),
},
confidence=0.95,
extractor_metadata={
@@ -77,3 +86,21 @@ def _delimiter_for(payload: SourcePayload) -> str:
if payload.media_type == "text/tab-separated-values" or filename.endswith(".tsv"):
return "\t"
return ","
def _links_from_rows(rows: list[dict[str, Any]]) -> list[dict[str, Any]]:
links: list[dict[str, Any]] = []
for row_index, row in enumerate(rows):
for column, value in row.items():
if not isinstance(value, str):
continue
for match in re.finditer(r"https?://[^\s<>)]+", value):
links.append(
{
"url": match.group(0),
"table": 0,
"row": row_index,
"column": column,
}
)
return links

View File

@@ -63,6 +63,9 @@ class DocumentPlaceholderExtractor:
"document_kind": document_kind,
"extraction_depth": "metadata_only",
"unsupported_elements": [unsupported],
"unsupported_count": 1,
"link_count": 0,
"table_count": 0,
"source_digest": payload.content_digest,
"source_size_bytes": payload.size_bytes,
}
@@ -72,11 +75,15 @@ class DocumentPlaceholderExtractor:
structure={
"kind": document_kind,
"extraction_depth": "metadata_only",
"unsupported_elements": [unsupported],
},
fields={
"document_kind": document_kind,
"source_media_type": payload.media_type,
"source_size_bytes": payload.size_bytes,
"unsupported_count": 1,
"link_count": 0,
"table_count": 0,
},
confidence=0.0,
unsupported_elements=[unsupported],

View File

@@ -2,6 +2,8 @@
from __future__ import annotations
import re
from kontextual_engine.core import ExtractionResult, ExtractorCapability, NormalizedDocument, SourcePayload
@@ -13,8 +15,8 @@ class PlainTextExtractor:
return ExtractorCapability(
extractor_name=self.name,
media_types=self.media_types,
extraction_depth="text",
produces_structure=False,
extraction_depth="text_structure",
produces_structure=True,
)
def supports(self, media_type: str) -> bool:
@@ -22,10 +24,23 @@ class PlainTextExtractor:
def extract(self, payload: SourcePayload) -> ExtractionResult:
text = payload.read_text()
lines = _lines(text)
paragraphs = _paragraphs(text)
links = _links(text)
normalized = NormalizedDocument(
title=payload.title,
text=text,
fields={"line_count": len(text.splitlines())},
structure={
"kind": "plain_text",
"lines": lines,
"paragraphs": paragraphs,
},
links=links,
fields={
"line_count": len(lines),
"paragraph_count": len(paragraphs),
"link_count": len(links),
},
confidence=1.0,
extractor_metadata={
"extractor": self.name,
@@ -36,7 +51,70 @@ class PlainTextExtractor:
normalized=normalized,
metadata={
"extractor": self.name,
"line_count": len(lines),
"paragraph_count": len(paragraphs),
"link_count": len(links),
"links": links,
"source_digest": payload.content_digest,
"source_size_bytes": payload.size_bytes,
},
)
def _lines(text: str) -> list[dict[str, int | str]]:
return [
{
"index": index,
"line_number": index + 1,
"text": line,
}
for index, line in enumerate(text.splitlines())
]
def _paragraphs(text: str) -> list[dict[str, int | str]]:
paragraphs: list[dict[str, int | str]] = []
current: list[str] = []
start_line: int | None = None
for index, line in enumerate(text.splitlines(), start=1):
if line.strip():
if start_line is None:
start_line = index
current.append(line)
continue
if current and start_line is not None:
paragraphs.append(
{
"index": len(paragraphs),
"line_start": start_line,
"line_end": index - 1,
"text": "\n".join(current),
}
)
current = []
start_line = None
if current and start_line is not None:
paragraphs.append(
{
"index": len(paragraphs),
"line_start": start_line,
"line_end": start_line + len(current) - 1,
"text": "\n".join(current),
}
)
return paragraphs
def _links(text: str) -> list[dict[str, int | str]]:
links: list[dict[str, int | str]] = []
for line_index, line in enumerate(text.splitlines(), start=1):
for match in re.finditer(r"https?://[^\s<>)]+", line):
links.append(
{
"url": match.group(0),
"line": line_index,
"start": match.start(),
"end": match.end(),
}
)
return links

View File

@@ -42,8 +42,11 @@ class MarkitectMarkdownExtractor:
document = self._parse_document(mkt, text, source_path)
serialized = document.to_dict() if hasattr(document, "to_dict") else {}
snapshot = self._snapshot(mkt, source_path)
links = _links_from_tokens(list(serialized.get("tokens", [])))
tables = _tables_from_blocks(list(serialized.get("blocks", [])))
structure = {
"frontmatter": dict(serialized.get("frontmatter", {})),
"blocks": list(serialized.get("blocks", [])),
"headings": list(serialized.get("headings", [])),
"sections": list(serialized.get("sections", [])),
}
@@ -51,10 +54,15 @@ class MarkitectMarkdownExtractor:
title=payload.title,
text=text,
structure=structure,
tables=tables,
links=links,
fields={
"frontmatter": dict(serialized.get("frontmatter", {})),
"block_count": len(structure["blocks"]),
"heading_count": len(structure["headings"]),
"section_count": len(structure["sections"]),
"table_count": len(tables),
"link_count": len(links),
},
confidence=1.0,
extractor_metadata={
@@ -68,7 +76,12 @@ class MarkitectMarkdownExtractor:
metadata={
"extractor": self.name,
"frontmatter": structure["frontmatter"],
"blocks": structure["blocks"],
"headings": structure["headings"],
"sections": structure["sections"],
"links": links,
"link_count": len(links),
"table_count": len(tables),
"snapshot": snapshot,
"source_digest": payload.content_digest,
"source_size_bytes": payload.size_bytes,
@@ -84,3 +97,52 @@ class MarkitectMarkdownExtractor:
if not source_path or not Path(source_path).exists() or not hasattr(mkt, "snapshot_identity_for_file"):
return {}
return mkt.snapshot_identity_for_file(Path(source_path), parse_options={"profile": "default"}).to_dict()
def _links_from_tokens(tokens: list[dict[str, Any]]) -> list[dict[str, Any]]:
links: list[dict[str, Any]] = []
for token in _walk_tokens(tokens):
if token.get("type") != "link_open":
continue
href = _attr_value(token.get("attrs"), "href")
if href:
links.append({"url": href, "kind": "markdown_link"})
return links
def _walk_tokens(tokens: list[dict[str, Any]]) -> list[dict[str, Any]]:
walked: list[dict[str, Any]] = []
for token in tokens:
walked.append(token)
children = token.get("children")
if isinstance(children, list):
walked.extend(_walk_tokens([child for child in children if isinstance(child, dict)]))
return walked
def _attr_value(attrs: Any, name: str) -> str | None:
if isinstance(attrs, dict):
value = attrs.get(name)
return str(value) if value is not None else None
if isinstance(attrs, list):
for item in attrs:
if isinstance(item, (list, tuple)) and len(item) == 2 and item[0] == name:
return str(item[1])
return None
def _tables_from_blocks(blocks: list[dict[str, Any]]) -> list[dict[str, Any]]:
tables: list[dict[str, Any]] = []
for index, block in enumerate(blocks):
if block.get("type") != "table":
continue
tables.append(
{
"index": len(tables),
"source_block_index": index,
"text": block.get("text", ""),
"line_start": block.get("line_start"),
"line_end": block.get("line_end"),
}
)
return tables

View File

@@ -2,7 +2,7 @@
from __future__ import annotations
from dataclasses import dataclass
from dataclasses import dataclass, replace
from pathlib import Path
from typing import Iterable
@@ -16,6 +16,7 @@ from kontextual_engine.adapters.markitect_tool import MarkitectMarkdownExtractor
from kontextual_engine.core import (
AssetRepresentation,
Classification,
ExtractionResult,
IngestionFailure,
IngestionIdentityPolicy,
IngestionJob,
@@ -178,14 +179,28 @@ class AssetIngestionService:
"failed": sum(1 for item in item_results if item["status"] == IngestionJobStatus.FAILED.value),
"quarantined": sum(1 for item in item_results if item["status"] == IngestionJobStatus.QUARANTINED.value),
"skipped": sum(1 for item in item_results if item["status"] == "skipped"),
"retriable": sum(1 for item in item_results if item.get("retry_state") == "retriable"),
"items": item_results,
}
failed_count = partial_results["failed"]
quarantined_count = partial_results["quarantined"]
if failures and output_asset_ids:
job = job.partially_completed(
output_asset_ids=tuple(output_asset_ids),
failures=tuple(failures),
partial_results=partial_results,
)
elif quarantined_count and not failed_count:
job = job.failed(
IngestionFailure(
code="ingestion.directory_quarantined",
message="Directory ingestion quarantined all non-skipped files",
retriable=False,
details=partial_results,
),
status=IngestionJobStatus.QUARANTINED,
partial_results=partial_results,
)
elif failures:
job = job.failed(
IngestionFailure(
@@ -224,6 +239,21 @@ class AssetIngestionService:
self.repository.save_ingestion_job(job)
extractor = self._extractor(payload.media_type)
extraction = extractor.extract(payload)
validation_failures = _validate_extraction(payload, extraction)
if validation_failures:
quarantined = _quarantined_job(
job,
validation_failures,
partial_results={
"action": "quarantined",
"connector": payload.connector_name,
"extractor": extraction.metadata.get("extractor"),
"source_digest": payload.content_digest,
"diagnostics": [failure.to_dict() for failure in validation_failures],
},
)
self.repository.save_ingestion_job(quarantined)
return AssetIngestionResult(quarantined, action="quarantined")
resolved_asset_id = asset_id or _stable_asset_id(payload, identity_policy)
existing_asset = _get_asset_or_none(self.repository, resolved_asset_id)
if existing_asset and skip_unchanged and _asset_has_source_reference(existing_asset, payload.source_ref):
@@ -254,6 +284,7 @@ class AssetIngestionService:
"connector": payload.connector_name,
"source_digest": payload.content_digest,
"source_size_bytes": payload.size_bytes,
"permission_context": dict(payload.permission_context),
**payload.metadata,
},
)
@@ -267,6 +298,7 @@ class AssetIngestionService:
metadata={
"extractor": extractor.name,
"normalized_hash": extraction.normalized.normalized_hash,
"permission_context": dict(payload.permission_context),
**extraction.metadata,
},
)
@@ -380,7 +412,7 @@ def _metadata_records(
extractor_name: str,
extraction_metadata: dict,
) -> list[MetadataRecord]:
return [
records = [
MetadataRecord("source_media_type", payload.media_type, provenance={"producer": payload.connector_name}),
MetadataRecord("source_digest", payload.content_digest, provenance={"producer": payload.connector_name}),
MetadataRecord("source_size_bytes", payload.size_bytes, provenance={"producer": payload.connector_name}),
@@ -388,6 +420,120 @@ def _metadata_records(
MetadataRecord("extractor", extractor_name, provenance={"producer": extractor_name}, confirmed=True),
MetadataRecord("extraction", dict(extraction_metadata), provenance={"producer": extractor_name}),
]
if payload.permission_context:
records.append(
MetadataRecord(
"source_permission_context",
dict(payload.permission_context),
provenance={"producer": payload.connector_name},
confirmed=True,
)
)
return records
def _validate_extraction(
payload: SourcePayload,
extraction: ExtractionResult,
) -> list[IngestionFailure]:
failures: list[IngestionFailure] = []
if payload.permission_context.get("ingest_allowed") is False:
failures.append(
IngestionFailure(
code="ingestion.permission_denied",
message="Source permission context does not allow ingestion",
retriable=False,
details={"permission_context": dict(payload.permission_context)},
)
)
if not payload.source_ref.source_system:
failures.append(
IngestionFailure(
code="ingestion.source_system_missing",
message="Source reference is missing source system",
retriable=False,
details={"source_uri": payload.source_uri},
)
)
if not payload.source_ref.checksum:
failures.append(
IngestionFailure(
code="ingestion.source_checksum_missing",
message="Source reference is missing checksum provenance",
retriable=False,
details={"source_uri": payload.source_uri},
)
)
elif payload.source_ref.checksum != payload.content_digest:
failures.append(
IngestionFailure(
code="ingestion.source_checksum_mismatch",
message="Source reference checksum does not match payload content",
retriable=True,
details={
"source_uri": payload.source_uri,
"source_checksum": payload.source_ref.checksum,
"payload_digest": payload.content_digest,
},
)
)
if not extraction.metadata.get("extractor"):
failures.append(
IngestionFailure(
code="ingestion.extractor_metadata_missing",
message="Extraction result is missing extractor provenance",
retriable=False,
details={"source_uri": payload.source_uri},
)
)
normalized = extraction.normalized
has_normalized_output = bool(
normalized.text.strip()
or normalized.structure
or normalized.tables
or normalized.links
or normalized.fields
or normalized.unsupported_elements
)
if not has_normalized_output:
failures.append(
IngestionFailure(
code="ingestion.normalized_empty",
message="Extraction produced no normalized content, structure, fields, or diagnostics",
retriable=False,
details={"source_uri": payload.source_uri, "media_type": payload.media_type},
)
)
if (
normalized.confidence is not None
and normalized.confidence < 0.2
and not normalized.unsupported_elements
):
failures.append(
IngestionFailure(
code="ingestion.extraction_low_confidence",
message="Extraction confidence is below trusted ingestion threshold",
retriable=False,
details={"confidence": normalized.confidence, "threshold": 0.2},
)
)
return failures
def _quarantined_job(
job: IngestionJob,
failures: list[IngestionFailure],
*,
partial_results: dict,
) -> IngestionJob:
quarantined = job.failed(
failures[0],
status=IngestionJobStatus.QUARANTINED,
partial_results=partial_results,
)
if len(failures) > 1:
quarantined = replace(quarantined, failures=tuple(failures))
return quarantined
def _failure_from_exception(exc: Exception) -> IngestionFailure: