first ingestion/normalization slice

This commit is contained in:
2026-05-06 02:35:40 +02:00
parent 286ebc3cb6
commit 565a5643a3
19 changed files with 1231 additions and 10 deletions

View File

@@ -22,15 +22,22 @@ from .core import (
AuditEvent,
AuditOutcome,
Classification,
ConnectorCapability,
ContextEntity,
ContextEntityType,
CoreRelationship,
DerivedArtifactLineage,
ExtractionResult,
ExtractorCapability,
IdempotencyRecord,
IdempotencyStatus,
IngestionFailure,
IngestionJob,
IngestionJobStatus,
KnowledgeAsset,
LifecycleState,
MetadataRecord,
NormalizedDocument,
OperationContext,
PolicyDecision,
PolicyEffect,
@@ -38,6 +45,7 @@ from .core import (
RepresentationKind,
Sensitivity,
SourceReference,
SourcePayload,
VersionChangeType,
)
from .errors import (
@@ -50,10 +58,23 @@ from .errors import (
ValidationError,
)
from .ingestion import IngestionRequest, IngestionResult, IngestionService
from .ports import AllowAllPolicyGateway, AssetRegistryRepository, PolicyGateway
from .ports import (
AllowAllPolicyGateway,
AssetRegistryRepository,
DirectorySourceConnector,
FormatExtractor,
PolicyGateway,
SourceConnector,
)
from .query import QueryEngine, QueryResult
from .relationships import RelationshipGraph
from .services import AssetChangeResult, AssetRegistryService, RelationshipChangeResult
from .services import (
AssetChangeResult,
AssetIngestionResult,
AssetIngestionService,
AssetRegistryService,
RelationshipChangeResult,
)
from .storage import InMemoryKnowledgeRepository
from .workflows import (
InputBundle,
@@ -76,6 +97,8 @@ __all__ = [
"ActorType",
"AssetRepresentation",
"AssetChangeResult",
"AssetIngestionResult",
"AssetIngestionService",
"AssetRegistryRepository",
"AssetRegistryService",
"AssetVersion",
@@ -83,6 +106,7 @@ __all__ = [
"AuditOutcome",
"AuthorizationError",
"Classification",
"ConnectorCapability",
"Collection",
"ContextAssembler",
"ContextEntity",
@@ -92,12 +116,19 @@ __all__ = [
"CoreRelationship",
"DerivedArtifactLineage",
"Diagnostic",
"DirectorySourceConnector",
"DuplicateResourceError",
"ExtractionResult",
"ExtractorCapability",
"FormatExtractor",
"InMemoryAssetRegistryRepository",
"InMemoryKnowledgeRepository",
"IngestionRequest",
"IngestionResult",
"IngestionService",
"IngestionFailure",
"IngestionJob",
"IngestionJobStatus",
"InputBundle",
"IdempotencyRecord",
"IdempotencyStatus",
@@ -105,6 +136,7 @@ __all__ = [
"KontextualError",
"LifecycleState",
"MetadataRecord",
"NormalizedDocument",
"NotFoundError",
"OperationRun",
"OperationStage",
@@ -124,6 +156,8 @@ __all__ = [
"RunStatus",
"Sensitivity",
"SourceReference",
"SourceConnector",
"SourcePayload",
"SQLiteAssetRegistryRepository",
"ValidationError",
"VersionChangeType",

View File

@@ -0,0 +1,5 @@
"""Built-in baseline format extractors."""
from .text import PlainTextExtractor
__all__ = ["PlainTextExtractor"]

View File

@@ -0,0 +1,42 @@
"""Plain text normalization extractor."""
from __future__ import annotations
from kontextual_engine.core import ExtractionResult, ExtractorCapability, NormalizedDocument, SourcePayload
class PlainTextExtractor:
    """Baseline extractor that wraps decoded plain text in a normalized document."""

    name = "plain-text"
    media_types = ("text/plain",)

    def capabilities(self) -> ExtractorCapability:
        """Describe this extractor: text-depth extraction that produces no structure."""
        return ExtractorCapability(
            extractor_name=self.name,
            media_types=self.media_types,
            extraction_depth="text",
            produces_structure=False,
        )

    def supports(self, media_type: str) -> bool:
        """Return True for a listed media type or any value starting with ``text/plain``."""
        if media_type in self.media_types:
            return True
        # The prefix match also accepts parameterised values such as
        # "text/plain; charset=utf-8".
        return media_type.startswith("text/plain")

    def extract(self, payload: SourcePayload) -> ExtractionResult:
        """Decode the payload and return its text plus a line count.

        The text is obtained through ``payload.read_text()``; any decoding
        error raised there propagates to the caller.
        """
        body = payload.read_text()
        document = NormalizedDocument(
            title=payload.title,
            text=body,
            fields={"line_count": len(body.splitlines())},
            confidence=1.0,
            extractor_metadata={
                "extractor": self.name,
                "source_media_type": payload.media_type,
            },
        )
        result_metadata = {
            "extractor": self.name,
            "source_digest": payload.content_digest,
            "source_size_bytes": payload.size_bytes,
        }
        return ExtractionResult(normalized=document, metadata=result_metadata)

View File

@@ -0,0 +1,5 @@
"""Local filesystem ingestion connector."""
from .connector import LocalFileConnector
__all__ = ["LocalFileConnector"]

View File

@@ -0,0 +1,77 @@
"""Local file and directory source connector."""
from __future__ import annotations
import mimetypes
from pathlib import Path
from typing import Any
from kontextual_engine.core import ConnectorCapability, SourcePayload, SourceReference, content_digest
from kontextual_engine.errors import NotFoundError, ValidationError
class LocalFileConnector:
    """Source connector that reads files and lists directories on the local filesystem."""

    name = "local_file"

    def capabilities(self) -> ConnectorCapability:
        """Describe the connector: file and directory sources, directory listing supported."""
        # NOTE(review): "file" is advertised as a URI scheme, but fetch() and
        # iter_files() hand source_uri straight to Path(); "file://" URLs are
        # not parsed here - confirm callers strip the scheme first.
        return ConnectorCapability(
            connector_name=self.name,
            source_types=("file", "directory"),
            supports_directories=True,
            metadata={"uri_schemes": ["file", "path"]},
        )

    def fetch(self, source_uri: str) -> SourcePayload:
        """Read one local file and wrap its bytes in a ``SourcePayload``.

        Args:
            source_uri: Filesystem path; a leading ``~`` is expanded.

        Raises:
            NotFoundError: The path does not exist.
            ValidationError: The path exists but is not a regular file.
        """
        path = Path(source_uri).expanduser()
        if not path.exists():
            raise NotFoundError("Local source file not found", details={"path": str(path)})
        if not path.is_file():
            raise ValidationError("Local source is not a file", details={"path": str(path)})
        content = path.read_bytes()
        media_type = _guess_media_type(path)
        # Stat once so the source reference and the payload always report the
        # same size/mtime, even if the file changes while we are building them.
        file_metadata = _file_metadata(path)
        source_ref = SourceReference(
            source_system=self.name,
            path=str(path),
            checksum=content_digest(content),
            connector_ref=f"{self.name}:{path.resolve()}",
            metadata=dict(file_metadata),
        )
        return SourcePayload(
            connector_name=self.name,
            source_uri=str(path),
            source_ref=source_ref,
            media_type=media_type,
            content=content,
            title=path.stem,
            metadata={"filename": path.name, **file_metadata},
        )

    def iter_files(self, source_uri: str, *, recursive: bool = True) -> list[str]:
        """Return the sorted paths of all regular files under ``source_uri``.

        A path that is itself a file yields a single-element list.

        Args:
            source_uri: Filesystem path; a leading ``~`` is expanded.
            recursive: Descend into subdirectories when True, otherwise list
                only the direct children.

        Raises:
            NotFoundError: The path does not exist.
            ValidationError: The path is neither a file nor a directory.
        """
        root = Path(source_uri).expanduser()
        if not root.exists():
            raise NotFoundError("Local source directory not found", details={"path": str(root)})
        if root.is_file():
            return [str(root)]
        if not root.is_dir():
            raise ValidationError("Local source is not a directory", details={"path": str(root)})
        pattern = "**/*" if recursive else "*"
        return sorted(str(path) for path in root.glob(pattern) if path.is_file())
def _guess_media_type(path: Path) -> str:
suffix = path.suffix.lower()
if suffix in {".md", ".markdown", ".mkd"}:
return "text/markdown"
if suffix in {".txt", ".text", ".log"}:
return "text/plain"
guessed, _ = mimetypes.guess_type(path.name)
return guessed or "application/octet-stream"
def _file_metadata(path: Path) -> dict[str, Any]:
stat = path.stat()
return {
"size_bytes": stat.st_size,
"mtime_ns": stat.st_mtime_ns,
}

View File

@@ -0,0 +1,5 @@
"""markitect-tool ingestion adapter boundary."""
from .markdown import MarkitectMarkdownExtractor
__all__ = ["MarkitectMarkdownExtractor"]

View File

@@ -0,0 +1,86 @@
"""Markdown normalization through markitect-tool."""
from __future__ import annotations
from pathlib import Path
from typing import Any
from kontextual_engine.core import ExtractionResult, ExtractorCapability, NormalizedDocument, SourcePayload
from kontextual_engine.errors import AdapterUnavailableError
class MarkitectMarkdownExtractor:
    """Adapter boundary to markitect-tool; Markdown syntax logic stays external."""

    name = "markitect-tool"
    media_types = ("text/markdown", "text/x-markdown")

    def capabilities(self) -> ExtractorCapability:
        """Describe this extractor: structure-depth extraction backed by an optional dependency."""
        return ExtractorCapability(
            extractor_name=self.name,
            media_types=self.media_types,
            extraction_depth="structure",
            produces_structure=True,
            optional_dependency="markitect-tool",
            metadata={"delegates_markdown_syntax": True},
        )

    def supports(self, media_type: str) -> bool:
        """Return True only for an exact match against ``media_types``."""
        return media_type in self.media_types

    def extract(self, payload: SourcePayload) -> ExtractionResult:
        """Normalize a Markdown payload using ``markitect_tool``.

        Raises:
            AdapterUnavailableError: ``markitect_tool`` could not be imported.
        """
        try:
            import markitect_tool as mkt
        except Exception as exc:  # pragma: no cover - depends on optional environment
            raise AdapterUnavailableError(
                "markitect-tool is required for markdown normalization",
                details={"adapter": self.name, "media_type": payload.media_type},
            ) from exc
        source_path = payload.source_ref.path
        text = payload.read_text()
        document = self._parse_document(mkt, text, source_path)
        serialized = document.to_dict() if hasattr(document, "to_dict") else {}
        snapshot = self._snapshot(mkt, source_path)
        # Use ``or`` rather than a ``.get`` default: a key that is present but
        # set to None (e.g. a document without frontmatter) would otherwise
        # make dict()/list() raise TypeError.
        structure = {
            "frontmatter": dict(serialized.get("frontmatter") or {}),
            "headings": list(serialized.get("headings") or []),
            "sections": list(serialized.get("sections") or []),
        }
        normalized = NormalizedDocument(
            title=payload.title,
            text=text,
            structure=structure,
            fields={
                # Separate copy so ``fields`` and ``structure`` do not share a dict.
                "frontmatter": dict(structure["frontmatter"]),
                "heading_count": len(structure["headings"]),
                "section_count": len(structure["sections"]),
            },
            confidence=1.0,
            extractor_metadata={
                "extractor": self.name,
                "source_media_type": payload.media_type,
                "snapshot": snapshot,
            },
        )
        return ExtractionResult(
            normalized=normalized,
            metadata={
                "extractor": self.name,
                "frontmatter": structure["frontmatter"],
                "headings": structure["headings"],
                "snapshot": snapshot,
                "source_digest": payload.content_digest,
                "source_size_bytes": payload.size_bytes,
            },
        )

    def _parse_document(self, mkt: Any, text: str, source_path: str | None) -> Any:
        """Parse from the file on disk when possible, otherwise from ``text``."""
        # NOTE(review): the on-disk branch re-reads the file instead of using
        # the already-fetched payload text, so the parsed structure could
        # differ from the payload if the file changed in between - confirm
        # this is intended.
        if source_path and Path(source_path).exists() and hasattr(mkt, "parse_markdown_file"):
            return mkt.parse_markdown_file(Path(source_path))
        return mkt.parse_markdown(text, source_path=source_path)

    def _snapshot(self, mkt: Any, source_path: str | None) -> dict[str, Any]:
        """Return the tool's snapshot identity for the file, or ``{}`` when unavailable."""
        if not source_path or not Path(source_path).exists() or not hasattr(mkt, "snapshot_identity_for_file"):
            return {}
        return mkt.snapshot_identity_for_file(Path(source_path), parse_options={"profile": "default"}).to_dict()

View File

@@ -13,6 +13,8 @@ from kontextual_engine.core import (
ContextEntity,
CoreRelationship,
IdempotencyRecord,
IngestionJob,
IngestionJobStatus,
KnowledgeAsset,
LifecycleState,
MetadataRecord,
@@ -32,6 +34,7 @@ class InMemoryAssetRegistryRepository:
versions: dict[str, list[AssetVersion]] = field(default_factory=dict)
audit_events: dict[str, AuditEvent] = field(default_factory=dict)
idempotency_records: dict[str, IdempotencyRecord] = field(default_factory=dict)
ingestion_jobs: dict[str, IngestionJob] = field(default_factory=dict)
def save_actor(self, actor: Actor) -> Actor:
self.actors[actor.id] = actor
@@ -190,3 +193,23 @@ class InMemoryAssetRegistryRepository:
def get_idempotency_record(self, key: str) -> IdempotencyRecord | None:
    """Return the idempotency record stored under ``key``, or ``None`` when absent."""
    records = self.idempotency_records
    return records.get(key)
def save_ingestion_job(self, job: IngestionJob) -> IngestionJob:
    """Store ``job`` under its ``job_id`` (replacing any previous entry) and return it."""
    self.ingestion_jobs.update({job.job_id: job})
    return job
def get_ingestion_job(self, job_id: str) -> IngestionJob:
    """Return the stored job for ``job_id``.

    Raises:
        NotFoundError: No job is stored under ``job_id``.
    """
    try:
        found = self.ingestion_jobs[job_id]
    except KeyError as missing:
        raise NotFoundError("Ingestion job not found", details={"job_id": job_id}) from missing
    return found
def list_ingestion_jobs(
    self,
    *,
    status: IngestionJobStatus | None = None,
) -> list[IngestionJob]:
    """Return stored jobs ordered by ``(created_at, job_id)``.

    Args:
        status: When given, only jobs whose ``status`` equals it are returned.
    """
    selected = list(self.ingestion_jobs.values())
    if status is not None:
        selected = [candidate for candidate in selected if candidate.status == status]
    selected.sort(key=lambda candidate: (candidate.created_at, candidate.job_id))
    return selected

View File

@@ -15,6 +15,8 @@ from kontextual_engine.core import (
ContextEntity,
CoreRelationship,
IdempotencyRecord,
IngestionJob,
IngestionJobStatus,
KnowledgeAsset,
LifecycleState,
MetadataRecord,
@@ -381,6 +383,51 @@ class SQLiteAssetRegistryRepository:
return None
return IdempotencyRecord.from_dict(_loads(row["payload"]))
def save_ingestion_job(self, job: IngestionJob) -> IngestionJob:
    """Insert ``job`` or, when its id already exists, update the stored row; return ``job``.

    On conflict every column except ``id`` and ``created_at`` is overwritten.
    The full job is stored serialized in the ``payload`` column.
    """
    upsert = (
        "\n"
        "insert into ingestion_jobs (id, status, actor_id, correlation_id, created_at, updated_at, payload)\n"
        "values (?, ?, ?, ?, ?, ?, ?)\n"
        "on conflict(id) do update set\n"
        "status=excluded.status,\n"
        "actor_id=excluded.actor_id,\n"
        "correlation_id=excluded.correlation_id,\n"
        "updated_at=excluded.updated_at,\n"
        "payload=excluded.payload\n"
    )
    with self._connect() as conn:
        row = (
            job.job_id,
            job.status.value,
            job.actor_id,
            job.correlation_id,
            job.created_at,
            job.updated_at,
            _json(job.to_dict()),
        )
        conn.execute(upsert, row)
    return job
def get_ingestion_job(self, job_id: str) -> IngestionJob:
    """Load the job stored under ``job_id`` from its serialized ``payload`` column.

    Raises:
        NotFoundError: No row exists for ``job_id``.
    """
    row = self._one("select payload from ingestion_jobs where id = ?", (job_id,))
    if row is not None:
        return IngestionJob.from_dict(_loads(row["payload"]))
    raise NotFoundError("Ingestion job not found", details={"job_id": job_id})
def list_ingestion_jobs(
    self,
    *,
    status: IngestionJobStatus | None = None,
) -> list[IngestionJob]:
    """Return jobs ordered by ``created_at`` then ``id``.

    Args:
        status: When given, only rows whose ``status`` column equals
            ``status.value`` are returned.
    """
    if status is not None:
        rows = self._all(
            "select payload from ingestion_jobs where status = ? order by created_at, id",
            (status.value,),
        )
    else:
        rows = self._all("select payload from ingestion_jobs order by created_at, id", ())
    jobs = []
    for row in rows:
        jobs.append(IngestionJob.from_dict(_loads(row["payload"])))
    return jobs
def _initialize(self) -> None:
with self._connect() as conn:
conn.executescript(
@@ -449,6 +496,15 @@ class SQLiteAssetRegistryRepository:
status text not null,
payload text not null
);
create table if not exists ingestion_jobs (
id text primary key,
status text not null,
actor_id text not null,
correlation_id text not null,
created_at text not null,
updated_at text not null,
payload text not null
);
create index if not exists idx_assets_lifecycle on assets(lifecycle);
create index if not exists idx_representations_asset on representations(asset_id);
create index if not exists idx_metadata_asset on metadata_records(asset_id);
@@ -458,6 +514,8 @@ class SQLiteAssetRegistryRepository:
create index if not exists idx_versions_asset on asset_versions(asset_id);
create index if not exists idx_audit_target on audit_events(target);
create index if not exists idx_audit_correlation on audit_events(correlation_id);
create index if not exists idx_ingestion_jobs_status on ingestion_jobs(status);
create index if not exists idx_ingestion_jobs_correlation on ingestion_jobs(correlation_id);
"""
)

View File

@@ -4,6 +4,16 @@ from .actors import Actor, ActorType, OperationContext
from .assets import AssetRepresentation, KnowledgeAsset, RepresentationKind
from .audit import AuditEvent, AuditOutcome
from .idempotency import IdempotencyRecord, IdempotencyStatus
from .ingestion import (
ConnectorCapability,
ExtractionResult,
ExtractorCapability,
IngestionFailure,
IngestionJob,
IngestionJobStatus,
NormalizedDocument,
SourcePayload,
)
from .metadata import Classification, LifecycleState, MetadataRecord, Sensitivity
from .policy import PolicyDecision, PolicyEffect
from .primitives import content_digest, mapping_digest, new_id, stable_json_dumps, utc_now
@@ -28,15 +38,22 @@ __all__ = [
"AuditEvent",
"AuditOutcome",
"Classification",
"ConnectorCapability",
"ContextEntity",
"ContextEntityType",
"CoreRelationship",
"DerivedArtifactLineage",
"ExtractionResult",
"ExtractorCapability",
"IdempotencyRecord",
"IdempotencyStatus",
"IngestionFailure",
"IngestionJob",
"IngestionJobStatus",
"KnowledgeAsset",
"LifecycleState",
"MetadataRecord",
"NormalizedDocument",
"OperationContext",
"PolicyDecision",
"PolicyEffect",
@@ -44,6 +61,7 @@ __all__ = [
"RepresentationKind",
"Sensitivity",
"SourceReference",
"SourcePayload",
"VersionChangeType",
"content_digest",
"mapping_digest",

View File

@@ -0,0 +1,308 @@
"""Ingestion job and normalized content primitives."""
from __future__ import annotations
from dataclasses import dataclass, field, replace
from enum import Enum
from typing import Any
from .primitives import compact_dict, content_digest, mapping_digest, new_id, stable_json_dumps, utc_now
from .provenance import SourceReference
class IngestionJobStatus(str, Enum):
    """Lifecycle states of an ingestion job; members are also plain strings."""

    # Not yet finished.
    QUEUED = "queued"
    RUNNING = "running"

    # Outcomes.
    COMPLETED = "completed"
    FAILED = "failed"
    PARTIALLY_COMPLETED = "partially_completed"
    RETRIED = "retried"
    QUARANTINED = "quarantined"
    CANCELED = "canceled"
@dataclass(frozen=True)
class IngestionFailure:
    """Immutable description of one ingestion error."""

    code: str
    message: str
    retriable: bool = False
    details: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dict, passed through ``compact_dict``."""
        raw = {
            "code": self.code,
            "message": self.message,
            "retriable": self.retriable,
            "details": dict(self.details),
        }
        return compact_dict(raw)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "IngestionFailure":
        """Rebuild a failure from a dict; ``retriable`` and ``details`` are optional."""
        code = data["code"]
        message = data["message"]
        retriable = bool(data.get("retriable", False))
        details = dict(data.get("details", {}))
        return cls(code=code, message=message, retriable=retriable, details=details)
@dataclass(frozen=True)
class ConnectorCapability:
    """Immutable description of what a source connector can do."""

    connector_name: str
    source_types: tuple[str, ...]
    supports_directories: bool = False
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dict (tuples become lists), passed through ``compact_dict``."""
        raw = {
            "connector_name": self.connector_name,
            "source_types": list(self.source_types),
            "supports_directories": self.supports_directories,
            "metadata": dict(self.metadata),
        }
        return compact_dict(raw)
@dataclass(frozen=True)
class ExtractorCapability:
    """Immutable description of what a format extractor can do."""

    extractor_name: str
    media_types: tuple[str, ...]
    extraction_depth: str = "text"
    produces_structure: bool = False
    optional_dependency: str | None = None
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dict (tuples become lists), passed through ``compact_dict``."""
        raw = {
            "extractor_name": self.extractor_name,
            "media_types": list(self.media_types),
            "extraction_depth": self.extraction_depth,
            "produces_structure": self.produces_structure,
            "optional_dependency": self.optional_dependency,
            "metadata": dict(self.metadata),
        }
        return compact_dict(raw)
@dataclass(frozen=True)
class SourcePayload:
    """Raw bytes fetched by a connector together with their source description."""

    connector_name: str
    source_uri: str
    source_ref: SourceReference
    media_type: str
    content: bytes
    title: str
    metadata: dict[str, Any] = field(default_factory=dict)
    permission_context: dict[str, Any] = field(default_factory=dict)

    @property
    def content_digest(self) -> str:
        """Digest of ``content``; recomputed on every access."""
        raw = self.content
        return content_digest(raw)

    @property
    def size_bytes(self) -> int:
        """Number of bytes in ``content``."""
        raw = self.content
        return len(raw)

    def read_text(self, encoding: str = "utf-8") -> str:
        """Decode ``content`` with ``encoding`` using strict error handling."""
        raw = self.content
        return raw.decode(encoding)
@dataclass(frozen=True)
class NormalizedDocument:
    """Extractor output: normalized text plus optional structure and extracted fields."""

    text: str
    media_type: str = "application/vnd.kontextual.normalized+json"
    title: str | None = None
    structure: dict[str, Any] = field(default_factory=dict)
    tables: list[dict[str, Any]] = field(default_factory=list)
    links: list[dict[str, Any]] = field(default_factory=list)
    fields: dict[str, Any] = field(default_factory=dict)
    confidence: float | None = None
    unsupported_elements: list[dict[str, Any]] = field(default_factory=list)
    extractor_metadata: dict[str, Any] = field(default_factory=dict)

    @property
    def normalized_hash(self) -> str:
        """Digest of the serialized document, computed without the hash field itself."""
        unhashed = self.to_dict(include_hash=False)
        return mapping_digest(unhashed)

    def to_dict(self, *, include_hash: bool = True) -> dict[str, Any]:
        """Serialize to a plain dict.

        Args:
            include_hash: When True, add ``normalized_hash`` to the result.
        """
        serialized = compact_dict(
            {
                "title": self.title,
                "text": self.text,
                "media_type": self.media_type,
                "structure": dict(self.structure),
                "tables": list(self.tables),
                "links": list(self.links),
                "fields": dict(self.fields),
                "confidence": self.confidence,
                "unsupported_elements": list(self.unsupported_elements),
                "extractor_metadata": dict(self.extractor_metadata),
            }
        )
        if not include_hash:
            return serialized
        serialized["normalized_hash"] = self.normalized_hash
        return serialized

    def to_json(self) -> str:
        """Return ``to_dict()`` rendered with ``stable_json_dumps``."""
        serialized = self.to_dict()
        return stable_json_dumps(serialized)
@dataclass(frozen=True)
class ExtractionResult:
    """A normalized document plus extractor metadata and any diagnostics."""

    normalized: NormalizedDocument
    metadata: dict[str, Any] = field(default_factory=dict)
    diagnostics: tuple[IngestionFailure, ...] = ()

    def to_dict(self) -> dict[str, Any]:
        """Serialize the document, metadata and diagnostics, passed through ``compact_dict``."""
        serialized_diagnostics = []
        for entry in self.diagnostics:
            serialized_diagnostics.append(entry.to_dict())
        raw = {
            "normalized": self.normalized.to_dict(),
            "metadata": dict(self.metadata),
            "diagnostics": serialized_diagnostics,
        }
        return compact_dict(raw)
@dataclass(frozen=True)
class IngestionJob:
    """Immutable record of one ingestion run.

    State transitions (``running``, ``completed``, ``failed``,
    ``partially_completed``) never mutate the job; each returns a new instance
    built with ``dataclasses.replace``.
    """

    input: dict[str, Any]
    actor_id: str
    correlation_id: str
    status: IngestionJobStatus = IngestionJobStatus.QUEUED
    source_ref: SourceReference | None = None
    output_asset_ids: tuple[str, ...] = ()
    failures: tuple[IngestionFailure, ...] = ()
    partial_results: dict[str, Any] = field(default_factory=dict)
    retry_options: dict[str, Any] = field(default_factory=dict)
    retry_of_job_id: str | None = None
    attempts: int = 1
    metadata: dict[str, Any] = field(default_factory=dict)
    job_id: str = field(default_factory=lambda: new_id("ingest"))
    created_at: str = field(default_factory=lambda: utc_now().isoformat())
    updated_at: str = field(default_factory=lambda: utc_now().isoformat())
    completed_at: str | None = None

    @classmethod
    def create(
        cls,
        *,
        input: dict[str, Any],
        actor_id: str,
        correlation_id: str,
        retry_of_job_id: str | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> "IngestionJob":
        """Build a new QUEUED job, copying ``input`` and ``metadata``."""
        return cls(
            input=dict(input),
            actor_id=actor_id,
            correlation_id=correlation_id,
            retry_of_job_id=retry_of_job_id,
            metadata=dict(metadata or {}),
        )

    def running(self, *, source_ref: SourceReference | None = None) -> "IngestionJob":
        """Return a RUNNING copy; ``source_ref`` replaces the current one only when given."""
        return replace(
            self,
            status=IngestionJobStatus.RUNNING,
            source_ref=source_ref or self.source_ref,
            updated_at=utc_now().isoformat(),
        )

    def completed(
        self,
        *,
        output_asset_ids: tuple[str, ...],
        partial_results: dict[str, Any] | None = None,
    ) -> "IngestionJob":
        """Return a COMPLETED copy with ``completed_at`` set.

        Args:
            output_asset_ids: Ids of the assets the job produced.
            partial_results: New results. ``None`` keeps the current ones; an
                explicit empty dict clears them.
        """
        now = utc_now().isoformat()
        # Compare against None: ``partial_results or ...`` would treat an
        # explicit empty dict as "not provided" and keep stale results.
        results = self.partial_results if partial_results is None else partial_results
        return replace(
            self,
            status=IngestionJobStatus.COMPLETED,
            output_asset_ids=tuple(output_asset_ids),
            partial_results=dict(results),
            updated_at=now,
            completed_at=now,
        )

    def failed(
        self,
        failure: IngestionFailure,
        *,
        status: IngestionJobStatus = IngestionJobStatus.FAILED,
        partial_results: dict[str, Any] | None = None,
    ) -> "IngestionJob":
        """Return a copy with ``failure`` appended and ``completed_at`` set.

        Args:
            failure: The failure to append to ``failures``.
            status: Status to record; defaults to FAILED.
            partial_results: New results. ``None`` keeps the current ones; an
                explicit empty dict clears them.
        """
        now = utc_now().isoformat()
        # Same None-vs-empty distinction as in completed().
        results = self.partial_results if partial_results is None else partial_results
        return replace(
            self,
            status=status,
            failures=self.failures + (failure,),
            partial_results=dict(results),
            updated_at=now,
            completed_at=now,
        )

    def partially_completed(
        self,
        *,
        output_asset_ids: tuple[str, ...],
        failures: tuple[IngestionFailure, ...],
        partial_results: dict[str, Any],
    ) -> "IngestionJob":
        """Return a PARTIALLY_COMPLETED copy; ``failures`` replaces the current tuple."""
        now = utc_now().isoformat()
        return replace(
            self,
            status=IngestionJobStatus.PARTIALLY_COMPLETED,
            output_asset_ids=tuple(output_asset_ids),
            failures=tuple(failures),
            partial_results=dict(partial_results),
            updated_at=now,
            completed_at=now,
        )

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dict, passed through ``compact_dict``."""
        return compact_dict(
            {
                "job_id": self.job_id,
                "status": self.status.value,
                "input": dict(self.input),
                "actor_id": self.actor_id,
                "correlation_id": self.correlation_id,
                "source_ref": self.source_ref.to_dict() if self.source_ref else None,
                "output_asset_ids": list(self.output_asset_ids),
                "failures": [failure.to_dict() for failure in self.failures],
                "partial_results": dict(self.partial_results),
                "retry_options": dict(self.retry_options),
                "retry_of_job_id": self.retry_of_job_id,
                "attempts": self.attempts,
                "metadata": dict(self.metadata),
                "created_at": self.created_at,
                "updated_at": self.updated_at,
                "completed_at": self.completed_at,
            }
        )

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "IngestionJob":
        """Rebuild a job from a dict.

        ``job_id``, ``status``, ``actor_id``, ``correlation_id``,
        ``created_at`` and ``updated_at`` are required; every other key falls
        back to an empty or default value when missing.
        """
        source_ref = data.get("source_ref")
        return cls(
            job_id=data["job_id"],
            status=IngestionJobStatus(data["status"]),
            input=dict(data.get("input", {})),
            actor_id=data["actor_id"],
            correlation_id=data["correlation_id"],
            source_ref=SourceReference.from_dict(source_ref) if source_ref else None,
            output_asset_ids=tuple(data.get("output_asset_ids", [])),
            failures=tuple(IngestionFailure.from_dict(item) for item in data.get("failures", [])),
            partial_results=dict(data.get("partial_results", {})),
            retry_options=dict(data.get("retry_options", {})),
            retry_of_job_id=data.get("retry_of_job_id"),
            attempts=int(data.get("attempts", 1)),
            metadata=dict(data.get("metadata", {})),
            created_at=data["created_at"],
            updated_at=data["updated_at"],
            completed_at=data.get("completed_at"),
        )

View File

@@ -1,11 +1,14 @@
"""Stable ports owned by the engine."""
from .ingestion import DirectorySourceConnector, FormatExtractor, SourceConnector
from .policy import AllowAllPolicyGateway, PolicyGateway
from .repositories import AssetRegistryRepository
__all__ = [
"AllowAllPolicyGateway",
"AssetRegistryRepository",
"DirectorySourceConnector",
"FormatExtractor",
"PolicyGateway",
"SourceConnector",
]

View File

@@ -0,0 +1,34 @@
"""Connector and extractor ports for ingestion."""
from __future__ import annotations
from typing import Protocol
from kontextual_engine.core import (
ConnectorCapability,
ExtractionResult,
ExtractorCapability,
SourcePayload,
)
class SourceConnector(Protocol):
    """Structural interface for a component that fetches source content by URI."""

    name: str

    def capabilities(self) -> ConnectorCapability:
        """Describe the connector."""
        ...

    def fetch(self, source_uri: str) -> SourcePayload:
        """Fetch the source at ``source_uri`` as a payload."""
        ...
class DirectorySourceConnector(SourceConnector, Protocol):
    """A ``SourceConnector`` that can also list the files under a source URI."""

    def iter_files(self, source_uri: str, *, recursive: bool = True) -> list[str]:
        """List file URIs under ``source_uri``."""
        ...
class FormatExtractor(Protocol):
    """Structural interface for a component that normalizes a source payload."""

    name: str

    def capabilities(self) -> ExtractorCapability:
        """Describe the extractor."""
        ...

    def supports(self, media_type: str) -> bool:
        """Report whether this extractor handles ``media_type``."""
        ...

    def extract(self, payload: SourcePayload) -> ExtractionResult:
        """Normalize ``payload`` into an extraction result."""
        ...

View File

@@ -12,6 +12,8 @@ from kontextual_engine.core import (
ContextEntity,
CoreRelationship,
IdempotencyRecord,
IngestionJob,
IngestionJobStatus,
KnowledgeAsset,
LifecycleState,
MetadataRecord,
@@ -71,3 +73,11 @@ class AssetRegistryRepository(Protocol):
def save_idempotency_record(self, record: IdempotencyRecord) -> IdempotencyRecord:
    """Port stub: store ``record`` and return an ``IdempotencyRecord``."""
    ...
def get_idempotency_record(self, key: str) -> IdempotencyRecord | None:
    """Port stub: look up the record for ``key``; ``None`` is a valid result."""
    ...
def save_ingestion_job(self, job: IngestionJob) -> IngestionJob:
    """Port stub: store ``job`` and return an ``IngestionJob``."""
    ...
def get_ingestion_job(self, job_id: str) -> IngestionJob:
    """Port stub: return the ``IngestionJob`` identified by ``job_id``."""
    ...
def list_ingestion_jobs(
    self,
    *,
    status: IngestionJobStatus | None = None,
) -> list[IngestionJob]:
    """Port stub: list ingestion jobs, optionally narrowed by ``status``."""
    ...

View File

@@ -1,5 +1,12 @@
"""Application services for the engine."""
from .asset_service import AssetChangeResult, AssetRegistryService, RelationshipChangeResult
from .ingestion_service import AssetIngestionResult, AssetIngestionService
__all__ = ["AssetChangeResult", "AssetRegistryService", "RelationshipChangeResult"]
__all__ = [
"AssetChangeResult",
"AssetIngestionResult",
"AssetIngestionService",
"AssetRegistryService",
"RelationshipChangeResult",
]

View File

@@ -0,0 +1,304 @@
"""Application service for governed asset ingestion."""
from __future__ import annotations
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable
from kontextual_engine.adapters.builtin_extractors import PlainTextExtractor
from kontextual_engine.adapters.local_files import LocalFileConnector
from kontextual_engine.adapters.markitect_tool import MarkitectMarkdownExtractor
from kontextual_engine.core import (
AssetRepresentation,
Classification,
IngestionFailure,
IngestionJob,
IngestionJobStatus,
KnowledgeAsset,
MetadataRecord,
OperationContext,
RepresentationKind,
Sensitivity,
SourcePayload,
mapping_digest,
)
from kontextual_engine.errors import AdapterUnavailableError, KontextualError
from kontextual_engine.ports import AssetRegistryRepository, DirectorySourceConnector, FormatExtractor, SourceConnector
from .asset_service import AssetChangeResult, AssetRegistryService
@dataclass(frozen=True)
class AssetIngestionResult:
    """Outcome of ingesting one source: the job plus the asset data when one was created."""

    # Final state of the tracking job.
    job: IngestionJob
    # Both stay None when no asset was produced.
    asset: KnowledgeAsset | None = None
    asset_change: AssetChangeResult | None = None
class AssetIngestionService:
    """Ingest sources into the asset registry via connectors and extractors.

    Every ingestion is tracked by an ``IngestionJob`` that is persisted through
    the repository at each state change.
    """

    def __init__(
        self,
        repository: AssetRegistryRepository,
        *,
        asset_service: AssetRegistryService | None = None,
        connectors: Iterable[SourceConnector] | None = None,
        extractors: Iterable[FormatExtractor] | None = None,
    ) -> None:
        """Wire the service.

        Args:
            repository: Store for ingestion jobs (and for the default asset service).
            asset_service: Asset registry service; built from ``repository`` when omitted.
            connectors: Connectors, indexed by ``name``; defaults to a ``LocalFileConnector``.
            extractors: Extractors, tried in order; defaults to plain text then Markdown.
        """
        self.repository = repository
        self.asset_service = asset_service or AssetRegistryService(repository)
        self.connectors = {connector.name: connector for connector in (connectors or [LocalFileConnector()])}
        self.extractors = list(extractors or [PlainTextExtractor(), MarkitectMarkdownExtractor()])

    def connector_capabilities(self) -> list[dict]:
        """Return the serialized capabilities of every registered connector."""
        return [connector.capabilities().to_dict() for connector in self.connectors.values()]

    def extractor_capabilities(self) -> list[dict]:
        """Return the serialized capabilities of every registered extractor."""
        return [extractor.capabilities().to_dict() for extractor in self.extractors]

    def ingest_file(
        self,
        path: str | Path,
        context: OperationContext,
        *,
        asset_id: str | None = None,
        title: str | None = None,
        classification: Classification | None = None,
        idempotency_key: str | None = None,
    ) -> AssetIngestionResult:
        """Ingest one file through the ``local_file`` connector.

        Failures during fetch, extraction or asset creation do not raise: they
        are recorded on the job, and the result then carries only the failed job.

        Raises:
            AdapterUnavailableError: The ``local_file`` connector is not registered.
        """
        connector = self._connector("local_file")
        job = IngestionJob.create(
            input={"connector": connector.name, "source_uri": str(path), "mode": "file"},
            actor_id=context.actor.id,
            correlation_id=context.correlation_id,
        )
        self.repository.save_ingestion_job(job)
        payload: SourcePayload | None = None
        try:
            payload = connector.fetch(str(path))
            return self._ingest_payload(
                job,
                payload,
                context,
                asset_id=asset_id,
                title=title,
                classification=classification,
                idempotency_key=idempotency_key,
            )
        except Exception as exc:
            # Boundary handler: every error becomes a persisted failure record.
            # ``job`` here is still the queued instance, so once the payload
            # was fetched re-attach its source reference; otherwise the failed
            # record would overwrite the stored running job and drop it.
            base = job if payload is None else job.running(source_ref=payload.source_ref)
            failed = base.failed(_failure_from_exception(exc))
            self.repository.save_ingestion_job(failed)
            return AssetIngestionResult(failed)

    def ingest_directory(
        self,
        path: str | Path,
        context: OperationContext,
        *,
        recursive: bool = True,
        classification: Classification | None = None,
    ) -> IngestionJob:
        """Ingest every file under ``path`` and return the aggregate job.

        Each file runs as its own job via ``ingest_file``. The aggregate job
        ends COMPLETED when no file reported failures, PARTIALLY_COMPLETED when
        some assets were produced alongside failures, and FAILED when there
        were failures and no assets.

        Raises:
            AdapterUnavailableError: The connector is missing or cannot list directories.
            Exception: Any error raised while listing files is re-raised after
                the aggregate job has been marked FAILED.
        """
        connector = self._directory_connector("local_file")
        job = IngestionJob.create(
            input={
                "connector": connector.name,
                "source_uri": str(path),
                "mode": "directory",
                "recursive": recursive,
            },
            actor_id=context.actor.id,
            correlation_id=context.correlation_id,
        )
        job = job.running()
        self.repository.save_ingestion_job(job)
        output_asset_ids: list[str] = []
        failures: list[IngestionFailure] = []
        item_results: list[dict] = []
        try:
            files = connector.iter_files(str(path), recursive=recursive)
        except Exception as exc:
            # Do not leave the persisted job stuck in RUNNING when listing
            # fails; record the failure, then let the caller see the error.
            self.repository.save_ingestion_job(job.failed(_failure_from_exception(exc)))
            raise
        for source_uri in files:
            result = self.ingest_file(source_uri, context, classification=classification)
            item = {
                "source_uri": source_uri,
                "job_id": result.job.job_id,
                "status": result.job.status.value,
            }
            if result.asset is not None:
                output_asset_ids.append(result.asset.id)
                item["asset_id"] = result.asset.id
            if result.job.failures:
                failures.extend(result.job.failures)
                item["failures"] = [failure.to_dict() for failure in result.job.failures]
            item_results.append(item)
        partial_results = {
            "files_total": len(files),
            "succeeded": sum(1 for item in item_results if item["status"] == IngestionJobStatus.COMPLETED.value),
            "failed": sum(1 for item in item_results if item["status"] == IngestionJobStatus.FAILED.value),
            "quarantined": sum(1 for item in item_results if item["status"] == IngestionJobStatus.QUARANTINED.value),
            "skipped": 0,
            "items": item_results,
        }
        if failures and output_asset_ids:
            job = job.partially_completed(
                output_asset_ids=tuple(output_asset_ids),
                failures=tuple(failures),
                partial_results=partial_results,
            )
        elif failures:
            job = job.failed(
                IngestionFailure(
                    code="ingestion.directory_failed",
                    message="Directory ingestion failed for all files",
                    retriable=True,
                    details=partial_results,
                ),
                partial_results=partial_results,
            )
        else:
            job = job.completed(output_asset_ids=tuple(output_asset_ids), partial_results=partial_results)
        self.repository.save_ingestion_job(job)
        return job

    def get_job(self, job_id: str) -> IngestionJob:
        """Return the job stored under ``job_id`` (delegates to the repository)."""
        return self.repository.get_ingestion_job(job_id)

    def list_jobs(self, *, status: IngestionJobStatus | None = None) -> list[IngestionJob]:
        """Return stored jobs, optionally filtered by ``status`` (delegates to the repository)."""
        return self.repository.list_ingestion_jobs(status=status)

    def _ingest_payload(
        self,
        job: IngestionJob,
        payload: SourcePayload,
        context: OperationContext,
        *,
        asset_id: str | None,
        title: str | None,
        classification: Classification | None,
        idempotency_key: str | None,
    ) -> AssetIngestionResult:
        """Extract ``payload``, create the asset, and mark the job completed.

        Persists the job as RUNNING first. Exceptions propagate to the caller,
        which is responsible for recording the failure.
        """
        job = job.running(source_ref=payload.source_ref)
        self.repository.save_ingestion_job(job)
        extractor = self._extractor(payload.media_type)
        extraction = extractor.extract(payload)
        # Without an explicit id, derive one from the source reference fields.
        resolved_asset_id = asset_id or _stable_asset_id(payload)
        source_representation = AssetRepresentation.from_content(
            resolved_asset_id,
            RepresentationKind.SOURCE,
            payload.media_type,
            payload.content,
            storage_ref=payload.source_uri,
            producer=payload.connector_name,
            source_ref_id=payload.source_ref.id,
            metadata={
                "connector": payload.connector_name,
                "source_digest": payload.content_digest,
                "source_size_bytes": payload.size_bytes,
                **payload.metadata,
            },
        )
        normalized_representation = AssetRepresentation.from_content(
            resolved_asset_id,
            RepresentationKind.NORMALIZED,
            extraction.normalized.media_type,
            extraction.normalized.to_json(),
            producer=extractor.name,
            source_ref_id=payload.source_ref.id,
            metadata={
                "extractor": extractor.name,
                "normalized_hash": extraction.normalized.normalized_hash,
                **extraction.metadata,
            },
        )
        asset_change = self.asset_service.create_asset(
            title or payload.title,
            classification or Classification(asset_type="document", sensitivity=Sensitivity.INTERNAL),
            context,
            asset_id=resolved_asset_id,
            source_refs=[payload.source_ref],
            representations=[source_representation, normalized_representation],
            metadata_records=_metadata_records(payload, extractor.name, extraction.metadata),
            idempotency_key=idempotency_key,
        )
        completed = job.completed(
            output_asset_ids=(asset_change.asset.id,),
            partial_results={
                "connector": payload.connector_name,
                "extractor": extractor.name,
                "source_digest": payload.content_digest,
                "representations": [
                    source_representation.representation_id,
                    normalized_representation.representation_id,
                ],
                "diagnostics": [diagnostic.to_dict() for diagnostic in extraction.diagnostics],
            },
        )
        self.repository.save_ingestion_job(completed)
        return AssetIngestionResult(completed, asset_change.asset, asset_change)

    def _connector(self, name: str) -> SourceConnector:
        """Return the connector registered as ``name``.

        Raises:
            AdapterUnavailableError: No connector has that name.
        """
        try:
            return self.connectors[name]
        except KeyError as exc:
            raise AdapterUnavailableError("Source connector is not registered", details={"connector": name}) from exc

    def _directory_connector(self, name: str) -> DirectorySourceConnector:
        """Return connector ``name``, requiring that it exposes ``iter_files``.

        Raises:
            AdapterUnavailableError: The connector is missing or has no ``iter_files``.
        """
        connector = self._connector(name)
        if not hasattr(connector, "iter_files"):
            raise AdapterUnavailableError(
                "Source connector does not support directory iteration",
                details={"connector": name},
            )
        return connector  # type: ignore[return-value]

    def _extractor(self, media_type: str) -> FormatExtractor:
        """Return the first registered extractor that supports ``media_type``.

        Raises:
            AdapterUnavailableError: No extractor supports the media type.
        """
        for extractor in self.extractors:
            if extractor.supports(media_type):
                return extractor
        raise AdapterUnavailableError(
            "No extractor registered for media type",
            details={"media_type": media_type},
        )
def _stable_asset_id(payload: SourcePayload) -> str:
    """Derive an asset id from the identifying fields of the payload's source reference.

    The id is ``asset-`` followed by the first 20 characters of the digest,
    after any ``sha256:`` prefix has been removed.
    """
    reference = payload.source_ref
    identity = {
        "source_system": reference.source_system,
        "path": reference.path,
        "uri": reference.uri,
        "external_id": reference.external_id,
        "connector_ref": reference.connector_ref,
    }
    fingerprint = mapping_digest(identity).removeprefix("sha256:")
    return f"asset-{fingerprint[:20]}"
def _metadata_records(
    payload: SourcePayload,
    extractor_name: str,
    extraction_metadata: dict,
) -> list[MetadataRecord]:
    """Build the metadata records attached to an ingested asset.

    Source-related records name the connector as producer; extractor-related
    records name the extractor. Only ``connector`` and ``extractor`` are
    marked ``confirmed``.
    """
    from_connector = {"producer": payload.connector_name}
    from_extractor = {"producer": extractor_name}
    records: list[MetadataRecord] = []
    records.append(MetadataRecord("source_media_type", payload.media_type, provenance=dict(from_connector)))
    records.append(MetadataRecord("source_digest", payload.content_digest, provenance=dict(from_connector)))
    records.append(MetadataRecord("source_size_bytes", payload.size_bytes, provenance=dict(from_connector)))
    records.append(
        MetadataRecord("connector", payload.connector_name, provenance=dict(from_connector), confirmed=True)
    )
    records.append(MetadataRecord("extractor", extractor_name, provenance=dict(from_extractor), confirmed=True))
    records.append(MetadataRecord("extraction", dict(extraction_metadata), provenance=dict(from_extractor)))
    return records
def _failure_from_exception(exc: Exception) -> IngestionFailure:
    """Translate an exception into an ``IngestionFailure`` record.

    Engine errors keep their own code and details and are retriable only when
    they are ``AdapterUnavailableError``. Any other exception is reported as
    ``ingestion.unexpected`` and is never retriable.
    """
    if not isinstance(exc, KontextualError):
        return IngestionFailure(
            code="ingestion.unexpected",
            message=str(exc),
            retriable=False,
            details={"exception_type": type(exc).__name__},
        )
    can_retry = isinstance(exc, AdapterUnavailableError)
    return IngestionFailure(
        code=exc.code,
        message=str(exc),
        retriable=can_retry,
        details=dict(exc.details),
    )