first ingestion/normalization slice

This commit is contained in:
2026-05-06 02:35:40 +02:00
parent 286ebc3cb6
commit 565a5643a3
19 changed files with 1231 additions and 10 deletions

View File

@@ -0,0 +1,5 @@
"""Built-in baseline format extractors."""
from .text import PlainTextExtractor
__all__ = ["PlainTextExtractor"]

View File

@@ -0,0 +1,42 @@
"""Plain text normalization extractor."""
from __future__ import annotations
from kontextual_engine.core import ExtractionResult, ExtractorCapability, NormalizedDocument, SourcePayload
class PlainTextExtractor:
    """Baseline extractor that carries plain-text payloads through unchanged."""

    name = "plain-text"
    media_types = ("text/plain",)

    def capabilities(self) -> ExtractorCapability:
        """Describe this extractor: text-depth extraction that produces no structure."""
        return ExtractorCapability(
            extractor_name=self.name,
            media_types=self.media_types,
            extraction_depth="text",
            produces_structure=False,
        )

    def supports(self, media_type: str) -> bool:
        """Return True for a declared media type or any value prefixed ``text/plain``."""
        if media_type in self.media_types:
            return True
        # The prefix test admits parameterised values like "text/plain; charset=utf-8".
        return media_type.startswith("text/plain")

    def extract(self, payload: SourcePayload) -> ExtractionResult:
        """Wrap the payload's text in a ``NormalizedDocument`` with a line count.

        The text comes from ``payload.read_text()`` and is stored as-is; the only
        derived field is ``line_count`` (``str.splitlines`` semantics).
        """
        body = payload.read_text()
        document = NormalizedDocument(
            title=payload.title,
            text=body,
            fields={"line_count": len(body.splitlines())},
            confidence=1.0,
            extractor_metadata={
                "extractor": self.name,
                "source_media_type": payload.media_type,
            },
        )
        result_metadata = {
            "extractor": self.name,
            "source_digest": payload.content_digest,
            "source_size_bytes": payload.size_bytes,
        }
        return ExtractionResult(normalized=document, metadata=result_metadata)

View File

@@ -0,0 +1,5 @@
"""Local filesystem ingestion connector."""
from .connector import LocalFileConnector
__all__ = ["LocalFileConnector"]

View File

@@ -0,0 +1,77 @@
"""Local file and directory source connector."""
from __future__ import annotations
import mimetypes
from pathlib import Path
from typing import Any
from kontextual_engine.core import ConnectorCapability, SourcePayload, SourceReference, content_digest
from kontextual_engine.errors import NotFoundError, ValidationError
class LocalFileConnector:
    """Source connector that reads files and enumerates directories on local disk."""

    name = "local_file"

    def capabilities(self) -> ConnectorCapability:
        """Describe the connector: file and directory sources, directory listing supported."""
        # NOTE(review): "file" is advertised as a URI scheme, but fetch()/iter_files()
        # pass source_uri straight to Path() without stripping a "file://" prefix -
        # confirm whether callers are expected to strip the scheme first.
        return ConnectorCapability(
            connector_name=self.name,
            source_types=("file", "directory"),
            supports_directories=True,
            metadata={"uri_schemes": ["file", "path"]},
        )

    def fetch(self, source_uri: str) -> SourcePayload:
        """Read one local file and return it as a ``SourcePayload``.

        ``source_uri`` is treated as a filesystem path; a leading ``~`` is expanded.

        Raises:
            NotFoundError: the path does not exist.
            ValidationError: the path exists but is not a regular file.
        """
        path = Path(source_uri).expanduser()
        if not path.exists():
            raise NotFoundError("Local source file not found", details={"path": str(path)})
        if not path.is_file():
            raise ValidationError("Local source is not a file", details={"path": str(path)})

        content = path.read_bytes()
        media_type = _guess_media_type(path)
        # Stat once so the reference and the payload carry the same size/mtime
        # snapshot; each consumer still receives its own dict.
        file_meta = _file_metadata(path)

        source_ref = SourceReference(
            source_system=self.name,
            path=str(path),
            checksum=content_digest(content),
            # The connector ref uses the resolved path; "path" keeps the caller's spelling.
            connector_ref=f"{self.name}:{path.resolve()}",
            metadata=dict(file_meta),
        )
        return SourcePayload(
            connector_name=self.name,
            source_uri=str(path),
            source_ref=source_ref,
            media_type=media_type,
            content=content,
            title=path.stem,
            metadata={"filename": path.name, **file_meta},
        )

    def iter_files(self, source_uri: str, *, recursive: bool = True) -> list[str]:
        """Return the sorted file paths under ``source_uri``.

        A path that is itself a file yields a single-element list. For a
        directory, ``recursive`` selects between descending into subdirectories
        (``**/*``) and listing direct children only (``*``).

        Raises:
            NotFoundError: the path does not exist.
            ValidationError: the path is neither a file nor a directory.
        """
        root = Path(source_uri).expanduser()
        if not root.exists():
            raise NotFoundError("Local source directory not found", details={"path": str(root)})
        if root.is_file():
            return [str(root)]
        if not root.is_dir():
            raise ValidationError("Local source is not a directory", details={"path": str(root)})
        pattern = "**/*" if recursive else "*"
        return sorted(str(path) for path in root.glob(pattern) if path.is_file())
def _guess_media_type(path: Path) -> str:
suffix = path.suffix.lower()
if suffix in {".md", ".markdown", ".mkd"}:
return "text/markdown"
if suffix in {".txt", ".text", ".log"}:
return "text/plain"
guessed, _ = mimetypes.guess_type(path.name)
return guessed or "application/octet-stream"
def _file_metadata(path: Path) -> dict[str, Any]:
stat = path.stat()
return {
"size_bytes": stat.st_size,
"mtime_ns": stat.st_mtime_ns,
}

View File

@@ -0,0 +1,5 @@
"""markitect-tool ingestion adapter boundary."""
from .markdown import MarkitectMarkdownExtractor
__all__ = ["MarkitectMarkdownExtractor"]

View File

@@ -0,0 +1,86 @@
"""Markdown normalization through markitect-tool."""
from __future__ import annotations
from pathlib import Path
from typing import Any
from kontextual_engine.core import ExtractionResult, ExtractorCapability, NormalizedDocument, SourcePayload
from kontextual_engine.errors import AdapterUnavailableError
class MarkitectMarkdownExtractor:
    """Adapter boundary to markitect-tool; Markdown syntax logic stays external."""

    name = "markitect-tool"
    media_types = ("text/markdown", "text/x-markdown")

    def capabilities(self) -> ExtractorCapability:
        """Describe this extractor: structure-depth output, optional markitect-tool dependency."""
        return ExtractorCapability(
            extractor_name=self.name,
            media_types=self.media_types,
            extraction_depth="structure",
            produces_structure=True,
            optional_dependency="markitect-tool",
            metadata={"delegates_markdown_syntax": True},
        )

    def supports(self, media_type: str) -> bool:
        """Return True when ``media_type`` names one of the declared Markdown types.

        Besides an exact match, the value is also compared with any parameters
        removed and lower-cased, so ``"text/markdown; charset=UTF-8"`` is accepted.
        """
        if media_type in self.media_types:
            return True
        if not isinstance(media_type, str):
            return False
        essence = media_type.partition(";")[0].strip().lower()
        return essence in self.media_types

    def extract(self, payload: SourcePayload) -> ExtractionResult:
        """Normalize a Markdown payload using markitect-tool.

        Raises:
            AdapterUnavailableError: ``markitect_tool`` cannot be imported.
        """
        try:
            import markitect_tool as mkt
        except Exception as exc:  # pragma: no cover - depends on optional environment
            raise AdapterUnavailableError(
                "markitect-tool is required for markdown normalization",
                details={"adapter": self.name, "media_type": payload.media_type},
            ) from exc

        source_path = payload.source_ref.path
        text = payload.read_text()
        document = self._parse_document(mkt, text, source_path)
        # Parsed documents without a to_dict() contribute an empty structure.
        serialized = document.to_dict() if hasattr(document, "to_dict") else {}
        snapshot = self._snapshot(mkt, source_path)
        structure = self._structure_from(serialized)

        normalized = NormalizedDocument(
            title=payload.title,
            text=text,
            structure=structure,
            fields={
                # Separate copy so the fields mapping does not alias the structure.
                "frontmatter": dict(structure["frontmatter"]),
                "heading_count": len(structure["headings"]),
                "section_count": len(structure["sections"]),
            },
            confidence=1.0,
            extractor_metadata={
                "extractor": self.name,
                "source_media_type": payload.media_type,
                "snapshot": snapshot,
            },
        )
        return ExtractionResult(
            normalized=normalized,
            metadata={
                "extractor": self.name,
                "frontmatter": structure["frontmatter"],
                "headings": structure["headings"],
                "snapshot": snapshot,
                "source_digest": payload.content_digest,
                "source_size_bytes": payload.size_bytes,
            },
        )

    @staticmethod
    def _structure_from(serialized: dict[str, Any]) -> dict[str, Any]:
        """Copy frontmatter/headings/sections out of a serialized document.

        Missing keys and keys holding ``None`` both become empty containers,
        so an absent frontmatter block cannot raise ``TypeError``.
        """
        return {
            "frontmatter": dict(serialized.get("frontmatter") or {}),
            "headings": list(serialized.get("headings") or ()),
            "sections": list(serialized.get("sections") or ()),
        }

    def _parse_document(self, mkt: Any, text: str, source_path: str | None) -> Any:
        """Parse from the file on disk when possible, otherwise from ``text``.

        The file route is taken only when ``source_path`` exists locally and
        ``mkt`` exposes ``parse_markdown_file``.
        """
        if source_path and Path(source_path).exists() and hasattr(mkt, "parse_markdown_file"):
            return mkt.parse_markdown_file(Path(source_path))
        return mkt.parse_markdown(text, source_path=source_path)

    def _snapshot(self, mkt: Any, source_path: str | None) -> dict[str, Any]:
        """Return markitect-tool's snapshot identity for the file as a dict.

        Yields ``{}`` when there is no existing local file or ``mkt`` lacks
        ``snapshot_identity_for_file``.
        """
        if not source_path or not Path(source_path).exists() or not hasattr(mkt, "snapshot_identity_for_file"):
            return {}
        return mkt.snapshot_identity_for_file(Path(source_path), parse_options={"profile": "default"}).to_dict()

View File

@@ -13,6 +13,8 @@ from kontextual_engine.core import (
ContextEntity,
CoreRelationship,
IdempotencyRecord,
IngestionJob,
IngestionJobStatus,
KnowledgeAsset,
LifecycleState,
MetadataRecord,
@@ -32,6 +34,7 @@ class InMemoryAssetRegistryRepository:
versions: dict[str, list[AssetVersion]] = field(default_factory=dict)
audit_events: dict[str, AuditEvent] = field(default_factory=dict)
idempotency_records: dict[str, IdempotencyRecord] = field(default_factory=dict)
ingestion_jobs: dict[str, IngestionJob] = field(default_factory=dict)
def save_actor(self, actor: Actor) -> Actor:
self.actors[actor.id] = actor
@@ -190,3 +193,23 @@ class InMemoryAssetRegistryRepository:
def get_idempotency_record(self, key: str) -> IdempotencyRecord | None:
    """Return the idempotency record stored under ``key``, or None when absent."""
    records = self.idempotency_records
    return records.get(key)
def save_ingestion_job(self, job: IngestionJob) -> IngestionJob:
    """Store ``job`` under its ``job_id``, replacing any earlier entry, and return it."""
    self.ingestion_jobs.update({job.job_id: job})
    return job
def get_ingestion_job(self, job_id: str) -> IngestionJob:
    """Return the stored job for ``job_id``.

    Raises:
        NotFoundError: no job is stored under ``job_id``; the original
            ``KeyError`` is attached as the cause.
    """
    try:
        found = self.ingestion_jobs[job_id]
    except KeyError as missing:
        raise NotFoundError("Ingestion job not found", details={"job_id": job_id}) from missing
    return found
def list_ingestion_jobs(
    self,
    *,
    status: IngestionJobStatus | None = None,
) -> list[IngestionJob]:
    """Return stored jobs ordered by ``(created_at, job_id)``.

    When ``status`` is given, only jobs whose ``status`` equals it are kept.
    """
    selected = [
        job
        for job in self.ingestion_jobs.values()
        if status is None or job.status == status
    ]
    selected.sort(key=lambda job: (job.created_at, job.job_id))
    return selected

View File

@@ -15,6 +15,8 @@ from kontextual_engine.core import (
ContextEntity,
CoreRelationship,
IdempotencyRecord,
IngestionJob,
IngestionJobStatus,
KnowledgeAsset,
LifecycleState,
MetadataRecord,
@@ -381,6 +383,51 @@ class SQLiteAssetRegistryRepository:
return None
return IdempotencyRecord.from_dict(_loads(row["payload"]))
def save_ingestion_job(self, job: IngestionJob) -> IngestionJob:
    """Insert or update the ``ingestion_jobs`` row for ``job`` and return ``job``.

    On an ``id`` conflict the status, actor, correlation id, ``updated_at`` and
    payload columns are overwritten; ``created_at`` is not part of the update.
    """
    # Adjacent literals keep the statement text identical while allowing indentation.
    upsert_sql = (
        "\n"
        "insert into ingestion_jobs (id, status, actor_id, correlation_id, created_at, updated_at, payload)\n"
        "values (?, ?, ?, ?, ?, ?, ?)\n"
        "on conflict(id) do update set\n"
        "status=excluded.status,\n"
        "actor_id=excluded.actor_id,\n"
        "correlation_id=excluded.correlation_id,\n"
        "updated_at=excluded.updated_at,\n"
        "payload=excluded.payload\n"
    )
    with self._connect() as conn:
        row = (
            job.job_id,
            job.status.value,
            job.actor_id,
            job.correlation_id,
            job.created_at,
            job.updated_at,
            _json(job.to_dict()),
        )
        conn.execute(upsert_sql, row)
    return job
def get_ingestion_job(self, job_id: str) -> IngestionJob:
    """Load the job whose row id is ``job_id`` from its stored payload.

    Raises:
        NotFoundError: no row exists for ``job_id``.
    """
    record = self._one("select payload from ingestion_jobs where id = ?", (job_id,))
    if record is not None:
        return IngestionJob.from_dict(_loads(record["payload"]))
    raise NotFoundError("Ingestion job not found", details={"job_id": job_id})
def list_ingestion_jobs(
    self,
    *,
    status: IngestionJobStatus | None = None,
) -> list[IngestionJob]:
    """Return jobs ordered by ``created_at`` then ``id``.

    When ``status`` is given, the query is restricted to rows whose status
    column equals ``status.value``.
    """
    if status is None:
        query = "select payload from ingestion_jobs order by created_at, id"
        params = ()
    else:
        query = "select payload from ingestion_jobs where status = ? order by created_at, id"
        params = (status.value,)
    rows = self._all(query, params)
    return [IngestionJob.from_dict(_loads(row["payload"])) for row in rows]
def _initialize(self) -> None:
with self._connect() as conn:
conn.executescript(
@@ -449,6 +496,15 @@ class SQLiteAssetRegistryRepository:
status text not null,
payload text not null
);
create table if not exists ingestion_jobs (
id text primary key,
status text not null,
actor_id text not null,
correlation_id text not null,
created_at text not null,
updated_at text not null,
payload text not null
);
create index if not exists idx_assets_lifecycle on assets(lifecycle);
create index if not exists idx_representations_asset on representations(asset_id);
create index if not exists idx_metadata_asset on metadata_records(asset_id);
@@ -458,6 +514,8 @@ class SQLiteAssetRegistryRepository:
create index if not exists idx_versions_asset on asset_versions(asset_id);
create index if not exists idx_audit_target on audit_events(target);
create index if not exists idx_audit_correlation on audit_events(correlation_id);
create index if not exists idx_ingestion_jobs_status on ingestion_jobs(status);
create index if not exists idx_ingestion_jobs_correlation on ingestion_jobs(correlation_id);
"""
)