first ingestion/normalization slice

2026-05-06 02:35:40 +02:00
parent 286ebc3cb6
commit 565a5643a3
19 changed files with 1231 additions and 10 deletions
--- a/src/kontextual_engine/adapters/builtin_extractors/__init__.py
+++ b/src/kontextual_engine/adapters/builtin_extractors/__init__.py
@@ -0,0 +1,5 @@
+"""Built-in baseline format extractors."""
+
+from .text import PlainTextExtractor
+
+__all__ = ["PlainTextExtractor"]
--- a/src/kontextual_engine/adapters/builtin_extractors/text.py
+++ b/src/kontextual_engine/adapters/builtin_extractors/text.py
@@ -0,0 +1,42 @@
+"""Plain text normalization extractor."""
+
+from __future__ import annotations
+
+from kontextual_engine.core import ExtractionResult, ExtractorCapability, NormalizedDocument, SourcePayload
+
+
+class PlainTextExtractor:
+    """Baseline extractor that normalizes ``text/plain`` payloads without deriving structure."""
+
+    name = "plain-text"
+    media_types = ("text/plain",)
+
+    def capabilities(self) -> ExtractorCapability:
+        """Describe this extractor: text-depth extraction over ``media_types``, no structure."""
+        return ExtractorCapability(
+            extractor_name=self.name, media_types=self.media_types,
+            extraction_depth="text", produces_structure=False,
+        )
+
+    def supports(self, media_type: str) -> bool:
+        """Accept a listed media type, or any value that starts with ``text/plain``."""
+        if media_type in self.media_types:
+            return True
+        return media_type.startswith("text/plain")
+
+    def extract(self, payload: SourcePayload) -> ExtractionResult:
+        """Read the payload as text and return it with a line count and source provenance."""
+        body = payload.read_text()
+        document = NormalizedDocument(
+            title=payload.title,
+            text=body,
+            fields={"line_count": len(body.splitlines())},
+            confidence=1.0,
+            extractor_metadata={"extractor": self.name, "source_media_type": payload.media_type},
+        )
+        result_metadata = {
+            "extractor": self.name,
+            "source_digest": payload.content_digest,
+            "source_size_bytes": payload.size_bytes,
+        }
+        return ExtractionResult(normalized=document, metadata=result_metadata)
--- a/src/kontextual_engine/adapters/local_files/__init__.py
+++ b/src/kontextual_engine/adapters/local_files/__init__.py
@@ -0,0 +1,5 @@
+"""Local filesystem ingestion connector."""
+
+from .connector import LocalFileConnector
+
+__all__ = ["LocalFileConnector"]
--- a/src/kontextual_engine/adapters/local_files/connector.py
+++ b/src/kontextual_engine/adapters/local_files/connector.py
@@ -0,0 +1,77 @@
+"""Local file and directory source connector."""
+
+from __future__ import annotations
+
+import mimetypes
+from pathlib import Path
+from typing import Any
+
+from kontextual_engine.core import ConnectorCapability, SourcePayload, SourceReference, content_digest
+from kontextual_engine.errors import NotFoundError, ValidationError
+
+
+class LocalFileConnector:
+    """Connector that reads files and lists directory contents from the local filesystem."""
+
+    name = "local_file"
+
+    def capabilities(self) -> ConnectorCapability:
+        """Describe the file and directory sources this connector can read."""
+        return ConnectorCapability(
+            connector_name=self.name, source_types=("file", "directory"),
+            supports_directories=True, metadata={"uri_schemes": ["file", "path"]},
+        )
+
+    def fetch(self, source_uri: str) -> SourcePayload:
+        """Read one file into a SourcePayload; NotFoundError if missing, ValidationError if not a file."""
+        path = Path(source_uri).expanduser()
+        if not path.exists():
+            raise NotFoundError("Local source file not found", details={"path": str(path)})
+        if not path.is_file():
+            raise ValidationError("Local source is not a file", details={"path": str(path)})
+        content = path.read_bytes()
+        file_metadata = _file_metadata(path)  # stat once so both metadata copies below agree
+        return SourcePayload(
+            connector_name=self.name,
+            source_uri=str(path),
+            source_ref=SourceReference(
+                source_system=self.name,
+                path=str(path),
+                checksum=content_digest(content),
+                connector_ref=f"{self.name}:{path.resolve()}",
+                metadata=dict(file_metadata),
+            ),
+            media_type=_guess_media_type(path),
+            content=content,
+            title=path.stem,
+            metadata={"filename": path.name, **file_metadata},
+        )
+
+    def iter_files(self, source_uri: str, *, recursive: bool = True) -> list[str]:
+        """List files under a directory in sorted order; a file path yields a one-element list."""
+        root = Path(source_uri).expanduser()
+        if not root.exists():
+            raise NotFoundError("Local source directory not found", details={"path": str(root)})
+        if root.is_file():
+            return [str(root)]
+        if not root.is_dir():
+            raise ValidationError("Local source is not a directory", details={"path": str(root)})
+        return sorted(str(path) for path in root.glob("**/*" if recursive else "*") if path.is_file())
+
+
+def _guess_media_type(path: Path) -> str:
+    """Map a path to a media type: fixed markdown/text suffixes first, then ``mimetypes``."""
+    extension = path.suffix.lower()
+    if extension in {".md", ".markdown", ".mkd"}:
+        return "text/markdown"
+    if extension in {".txt", ".text", ".log"}:
+        return "text/plain"
+    return mimetypes.guess_type(path.name)[0] or "application/octet-stream"
+
+
+def _file_metadata(path: Path) -> dict[str, Any]:
+    """Return the file's size in bytes and its nanosecond mtime, read from one ``stat()`` call."""
+    info = path.stat()
+    size_bytes = info.st_size
+    mtime_ns = info.st_mtime_ns
+    return {"size_bytes": size_bytes, "mtime_ns": mtime_ns}
--- a/src/kontextual_engine/adapters/markitect_tool/__init__.py
+++ b/src/kontextual_engine/adapters/markitect_tool/__init__.py
@@ -0,0 +1,5 @@
+"""markitect-tool ingestion adapter boundary."""
+
+from .markdown import MarkitectMarkdownExtractor
+
+__all__ = ["MarkitectMarkdownExtractor"]
--- a/src/kontextual_engine/adapters/markitect_tool/markdown.py
+++ b/src/kontextual_engine/adapters/markitect_tool/markdown.py
@@ -0,0 +1,86 @@
+"""Markdown normalization through markitect-tool."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+from kontextual_engine.core import ExtractionResult, ExtractorCapability, NormalizedDocument, SourcePayload
+from kontextual_engine.errors import AdapterUnavailableError
+
+
+class MarkitectMarkdownExtractor:
+    """Thin adapter over markitect-tool; all Markdown syntax handling is delegated to that package."""
+
+    name = "markitect-tool"
+    media_types = ("text/markdown", "text/x-markdown")
+
+    def capabilities(self) -> ExtractorCapability:
+        """Describe this extractor: structure-depth extraction that needs the optional dependency."""
+        return ExtractorCapability(
+            extractor_name=self.name,
+            media_types=self.media_types,
+            extraction_depth="structure",
+            produces_structure=True,
+            optional_dependency="markitect-tool",
+            metadata={"delegates_markdown_syntax": True},
+        )
+
+    def supports(self, media_type: str) -> bool:
+        """Return True only for an exact match against ``media_types``."""
+        return media_type in self.media_types
+
+    def extract(self, payload: SourcePayload) -> ExtractionResult:
+        """Normalize a Markdown payload via markitect-tool; AdapterUnavailableError if import fails."""
+        try:
+            import markitect_tool as mkt
+        except Exception as exc:  # pragma: no cover - depends on optional environment
+            raise AdapterUnavailableError(
+                "markitect-tool is required for markdown normalization",
+                details={"adapter": self.name, "media_type": payload.media_type},
+            ) from exc
+        source_path, text = payload.source_ref.path, payload.read_text()
+        parsed = self._parse_document(mkt, text, source_path)
+        serialized = parsed.to_dict() if hasattr(parsed, "to_dict") else {}
+        snapshot = self._snapshot(mkt, source_path)
+        frontmatter = dict(serialized.get("frontmatter", {}))
+        headings = list(serialized.get("headings", []))
+        sections = list(serialized.get("sections", []))
+        structure = {"frontmatter": frontmatter, "headings": headings, "sections": sections}
+        normalized_doc = NormalizedDocument(
+            title=payload.title,
+            text=text,
+            structure=structure,
+            fields={
+                "frontmatter": dict(serialized.get("frontmatter", {})),
+                "heading_count": len(headings),
+                "section_count": len(sections),
+            },
+            confidence=1.0,
+            extractor_metadata={
+                "extractor": self.name,
+                "source_media_type": payload.media_type,
+                "snapshot": snapshot,
+            },
+        )
+        result_metadata = {
+            "extractor": self.name,
+            "frontmatter": frontmatter,
+            "headings": headings,
+            "snapshot": snapshot,
+            "source_digest": payload.content_digest,
+            "source_size_bytes": payload.size_bytes,
+        }
+        return ExtractionResult(normalized=normalized_doc, metadata=result_metadata)
+
+    def _parse_document(self, mkt: Any, text: str, source_path: str | None) -> Any:
+        """Parse from the on-disk file when it exists and the tool can, otherwise from ``text``."""
+        if not (source_path and Path(source_path).exists() and hasattr(mkt, "parse_markdown_file")):
+            return mkt.parse_markdown(text, source_path=source_path)
+        return mkt.parse_markdown_file(Path(source_path))
+
+    def _snapshot(self, mkt: Any, source_path: str | None) -> dict[str, Any]:
+        """Return the tool's snapshot identity for an existing file, or ``{}`` when unavailable."""
+        if source_path and Path(source_path).exists() and hasattr(mkt, "snapshot_identity_for_file"):
+            return mkt.snapshot_identity_for_file(Path(source_path), parse_options={"profile": "default"}).to_dict()
+        return {}
--- a/src/kontextual_engine/adapters/memory/asset_registry.py
+++ b/src/kontextual_engine/adapters/memory/asset_registry.py
@@ -13,6 +13,8 @@ from kontextual_engine.core import (
    ContextEntity,
    CoreRelationship,
    IdempotencyRecord,
+    IngestionJob,
+    IngestionJobStatus,
    KnowledgeAsset,
    LifecycleState,
    MetadataRecord,
@@ -32,6 +34,7 @@ class InMemoryAssetRegistryRepository:
    versions: dict[str, list[AssetVersion]] = field(default_factory=dict)
    audit_events: dict[str, AuditEvent] = field(default_factory=dict)
    idempotency_records: dict[str, IdempotencyRecord] = field(default_factory=dict)
+    ingestion_jobs: dict[str, IngestionJob] = field(default_factory=dict)

    def save_actor(self, actor: Actor) -> Actor:
        self.actors[actor.id] = actor
@@ -190,3 +193,23 @@ class InMemoryAssetRegistryRepository:

    def get_idempotency_record(self, key: str) -> IdempotencyRecord | None:
        return self.idempotency_records.get(key)
+
+    def save_ingestion_job(self, job: IngestionJob) -> IngestionJob:
+        """Store ``job`` under its ``job_id``, replacing any earlier entry, and return it."""
+        self.ingestion_jobs[job.job_id] = job
+        return job
+
+    def get_ingestion_job(self, job_id: str) -> IngestionJob:
+        """Return the job stored under ``job_id``; raise NotFoundError when there is none."""
+        try:
+            found = self.ingestion_jobs[job_id]
+        except KeyError as exc:
+            raise NotFoundError("Ingestion job not found", details={"job_id": job_id}) from exc
+        return found
+
+    def list_ingestion_jobs(self, *, status: IngestionJobStatus | None = None) -> list[IngestionJob]:
+        """Return all jobs, or only those whose status equals ``status``, in a stable order."""
+        stored = self.ingestion_jobs.values()
+        selected = stored if status is None else [job for job in stored if job.status == status]
+        # created_at orders the result; job_id is the tie-breaker for equal timestamps.
+        return sorted(selected, key=lambda job: (job.created_at, job.job_id))
--- a/src/kontextual_engine/adapters/sqlite/asset_registry.py
+++ b/src/kontextual_engine/adapters/sqlite/asset_registry.py
@@ -15,6 +15,8 @@ from kontextual_engine.core import (
    ContextEntity,
    CoreRelationship,
    IdempotencyRecord,
+    IngestionJob,
+    IngestionJobStatus,
    KnowledgeAsset,
    LifecycleState,
    MetadataRecord,
@@ -381,6 +383,51 @@ class SQLiteAssetRegistryRepository:
            return None
        return IdempotencyRecord.from_dict(_loads(row["payload"]))

+    def save_ingestion_job(self, job: IngestionJob) -> IngestionJob:
+        """Upsert the job row keyed by ``job.job_id`` and return ``job``."""
+        with self._connect() as conn:
+            # On conflict every column except id and created_at is overwritten from the new values.
+            row_values = (
+                job.job_id,
+                job.status.value,
+                job.actor_id,
+                job.correlation_id,
+                job.created_at,
+                job.updated_at,
+                _json(job.to_dict()),
+            )
+            conn.execute(
+                """
+                insert into ingestion_jobs (id, status, actor_id, correlation_id, created_at, updated_at, payload)
+                values (?, ?, ?, ?, ?, ?, ?)
+                on conflict(id) do update set
+                  status=excluded.status,
+                  actor_id=excluded.actor_id,
+                  correlation_id=excluded.correlation_id,
+                  updated_at=excluded.updated_at,
+                  payload=excluded.payload
+                """,
+                row_values,
+            )
+        return job
+
+    def get_ingestion_job(self, job_id: str) -> IngestionJob:
+        """Load one job by id from its stored payload; raise NotFoundError when no row matches."""
+        row = self._one("select payload from ingestion_jobs where id = ?", (job_id,))
+        if row is not None:
+            return IngestionJob.from_dict(_loads(row["payload"]))
+        raise NotFoundError("Ingestion job not found", details={"job_id": job_id})
+
+    def list_ingestion_jobs(self, *, status: IngestionJobStatus | None = None) -> list[IngestionJob]:
+        """Return jobs ordered by ``created_at`` then ``id``, optionally restricted to one status."""
+        # Start from the unfiltered query; narrow it only when a status was requested.
+        query, params = "select payload from ingestion_jobs order by created_at, id", ()
+        if status is not None:
+            query = "select payload from ingestion_jobs where status = ? order by created_at, id"
+            params = (status.value,)
+        rows = self._all(query, params)
+        return [IngestionJob.from_dict(_loads(row["payload"])) for row in rows]
+
    def _initialize(self) -> None:
        with self._connect() as conn:
            conn.executescript(
@@ -449,6 +496,15 @@ class SQLiteAssetRegistryRepository:
                  status text not null,
                  payload text not null
                );
+                create table if not exists ingestion_jobs (
+                  id text primary key,
+                  status text not null,
+                  actor_id text not null,
+                  correlation_id text not null,
+                  created_at text not null,
+                  updated_at text not null,
+                  payload text not null
+                );
                create index if not exists idx_assets_lifecycle on assets(lifecycle);
                create index if not exists idx_representations_asset on representations(asset_id);
                create index if not exists idx_metadata_asset on metadata_records(asset_id);
@@ -458,6 +514,8 @@ class SQLiteAssetRegistryRepository:
                create index if not exists idx_versions_asset on asset_versions(asset_id);
                create index if not exists idx_audit_target on audit_events(target);
                create index if not exists idx_audit_correlation on audit_events(correlation_id);
+                create index if not exists idx_ingestion_jobs_status on ingestion_jobs(status);
+                create index if not exists idx_ingestion_jobs_correlation on ingestion_jobs(correlation_id);
                """
            )