kontextual-engine/src/kontextual_engine/ingestion.py

"""Format-agnostic ingestion contracts and first adapters."""

from __future__ import annotations

from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Protocol

from .artifacts import Artifact, ArtifactMetadata, ArtifactType
from .errors import AdapterUnavailableError


@dataclass
class IngestionRequest:
    """Describe one piece of input to ingest, given inline or as a file path.

    ``content`` takes precedence over ``path`` when both are supplied; see
    :meth:`read_text` for the exact resolution order.
    """

    # Identifier of the collection the resulting artifact is created in.
    collection_id: str
    # Name given to the resulting artifact.
    name: str
    # Inline payload; bytes are decoded as UTF-8 when read as text.
    content: str | bytes | None = None
    # Location to read from when no inline content is given.
    path: str | Path | None = None
    # Media type used to select an ingestion adapter.
    media_type: str = "text/plain"
    artifact_type: ArtifactType = ArtifactType.CONTENT
    # Caller-supplied extras; adapters copy these into the artifact metadata.
    metadata: dict[str, Any] = field(default_factory=dict)

    def read_text(self) -> str:
        """Return the request payload as text.

        Inline ``content`` wins (bytes are decoded as UTF-8). Otherwise the
        file at ``path`` is read as UTF-8, and an empty string is returned
        when neither is set.

        Raises:
            UnicodeDecodeError: if the bytes or file are not valid UTF-8.
            OSError: if ``path`` cannot be read.
        """
        payload = self.content
        if payload is None:
            # No inline content: fall back to the file, or to "" without one.
            if self.path is None:
                return ""
            return Path(self.path).read_text(encoding="utf-8")
        return payload.decode("utf-8") if isinstance(payload, bytes) else payload


@dataclass
class IngestionResult:
    """Outcome of a single adapter run."""

    # Artifacts created from the request.
    artifacts: list[Artifact]
    # Adapter-specific summary of the ingested input; the keys differ per adapter.
    normalized: dict[str, Any] = field(default_factory=dict)
    # Short identifier of the adapter that produced this result (e.g. "plain-text").
    adapter: str = ""


class IngestionAdapter(Protocol):
    """Structural interface that every ingestion adapter satisfies."""

    # Media types the adapter accepts; IngestionService checks the request's
    # media_type for membership in this tuple when choosing an adapter.
    media_types: tuple[str, ...]

    # Turn one request into an IngestionResult.
    def ingest(self, request: IngestionRequest) -> IngestionResult: ...


class PlainTextIngestionAdapter:
    """Ingest ``text/plain`` input verbatim as a single artifact."""

    media_types = ("text/plain",)

    def ingest(self, request: IngestionRequest) -> IngestionResult:
        """Create one artifact holding the request's text unchanged.

        The request's metadata dict is shallow-copied into the artifact
        metadata, and the source URI is the stringified path when a path
        was supplied (``None`` otherwise).
        """
        body = request.read_text()
        origin = str(request.path) if request.path else None
        created = Artifact.create(
            request.collection_id,
            request.name,
            body,
            artifact_type=request.artifact_type,
            metadata=ArtifactMetadata(
                media_type=request.media_type,
                source_uri=origin,
                custom=dict(request.metadata),
            ),
        )
        summary = {"text": body, "media_type": request.media_type}
        return IngestionResult(artifacts=[created], normalized=summary, adapter="plain-text")


class MarkdownIngestionAdapter:
    """Adapter boundary to markitect-tool; no markdown parsing lives here.

    The optional ``markitect_tool`` package is imported lazily inside
    :meth:`ingest`, so this module stays importable when it is not installed.
    """

    media_types = ("text/markdown", "text/x-markdown")

    @staticmethod
    def _document_summary(document: Any) -> dict[str, Any]:
        """Return a fresh frontmatter/headings summary of a parsed document.

        Every call builds new dicts. Each heading's ``__dict__`` is copied
        rather than stored directly, so callers can mutate the summary
        without affecting the parsed document or any other summary.
        """
        return {
            "frontmatter": dict(document.frontmatter),
            "headings": [dict(heading.__dict__) for heading in document.headings],
        }

    def ingest(self, request: IngestionRequest) -> IngestionResult:
        """Parse the request's markdown and wrap the raw text in one artifact.

        The artifact stores the original text. The parsed frontmatter and
        headings are added to the artifact's custom metadata (overriding
        same-named keys from ``request.metadata``) and are also returned in
        ``normalized`` together with the section count.

        Raises:
            AdapterUnavailableError: if ``markitect_tool`` cannot be imported.
        """
        try:
            from markitect_tool import parse_markdown
        except Exception as exc:  # pragma: no cover - exercised when optional dep absent
            raise AdapterUnavailableError(
                "markitect-tool is required for markdown ingestion",
                details={"adapter": "markitect-tool", "media_type": request.media_type},
            ) from exc

        text = request.read_text()
        source_uri = str(request.path) if request.path else None
        document = parse_markdown(text, source_path=source_uri)
        metadata = ArtifactMetadata(
            media_type=request.media_type,
            source_uri=source_uri,
            custom={**request.metadata, **self._document_summary(document)},
        )
        artifact = Artifact.create(
            request.collection_id,
            request.name,
            text,
            artifact_type=request.artifact_type,
            metadata=metadata,
        )
        return IngestionResult(
            artifacts=[artifact],
            # A second, independent summary: the normalized view must not
            # share mutable dicts with the artifact metadata built above.
            normalized={
                **self._document_summary(document),
                "sections": len(document.sections),
            },
            adapter="markitect-tool",
        )


class IngestionService:
    """Dispatch ingestion requests to the first adapter that accepts their media type."""

    def __init__(self, adapters: list[IngestionAdapter] | None = None) -> None:
        """Store the adapters to consult, in priority order.

        Args:
            adapters: Adapters to use. ``None`` selects the built-in plain-text
                and markdown adapters. An explicitly passed list, including an
                empty one, is used as given.
        """
        # Test against None instead of truthiness: ``adapters or [...]`` would
        # silently replace a deliberately empty list with the defaults.
        if adapters is None:
            adapters = [PlainTextIngestionAdapter(), MarkdownIngestionAdapter()]
        self.adapters = adapters

    def ingest(self, request: IngestionRequest) -> IngestionResult:
        """Ingest ``request`` with the first adapter listing its media type.

        Raises:
            AdapterUnavailableError: if no registered adapter lists
                ``request.media_type``.
        """
        for adapter in self.adapters:
            if request.media_type in adapter.media_types:
                return adapter.ingest(request)
        raise AdapterUnavailableError(
            "No ingestion adapter registered for media type",
            details={"media_type": request.media_type},
        )