markitect-tool/src/markitect_tool/processor/engine.py

"""Processor API for deterministic fenced-block workflows."""

from __future__ import annotations

import hashlib
import re
import shlex
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any, Callable

from markdown_it import MarkdownIt

from markitect_tool.diagnostics import Diagnostic, SourceLocation
from markitect_tool.ops import OperationProvenance
from markitect_tool.reference import (
    ReferenceContext,
    ReferenceResolutionError,
    resolve_reference,
)


# Call signature shared by every registered processor: it receives one
# ProcessorRequest and returns one ProcessorResult. The names are quoted
# because both classes are defined further down in this module.
ProcessorCallable = Callable[["ProcessorRequest"], "ProcessorResult"]


@dataclass(frozen=True)
class FencedProcessorBlock:
    """Immutable record of one fenced block that requested a processor.

    Attributes:
        processor: Name of the processor the block asked for.
        content: Raw text between the fence markers.
        unit_id: Identifier assigned to the block within its document.
        attrs: Remaining fence attributes after the reserved keys were removed.
        language: Language token of the fence info string, if any.
        source_path: Path of the document the block came from, if known.
        line_start: First line of the block, if known.
        line_end: Last line of the block, if known.
        content_hash: Digest of ``content``; empty when not computed.
    """

    processor: str
    content: str
    unit_id: str
    attrs: dict[str, str]
    language: str | None = None
    source_path: str | None = None
    line_start: int | None = None
    line_end: int | None = None
    content_hash: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Return the block as a plain dict without unset fields.

        Fields whose value is ``None``, an empty dict, or an empty string
        are left out of the result.
        """
        compact: dict[str, Any] = {}
        for name, value in asdict(self).items():
            if value in (None, {}, ""):
                continue
            compact[name] = value
        return compact


@dataclass(frozen=True)
class ProcessorContext:
    """Shared, read-only settings handed to every processor invocation.

    Attributes:
        root: Base directory; also forwarded to reference resolution.
        current_path: Path of the document being processed, if known.
        namespaces: Namespace mapping forwarded to reference resolution.
        variables: Free-form variables exposed to processors.
        policy: Free-form policy settings exposed to processors.
    """

    root: Path = Path(".")
    current_path: Path | None = None
    namespaces: dict[str, str] = field(default_factory=dict)
    variables: dict[str, Any] = field(default_factory=dict)
    policy: dict[str, Any] = field(default_factory=dict)

    def reference_context(self) -> ReferenceContext:
        """Build a ``ReferenceContext`` from root, current path and namespaces."""
        settings = {
            "root": self.root,
            "current_path": self.current_path,
            "namespaces": self.namespaces,
        }
        return ReferenceContext(**settings)

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict view with paths as strings and empty values dropped.

        Entries whose value is ``None``, an empty dict, or an empty string
        are omitted.
        """
        current = str(self.current_path) if self.current_path else None
        candidates = (
            ("root", str(self.root)),
            ("current_path", current),
            ("namespaces", self.namespaces),
            ("variables", self.variables),
            ("policy", self.policy),
        )
        compact: dict[str, Any] = {}
        for name, value in candidates:
            if value in (None, {}, ""):
                continue
            compact[name] = value
        return compact


@dataclass(frozen=True)
class ProcessorRequest:
    """One processor invocation.

    Immutable pairing of the fenced block to process with the execution
    context it should be processed in; this is the single argument every
    registered processor callable receives.
    """

    # The fenced block that requested processing.
    block: FencedProcessorBlock
    # Execution settings shared by the run (root, namespaces, variables, ...).
    context: ProcessorContext


@dataclass(frozen=True)
class ProcessorOutputFile:
    """File that a processor asks to have generated.

    Attributes:
        path: Destination path of the generated file.
        content: Text to place in the file.
    """

    path: str
    content: str

    def to_dict(self) -> dict[str, Any]:
        """Return the path and content as a plain dict."""
        serialized: dict[str, Any] = asdict(self)
        return serialized


@dataclass(frozen=True)
class ProcessorResult:
    """Envelope carrying everything a processor produced for one block.

    Attributes:
        content: Replacement text produced by the processor, if any.
        files: Generated files requested by the processor.
        diagnostics: Diagnostics raised while processing.
        dependencies: Paths the result depends on.
        provenance: Provenance events describing how the result was made.
    """

    content: str | None = None
    files: list[ProcessorOutputFile] = field(default_factory=list)
    diagnostics: list[Diagnostic] = field(default_factory=list)
    dependencies: list[str] = field(default_factory=list)
    provenance: list[OperationProvenance] = field(default_factory=list)

    @property
    def valid(self) -> bool:
        """``True`` unless at least one diagnostic has severity ``"error"``."""
        for diagnostic in self.diagnostics:
            if diagnostic.severity == "error":
                return False
        return True

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict view, omitting ``None`` and empty collections.

        ``valid`` is always present; an empty-string ``content`` is kept
        because only ``None``, ``[]`` and ``{}`` are filtered out.
        """
        candidates = (
            ("valid", self.valid),
            ("content", self.content),
            ("files", [item.to_dict() for item in self.files]),
            ("diagnostics", [item.to_dict() for item in self.diagnostics]),
            ("dependencies", self.dependencies),
            ("provenance", [item.to_dict() for item in self.provenance]),
        )
        compact: dict[str, Any] = {}
        for name, value in candidates:
            if value in (None, [], {}):
                continue
            compact[name] = value
        return compact


@dataclass(frozen=True)
class ProcessorRun:
    """Outcome of running every processor block found in one document.

    Attributes:
        source_path: Path of the processed document, if known.
        blocks: Blocks that were discovered.
        results: Results produced for those blocks.
    """

    source_path: str | None
    blocks: list[FencedProcessorBlock]
    results: list[ProcessorResult]

    @property
    def valid(self) -> bool:
        """``True`` when every result is valid (also when there are none)."""
        for result in self.results:
            if not result.valid:
                return False
        return True

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict summary of the run.

        ``count`` is the number of results; blocks and results are
        serialised through their own ``to_dict`` methods.
        """
        summary: dict[str, Any] = {}
        summary["valid"] = self.valid
        summary["source_path"] = self.source_path
        summary["count"] = len(self.results)
        summary["blocks"] = [item.to_dict() for item in self.blocks]
        summary["results"] = [item.to_dict() for item in self.results]
        return summary


class ProcessorRegistry:
    """Name-to-callable table of deterministic fenced-block processors.

    Names are normalised with ``_slug`` on both registration and lookup.
    """

    def __init__(self) -> None:
        self._processors: dict[str, ProcessorCallable] = {}

    def register(self, name: str, processor: ProcessorCallable) -> None:
        """Register ``processor`` under the slugged form of ``name``.

        A later registration under the same slug replaces the earlier one.

        Raises:
            ValueError: If ``name`` slugs to an empty string.
        """
        normalized = _slug(name)
        if not normalized:
            raise ValueError("Processor name cannot be empty")
        self._processors[normalized] = processor

    def names(self) -> list[str]:
        """Return the registered (slugged) names in sorted order."""
        return sorted(self._processors)

    def run(self, request: ProcessorRequest) -> ProcessorResult:
        """Dispatch ``request`` to the processor named by its block.

        An unregistered name does not raise; it yields a result holding a
        single ``processor.unknown`` error diagnostic located at the block.
        """
        block = request.block
        handler = self._processors.get(_slug(block.processor))
        if handler is not None:
            return handler(request)

        location = SourceLocation(
            path=request.block.source_path,
            line=request.block.line_start,
        )
        unknown = Diagnostic(
            severity="error",
            code="processor.unknown",
            message=f"Unknown processor `{request.block.processor}`",
            source=location,
        )
        return ProcessorResult(diagnostics=[unknown])


def default_processor_registry() -> ProcessorRegistry:
    """Return a fresh registry holding the built-in processors.

    The built-ins are ``identity``, ``uppercase`` and ``include``.
    """

    builtins = (
        ("identity", _identity_processor),
        ("uppercase", _uppercase_processor),
        ("include", _include_processor),
    )
    registry = ProcessorRegistry()
    for name, handler in builtins:
        registry.register(name, handler)
    return registry


def discover_fenced_processors(
    markdown: str,
    *,
    source_path: str | Path | None = None,
) -> list[FencedProcessorBlock]:
    """Collect the fenced blocks in ``markdown`` that name a processor.

    Args:
        markdown: Markdown text to scan.
        source_path: Optional origin of the text, recorded on each block.

    Returns:
        One ``FencedProcessorBlock`` per opted-in fence, in document order.
        Fences for which no processor name can be derived are skipped.
    """

    md = MarkdownIt("commonmark", {"tables": True}).enable("table")
    origin = str(source_path) if source_path else None
    # Keys that become dedicated fields instead of staying in ``attrs``.
    reserved = {"id", "language", "processor"}
    seen_ids: dict[str, int] = {}
    found: list[FencedProcessorBlock] = []

    for position, token in enumerate(md.parse(markdown)):
        if token.type != "fence":
            continue
        info_attrs = _parse_fence_info(token.info)
        name = _processor_name(info_attrs)
        if not name:
            continue

        # Without an explicit id, fall back to the processor name plus the
        # token's position in the parsed stream.
        requested_id = info_attrs.get("id") or f"{name}-{position}"
        unit_id = _dedupe_id(_slug(requested_id), seen_ids)

        if token.map:
            # ``token.map`` holds zero-based line numbers; the start is
            # shifted to one-based, the end is used as-is.
            first_line = token.map[0] + 1
            last_line = token.map[1]
        else:
            first_line = None
            last_line = None

        extra_attrs = {
            key: value
            for key, value in info_attrs.items()
            if key not in reserved
        }
        block = FencedProcessorBlock(
            processor=name,
            content=token.content,
            unit_id=unit_id,
            attrs=extra_attrs,
            language=info_attrs.get("language"),
            source_path=origin,
            line_start=first_line,
            line_end=last_line,
            content_hash=_hash_text(token.content),
        )
        found.append(block)
    return found


def run_fenced_processors(
    markdown: str,
    *,
    context: ProcessorContext,
    registry: ProcessorRegistry | None = None,
    source_path: str | Path | None = None,
) -> ProcessorRun:
    """Discover and execute every processor block in ``markdown``.

    Args:
        markdown: Markdown text to process.
        context: Execution context handed to each processor.
        registry: Registry to dispatch through; the default registry is
            built when none is given.
        source_path: Origin of the text; falls back to
            ``context.current_path`` when not given.

    Returns:
        A ``ProcessorRun`` with the discovered blocks and one result per
        block, in document order.
    """

    active_registry = registry or default_processor_registry()
    # The explicit argument wins; otherwise use the context's current path.
    origin = source_path or context.current_path
    blocks = discover_fenced_processors(markdown, source_path=origin)

    results = []
    for block in blocks:
        request = ProcessorRequest(block=block, context=context)
        results.append(active_registry.run(request))

    return ProcessorRun(
        source_path=str(origin) if origin else None,
        blocks=blocks,
        results=results,
    )


def _identity_processor(request: ProcessorRequest) -> ProcessorResult:
    """Return the block content unchanged, with one provenance event."""
    block = request.block
    trace = OperationProvenance(
        operation="processor.identity",
        source_path=block.source_path,
        line_start=block.line_start,
        line_end=block.line_end,
        metadata={"unit_id": block.unit_id},
    )
    return ProcessorResult(content=block.content, provenance=[trace])


def _uppercase_processor(request: ProcessorRequest) -> ProcessorResult:
    """Return the block content upper-cased, with one provenance event."""
    block = request.block
    shouted = block.content.upper()
    trace = OperationProvenance(
        operation="processor.uppercase",
        source_path=block.source_path,
        line_start=block.line_start,
        line_end=block.line_end,
        metadata={"unit_id": block.unit_id},
    )
    return ProcessorResult(content=shouted, provenance=[trace])


def _include_processor(request: ProcessorRequest) -> ProcessorResult:
    """Replace the block with the text of the units its ``ref`` resolves to.

    The ``ref`` attribute is resolved against the request's reference
    context; the texts of the resolved units are joined with blank lines.
    A missing ``ref`` or a ``ReferenceResolutionError`` is reported as an
    error diagnostic on the result instead of being raised.
    """
    block = request.block

    def failure(code: str, message: str) -> ProcessorResult:
        # Both failure modes share the same envelope: a single error
        # diagnostic pointing at the first line of the block.
        location = SourceLocation(path=block.source_path, line=block.line_start)
        return ProcessorResult(
            diagnostics=[
                Diagnostic(
                    severity="error",
                    code=code,
                    message=message,
                    source=location,
                )
            ]
        )

    reference = block.attrs.get("ref")
    if not reference:
        return failure(
            "processor.include.missing_ref",
            "Include processor requires a `ref` attribute",
        )

    try:
        resolution = resolve_reference(reference, context=request.context.reference_context())
    except ReferenceResolutionError as exc:
        return failure("processor.include.reference_error", str(exc))

    texts = [unit.text for unit in resolution.units]
    unit_ids = [unit.unit_id for unit in resolution.units]
    target = resolution.target_path
    trace = OperationProvenance(
        operation="processor.include",
        source_path=block.source_path,
        line_start=block.line_start,
        line_end=block.line_end,
        target_path=target,
        dependencies=[target],
        metadata={"ref": reference, "unit_ids": unit_ids},
    )
    return ProcessorResult(
        content="\n\n".join(texts),
        dependencies=[target],
        provenance=[trace],
    )


def _processor_name(attrs: dict[str, str]) -> str | None:
    if "processor" in attrs:
        return attrs["processor"]
    language = attrs.get("language", "")
    if language.startswith("mkt-"):
        return language.removeprefix("mkt-")
    if language == "mkt" and "type" in attrs:
        return attrs["type"]
    return None


def _parse_fence_info(info: str) -> dict[str, str]:
    """Split a fence info string into its language and brace attributes.

    Accepted shapes are ``lang``, ``lang {attrs}``, ``lang{attrs}`` and
    ``{attrs}``. The language, when present, is stored under the
    ``language`` key next to the parsed attributes.

    Args:
        info: Raw info string following the opening fence.

    Returns:
        The parsed attributes. An info string that fits none of the
        accepted shapes is returned whole as the ``language``; an empty
        info string yields an empty dict.
    """
    text = info.strip()
    # Whitespace before the brace group is optional: the text is already
    # stripped, so requiring it would make an attribute-only info string
    # (``{processor=x}``) unmatchable and turn it into a bogus language.
    match = re.match(r"^(?P<language>[^\s{]+)?(?:\s*\{(?P<attrs>.*)\})?\s*$", text)
    if not match:
        return {"language": text} if text else {}
    attrs = _parse_attrs(match.group("attrs") or "")
    language = match.group("language")
    if language:
        # The positional language wins over a ``language=`` attribute.
        attrs["language"] = language
    return attrs


def _parse_attrs(raw: str) -> dict[str, str]:
    attrs: dict[str, str] = {}
    for part in shlex.split(raw):
        if part.startswith("#") and len(part) > 1:
            attrs["id"] = part[1:]
            continue
        if "=" not in part:
            attrs[part] = "true"
            continue
        key, value = part.split("=", 1)
        attrs[key.strip()] = value.strip()
    return attrs


def _dedupe_id(unit_id: str, used_ids: dict[str, int]) -> str:
    count = used_ids.get(unit_id, 0) + 1
    used_ids[unit_id] = count
    return unit_id if count == 1 else f"{unit_id}-{count}"


def _slug(value: str) -> str:
    slug = re.sub(r"[^a-z0-9_.:-]+", "-", value.strip().lower())
    slug = re.sub(r"-+", "-", slug).strip("-")
    return slug


def _hash_text(text: str) -> str:
    return "sha256:" + hashlib.sha256(text.encode("utf-8")).hexdigest()