"""Processor API for deterministic fenced-block workflows.""" from __future__ import annotations import hashlib import re import shlex from dataclasses import asdict, dataclass, field from pathlib import Path from typing import Any, Callable from markdown_it import MarkdownIt from markitect_tool.diagnostics import Diagnostic, SourceLocation from markitect_tool.ops import OperationProvenance from markitect_tool.reference import ( ReferenceContext, ReferenceResolutionError, resolve_reference, ) ProcessorCallable = Callable[["ProcessorRequest"], "ProcessorResult"] @dataclass(frozen=True) class FencedProcessorBlock: """A fenced Markdown block that opted into processor handling.""" processor: str content: str unit_id: str attrs: dict[str, str] language: str | None = None source_path: str | None = None line_start: int | None = None line_end: int | None = None content_hash: str = "" def to_dict(self) -> dict[str, Any]: return {key: value for key, value in asdict(self).items() if value not in (None, {}, "")} @dataclass(frozen=True) class ProcessorContext: """Execution context passed to deterministic processors.""" root: Path = Path(".") current_path: Path | None = None namespaces: dict[str, str] = field(default_factory=dict) variables: dict[str, Any] = field(default_factory=dict) policy: dict[str, Any] = field(default_factory=dict) def reference_context(self) -> ReferenceContext: return ReferenceContext( root=self.root, current_path=self.current_path, namespaces=self.namespaces, ) def to_dict(self) -> dict[str, Any]: data = { "root": str(self.root), "current_path": str(self.current_path) if self.current_path else None, "namespaces": self.namespaces, "variables": self.variables, "policy": self.policy, } return {key: value for key, value in data.items() if value not in (None, {}, "")} @dataclass(frozen=True) class ProcessorRequest: """One processor invocation.""" block: FencedProcessorBlock context: ProcessorContext @dataclass(frozen=True) class ProcessorOutputFile: """A generated file requested by a processor.""" path: str content: str def to_dict(self) -> dict[str, Any]: return asdict(self) @dataclass(frozen=True) class ProcessorResult: """Deterministic processor result envelope.""" content: str | None = None files: list[ProcessorOutputFile] = field(default_factory=list) diagnostics: list[Diagnostic] = field(default_factory=list) dependencies: list[str] = field(default_factory=list) provenance: list[OperationProvenance] = field(default_factory=list) @property def valid(self) -> bool: return not any(diagnostic.severity == "error" for diagnostic in self.diagnostics) def to_dict(self) -> dict[str, Any]: data = { "valid": self.valid, "content": self.content, "files": [file.to_dict() for file in self.files], "diagnostics": [diagnostic.to_dict() for diagnostic in self.diagnostics], "dependencies": self.dependencies, "provenance": [event.to_dict() for event in self.provenance], } return {key: value for key, value in data.items() if value not in (None, [], {})} @dataclass(frozen=True) class ProcessorRun: """Results from running all processor blocks in a document.""" source_path: str | None blocks: list[FencedProcessorBlock] results: list[ProcessorResult] @property def valid(self) -> bool: return all(result.valid for result in self.results) def to_dict(self) -> dict[str, Any]: return { "valid": self.valid, "source_path": self.source_path, "count": len(self.results), "blocks": [block.to_dict() for block in self.blocks], "results": [result.to_dict() for result in self.results], } class ProcessorRegistry: """Explicit registry for deterministic fenced-block processors.""" def __init__(self) -> None: self._processors: dict[str, ProcessorCallable] = {} def register(self, name: str, processor: ProcessorCallable) -> None: key = _slug(name) if not key: raise ValueError("Processor name cannot be empty") self._processors[key] = processor def names(self) -> list[str]: return sorted(self._processors) def run(self, request: ProcessorRequest) -> ProcessorResult: processor = self._processors.get(_slug(request.block.processor)) if processor is None: return ProcessorResult( diagnostics=[ Diagnostic( severity="error", code="processor.unknown", message=f"Unknown processor `{request.block.processor}`", source=SourceLocation( path=request.block.source_path, line=request.block.line_start, ), ) ] ) return processor(request) def default_processor_registry() -> ProcessorRegistry: """Create the default deterministic processor registry.""" registry = ProcessorRegistry() registry.register("identity", _identity_processor) registry.register("uppercase", _uppercase_processor) registry.register("include", _include_processor) return registry def discover_fenced_processors( markdown: str, *, source_path: str | Path | None = None, ) -> list[FencedProcessorBlock]: """Discover fenced blocks that explicitly opt into processor handling.""" parser = MarkdownIt("commonmark", {"tables": True}).enable("table") blocks: list[FencedProcessorBlock] = [] used_ids: dict[str, int] = {} for index, token in enumerate(parser.parse(markdown)): if token.type != "fence": continue attrs = _parse_fence_info(token.info) processor = _processor_name(attrs) if not processor: continue unit_id = _dedupe_id(_slug(attrs.get("id") or f"{processor}-{index}"), used_ids) line_start = token.map[0] + 1 if token.map else None line_end = token.map[1] if token.map else None blocks.append( FencedProcessorBlock( processor=processor, content=token.content, unit_id=unit_id, attrs={ key: value for key, value in attrs.items() if key not in {"id", "language", "processor"} }, language=attrs.get("language"), source_path=str(source_path) if source_path else None, line_start=line_start, line_end=line_end, content_hash=_hash_text(token.content), ) ) return blocks def run_fenced_processors( markdown: str, *, context: ProcessorContext, registry: ProcessorRegistry | None = None, source_path: str | Path | None = None, ) -> ProcessorRun: """Run all processor-marked fenced blocks in document order.""" active_registry = registry or default_processor_registry() blocks = discover_fenced_processors(markdown, source_path=source_path or context.current_path) results = [ active_registry.run(ProcessorRequest(block=block, context=context)) for block in blocks ] return ProcessorRun( source_path=str(source_path or context.current_path) if source_path or context.current_path else None, blocks=blocks, results=results, ) def _identity_processor(request: ProcessorRequest) -> ProcessorResult: return ProcessorResult( content=request.block.content, provenance=[ OperationProvenance( operation="processor.identity", source_path=request.block.source_path, line_start=request.block.line_start, line_end=request.block.line_end, metadata={"unit_id": request.block.unit_id}, ) ], ) def _uppercase_processor(request: ProcessorRequest) -> ProcessorResult: return ProcessorResult( content=request.block.content.upper(), provenance=[ OperationProvenance( operation="processor.uppercase", source_path=request.block.source_path, line_start=request.block.line_start, line_end=request.block.line_end, metadata={"unit_id": request.block.unit_id}, ) ], ) def _include_processor(request: ProcessorRequest) -> ProcessorResult: reference = request.block.attrs.get("ref") if not reference: return ProcessorResult( diagnostics=[ Diagnostic( severity="error", code="processor.include.missing_ref", message="Include processor requires a `ref` attribute", source=SourceLocation( path=request.block.source_path, line=request.block.line_start, ), ) ] ) try: resolution = resolve_reference(reference, context=request.context.reference_context()) except ReferenceResolutionError as exc: return ProcessorResult( diagnostics=[ Diagnostic( severity="error", code="processor.include.reference_error", message=str(exc), source=SourceLocation( path=request.block.source_path, line=request.block.line_start, ), ) ] ) content = "\n\n".join(unit.text for unit in resolution.units) return ProcessorResult( content=content, dependencies=[resolution.target_path], provenance=[ OperationProvenance( operation="processor.include", source_path=request.block.source_path, line_start=request.block.line_start, line_end=request.block.line_end, target_path=resolution.target_path, dependencies=[resolution.target_path], metadata={"ref": reference, "unit_ids": [unit.unit_id for unit in resolution.units]}, ) ], ) def _processor_name(attrs: dict[str, str]) -> str | None: if "processor" in attrs: return attrs["processor"] language = attrs.get("language", "") if language.startswith("mkt-"): return language.removeprefix("mkt-") if language == "mkt" and "type" in attrs: return attrs["type"] return None def _parse_fence_info(info: str) -> dict[str, str]: match = re.match(r"^(?P[^\s{]+)?(?:\s+\{(?P.*)\})?\s*$", info.strip()) if not match: return {"language": info.strip()} if info.strip() else {} attrs = _parse_attrs(match.group("attrs") or "") language = match.group("language") if language: attrs["language"] = language return attrs def _parse_attrs(raw: str) -> dict[str, str]: attrs: dict[str, str] = {} for part in shlex.split(raw): if part.startswith("#") and len(part) > 1: attrs["id"] = part[1:] continue if "=" not in part: attrs[part] = "true" continue key, value = part.split("=", 1) attrs[key.strip()] = value.strip() return attrs def _dedupe_id(unit_id: str, used_ids: dict[str, int]) -> str: count = used_ids.get(unit_id, 0) + 1 used_ids[unit_id] = count return unit_id if count == 1 else f"{unit_id}-{count}" def _slug(value: str) -> str: slug = re.sub(r"[^a-z0-9_.:-]+", "-", value.strip().lower()) slug = re.sub(r"-+", "-", slug).strip("-") return slug def _hash_text(text: str) -> str: return "sha256:" + hashlib.sha256(text.encode("utf-8")).hexdigest()