generated from coulomb/repo-seed
375 lines
12 KiB
Python
375 lines
12 KiB
Python
"""Processor API for deterministic fenced-block workflows."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import re
|
|
import shlex
|
|
from dataclasses import asdict, dataclass, field
|
|
from pathlib import Path
|
|
from typing import Any, Callable
|
|
|
|
from markdown_it import MarkdownIt
|
|
|
|
from markitect_tool.diagnostics import Diagnostic, SourceLocation
|
|
from markitect_tool.ops import OperationProvenance
|
|
from markitect_tool.reference import (
|
|
ReferenceContext,
|
|
ReferenceResolutionError,
|
|
resolve_reference,
|
|
)
|
|
|
|
|
|
# Call signature shared by registered processors: takes one ProcessorRequest
# and returns one ProcessorResult. The names are quoted because both classes
# are defined further down in this module.
ProcessorCallable = Callable[["ProcessorRequest"], "ProcessorResult"]
|
|
|
|
|
|
@dataclass(frozen=True)
class FencedProcessorBlock:
    """Immutable record of a fenced Markdown block claimed by a processor.

    Only ``processor``, ``content``, ``unit_id`` and ``attrs`` are required;
    the location and hash fields default to "unknown" (``None`` / ``""``).
    """

    processor: str
    content: str
    unit_id: str
    attrs: dict[str, str]
    language: str | None = None
    source_path: str | None = None
    line_start: int | None = None
    line_end: int | None = None
    content_hash: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a plain dict, leaving out empty values.

        A field is omitted when its value equals ``None``, ``{}`` or ``""``.
        """
        compact: dict[str, Any] = {}
        for name, value in asdict(self).items():
            if value in (None, {}, ""):
                continue
            compact[name] = value
        return compact
|
|
|
|
|
|
@dataclass(frozen=True)
class ProcessorContext:
    """Immutable execution settings handed to deterministic processors."""

    root: Path = Path(".")
    current_path: Path | None = None
    namespaces: dict[str, str] = field(default_factory=dict)
    variables: dict[str, Any] = field(default_factory=dict)
    policy: dict[str, Any] = field(default_factory=dict)

    def reference_context(self) -> ReferenceContext:
        """Build a ``ReferenceContext`` from root, current path and namespaces."""
        return ReferenceContext(root=self.root, current_path=self.current_path, namespaces=self.namespaces)

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict view with paths rendered as strings.

        Entries whose value equals ``None``, ``{}`` or ``""`` are omitted.
        """
        current = str(self.current_path) if self.current_path else None
        candidates = (
            ("root", str(self.root)),
            ("current_path", current),
            ("namespaces", self.namespaces),
            ("variables", self.variables),
            ("policy", self.policy),
        )
        return {name: value for name, value in candidates if value not in (None, {}, "")}
|
|
|
|
|
|
@dataclass(frozen=True)
class ProcessorRequest:
    """A single processor invocation: one block plus the context to run it in."""

    # The fenced block to be processed.
    block: FencedProcessorBlock
    # Execution context that accompanies the block.
    context: ProcessorContext
|
|
|
|
|
|
@dataclass(frozen=True)
class ProcessorOutputFile:
    """Describes a generated file that a processor asks for."""

    # Path of the requested file.
    path: str
    # Text the file should contain.
    content: str

    def to_dict(self) -> dict[str, Any]:
        """Return the dataclass fields as a plain dict."""
        payload: dict[str, Any] = asdict(self)
        return payload
|
|
|
|
|
|
@dataclass(frozen=True)
class ProcessorResult:
    """Envelope holding everything one deterministic processor run produced."""

    content: str | None = None
    files: list[ProcessorOutputFile] = field(default_factory=list)
    diagnostics: list[Diagnostic] = field(default_factory=list)
    dependencies: list[str] = field(default_factory=list)
    provenance: list[OperationProvenance] = field(default_factory=list)

    @property
    def valid(self) -> bool:
        """``True`` unless some diagnostic has severity ``"error"``."""
        for diagnostic in self.diagnostics:
            if diagnostic.severity == "error":
                return False
        return True

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dict, dropping ``None``, ``[]`` and ``{}`` values.

        ``valid`` is always present because a boolean never matches the
        dropped values.
        """
        serialized: dict[str, Any] = {"valid": self.valid, "content": self.content}
        serialized["files"] = [item.to_dict() for item in self.files]
        serialized["diagnostics"] = [item.to_dict() for item in self.diagnostics]
        serialized["dependencies"] = self.dependencies
        serialized["provenance"] = [item.to_dict() for item in self.provenance]
        return {name: value for name, value in serialized.items() if value not in (None, [], {})}
|
|
|
|
|
|
@dataclass(frozen=True)
class ProcessorRun:
    """Outcome of running every processor block found in one document."""

    source_path: str | None
    blocks: list[FencedProcessorBlock]
    results: list[ProcessorResult]

    @property
    def valid(self) -> bool:
        """``True`` when every result is valid (also when there are none)."""
        for result in self.results:
            if not result.valid:
                return False
        return True

    def to_dict(self) -> dict[str, Any]:
        """Serialize the run; ``count`` is the number of results."""
        summary: dict[str, Any] = {"valid": self.valid}
        summary["source_path"] = self.source_path
        summary["count"] = len(self.results)
        summary["blocks"] = [entry.to_dict() for entry in self.blocks]
        summary["results"] = [entry.to_dict() for entry in self.results]
        return summary
|
|
|
|
|
|
class ProcessorRegistry:
    """Explicit lookup table of deterministic fenced-block processors."""

    def __init__(self) -> None:
        self._processors: dict[str, ProcessorCallable] = {}

    def register(self, name: str, processor: ProcessorCallable) -> None:
        """Store ``processor`` under the slug of ``name``.

        Registering the same slug again replaces the earlier entry.

        Raises:
            ValueError: if ``name`` slugs to an empty string.
        """
        normalized = _slug(name)
        if normalized:
            self._processors[normalized] = processor
            return
        raise ValueError("Processor name cannot be empty")

    def names(self) -> list[str]:
        """Return the registered (slugged) names in sorted order."""
        return sorted(self._processors.keys())

    def run(self, request: ProcessorRequest) -> ProcessorResult:
        """Dispatch ``request`` to the processor named by its block.

        An unregistered name does not raise; it produces a result carrying a
        single ``processor.unknown`` error diagnostic located at the block.
        """
        block = request.block
        handler = self._processors.get(_slug(block.processor))
        if handler is not None:
            return handler(request)
        unknown = Diagnostic(
            severity="error",
            code="processor.unknown",
            message=f"Unknown processor `{block.processor}`",
            source=SourceLocation(path=block.source_path, line=block.line_start),
        )
        return ProcessorResult(diagnostics=[unknown])
|
|
|
|
|
|
def default_processor_registry() -> ProcessorRegistry:
    """Build a fresh registry holding the built-in deterministic processors."""
    builtin = (
        ("identity", _identity_processor),
        ("uppercase", _uppercase_processor),
        ("include", _include_processor),
    )
    registry = ProcessorRegistry()
    for name, handler in builtin:
        registry.register(name, handler)
    return registry
|
|
|
|
|
|
def discover_fenced_processors(
    markdown: str,
    *,
    source_path: str | Path | None = None,
) -> list[FencedProcessorBlock]:
    """Collect the fenced blocks in ``markdown`` that opt into a processor.

    Fences whose info string names no processor are skipped. Each returned
    block gets a unit id (the slugged ``id`` attribute, else
    ``<processor>-<token index>``, de-duplicated within the document), 1-based
    line numbers when the parser supplies a line map, and a hash of its
    content.
    """
    parser = MarkdownIt("commonmark", {"tables": True}).enable("table")
    origin = str(source_path) if source_path else None
    # These keys get dedicated fields on the block and are kept out of `attrs`.
    reserved = {"id", "language", "processor"}
    seen_ids: dict[str, int] = {}
    found: list[FencedProcessorBlock] = []

    for position, token in enumerate(parser.parse(markdown)):
        if token.type != "fence":
            continue
        info = _parse_fence_info(token.info)
        name = _processor_name(info)
        if not name:
            continue

        requested_id = info.get("id") or f"{name}-{position}"
        if token.map:
            # The first map entry is incremented to give a 1-based start line;
            # the second entry is used unchanged as the end line.
            first_line, last_line = token.map[0] + 1, token.map[1]
        else:
            first_line = last_line = None

        found.append(
            FencedProcessorBlock(
                processor=name,
                content=token.content,
                unit_id=_dedupe_id(_slug(requested_id), seen_ids),
                attrs={key: value for key, value in info.items() if key not in reserved},
                language=info.get("language"),
                source_path=origin,
                line_start=first_line,
                line_end=last_line,
                content_hash=_hash_text(token.content),
            )
        )
    return found
|
|
|
|
|
|
def run_fenced_processors(
    markdown: str,
    *,
    context: ProcessorContext,
    registry: ProcessorRegistry | None = None,
    source_path: str | Path | None = None,
) -> ProcessorRun:
    """Discover and execute every processor block, in document order.

    ``source_path`` falls back to ``context.current_path`` when not given, and
    the default registry is used when ``registry`` is omitted.
    """
    effective_path = source_path or context.current_path
    runner = registry or default_processor_registry()
    blocks = discover_fenced_processors(markdown, source_path=effective_path)
    # Requests are built lazily so each one is created just before it runs.
    requests = (ProcessorRequest(block=item, context=context) for item in blocks)
    outcomes = [runner.run(pending) for pending in requests]
    return ProcessorRun(
        source_path=str(effective_path) if effective_path else None,
        blocks=blocks,
        results=outcomes,
    )
|
|
|
|
|
|
def _identity_processor(request: ProcessorRequest) -> ProcessorResult:
    """Return the block's content unchanged, with one provenance record."""
    block = request.block
    trace = OperationProvenance(
        operation="processor.identity",
        source_path=block.source_path,
        line_start=block.line_start,
        line_end=block.line_end,
        metadata={"unit_id": block.unit_id},
    )
    return ProcessorResult(content=block.content, provenance=[trace])
|
|
|
|
|
|
def _uppercase_processor(request: ProcessorRequest) -> ProcessorResult:
    """Return the block's content upper-cased, with one provenance record."""
    block = request.block
    shouted = block.content.upper()
    trace = OperationProvenance(
        operation="processor.uppercase",
        source_path=block.source_path,
        line_start=block.line_start,
        line_end=block.line_end,
        metadata={"unit_id": block.unit_id},
    )
    return ProcessorResult(content=shouted, provenance=[trace])
|
|
|
|
|
|
def _include_processor(request: ProcessorRequest) -> ProcessorResult:
    """Resolve the block's ``ref`` attribute and return the referenced text.

    The texts of the resolved units are joined with blank lines. A missing
    ``ref`` or a ``ReferenceResolutionError`` is reported as an error
    diagnostic on the result instead of being raised.
    """
    block = request.block

    def _failure(code: str, message: str) -> ProcessorResult:
        # Wrap one error diagnostic, located at the block, in a result.
        problem = Diagnostic(
            severity="error",
            code=code,
            message=message,
            source=SourceLocation(path=block.source_path, line=block.line_start),
        )
        return ProcessorResult(diagnostics=[problem])

    reference = block.attrs.get("ref")
    if not reference:
        return _failure(
            "processor.include.missing_ref",
            "Include processor requires a `ref` attribute",
        )

    try:
        resolution = resolve_reference(reference, context=request.context.reference_context())
    except ReferenceResolutionError as exc:
        return _failure("processor.include.reference_error", str(exc))

    merged = "\n\n".join(unit.text for unit in resolution.units)
    trace = OperationProvenance(
        operation="processor.include",
        source_path=block.source_path,
        line_start=block.line_start,
        line_end=block.line_end,
        target_path=resolution.target_path,
        dependencies=[resolution.target_path],
        metadata={"ref": reference, "unit_ids": [unit.unit_id for unit in resolution.units]},
    )
    return ProcessorResult(
        content=merged,
        dependencies=[resolution.target_path],
        provenance=[trace],
    )
|
|
|
|
|
|
def _processor_name(attrs: dict[str, str]) -> str | None:
|
|
if "processor" in attrs:
|
|
return attrs["processor"]
|
|
language = attrs.get("language", "")
|
|
if language.startswith("mkt-"):
|
|
return language.removeprefix("mkt-")
|
|
if language == "mkt" and "type" in attrs:
|
|
return attrs["type"]
|
|
return None
|
|
|
|
|
|
def _parse_fence_info(info: str) -> dict[str, str]:
|
|
match = re.match(r"^(?P<language>[^\s{]+)?(?:\s+\{(?P<attrs>.*)\})?\s*$", info.strip())
|
|
if not match:
|
|
return {"language": info.strip()} if info.strip() else {}
|
|
attrs = _parse_attrs(match.group("attrs") or "")
|
|
language = match.group("language")
|
|
if language:
|
|
attrs["language"] = language
|
|
return attrs
|
|
|
|
|
|
def _parse_attrs(raw: str) -> dict[str, str]:
|
|
attrs: dict[str, str] = {}
|
|
for part in shlex.split(raw):
|
|
if part.startswith("#") and len(part) > 1:
|
|
attrs["id"] = part[1:]
|
|
continue
|
|
if "=" not in part:
|
|
attrs[part] = "true"
|
|
continue
|
|
key, value = part.split("=", 1)
|
|
attrs[key.strip()] = value.strip()
|
|
return attrs
|
|
|
|
|
|
def _dedupe_id(unit_id: str, used_ids: dict[str, int]) -> str:
|
|
count = used_ids.get(unit_id, 0) + 1
|
|
used_ids[unit_id] = count
|
|
return unit_id if count == 1 else f"{unit_id}-{count}"
|
|
|
|
|
|
def _slug(value: str) -> str:
|
|
slug = re.sub(r"[^a-z0-9_.:-]+", "-", value.strip().lower())
|
|
slug = re.sub(r"-+", "-", slug).strip("-")
|
|
return slug
|
|
|
|
|
|
def _hash_text(text: str) -> str:
|
|
return "sha256:" + hashlib.sha256(text.encode("utf-8")).hexdigest()
|