extension for ref resolve, explode, implode, weave, tangle

This commit is contained in:
2026-05-04 02:25:49 +02:00
parent 8203f50fd5
commit 65bfc1aebf
39 changed files with 3959 additions and 25 deletions

View File

@@ -0,0 +1,27 @@
"""Deterministic fenced-block processor registry."""
from markitect_tool.processor.engine import (
FencedProcessorBlock,
ProcessorContext,
ProcessorOutputFile,
ProcessorRegistry,
ProcessorRequest,
ProcessorResult,
ProcessorRun,
default_processor_registry,
discover_fenced_processors,
run_fenced_processors,
)
# Public API of this package: every name listed here is imported above from
# ``markitect_tool.processor.engine`` and re-exported unchanged.
__all__ = [
"FencedProcessorBlock",
"ProcessorContext",
"ProcessorOutputFile",
"ProcessorRegistry",
"ProcessorRequest",
"ProcessorResult",
"ProcessorRun",
"default_processor_registry",
"discover_fenced_processors",
"run_fenced_processors",
]

View File

@@ -0,0 +1,374 @@
"""Processor API for deterministic fenced-block workflows."""
from __future__ import annotations
import hashlib
import re
import shlex
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any, Callable
from markdown_it import MarkdownIt
from markitect_tool.diagnostics import Diagnostic, SourceLocation
from markitect_tool.ops import OperationProvenance
from markitect_tool.reference import (
ReferenceContext,
ReferenceResolutionError,
resolve_reference,
)
# Signature of a processor: it takes one ``ProcessorRequest`` and returns one
# ``ProcessorResult``. Forward references are quoted because both classes are
# defined further down in this module.
ProcessorCallable = Callable[["ProcessorRequest"], "ProcessorResult"]
@dataclass(frozen=True)
class FencedProcessorBlock:
    """Immutable record of one fenced Markdown block that requested a processor."""

    # Name of the processor the fence asked for.
    processor: str
    # Text of the fenced block.
    content: str
    # Identifier of this block.
    unit_id: str
    # Fence attributes carried along with the block.
    attrs: dict[str, str]
    # Optional metadata; each entry is left out of ``to_dict`` while unset.
    language: str | None = None
    source_path: str | None = None
    line_start: int | None = None
    line_end: int | None = None
    content_hash: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a plain dict, skipping ``None``, ``{}`` and ``""`` values."""
        skipped = (None, {}, "")
        payload: dict[str, Any] = {}
        for name, item in asdict(self).items():
            if item in skipped:
                continue
            payload[name] = item
        return payload
@dataclass(frozen=True)
class ProcessorContext:
    """Immutable execution context handed to every processor invocation."""

    # Root directory; defaults to the current directory.
    root: Path = Path(".")
    # Path of the document being processed, when known.
    current_path: Path | None = None
    namespaces: dict[str, str] = field(default_factory=dict)
    variables: dict[str, Any] = field(default_factory=dict)
    policy: dict[str, Any] = field(default_factory=dict)

    def reference_context(self) -> ReferenceContext:
        """Build a ``ReferenceContext`` from ``root``, ``current_path`` and ``namespaces``."""
        settings = {
            "root": self.root,
            "current_path": self.current_path,
            "namespaces": self.namespaces,
        }
        return ReferenceContext(**settings)

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict view with paths as strings, skipping ``None``, ``{}`` and ``""``."""
        current = None
        if self.current_path:
            current = str(self.current_path)
        candidates = (
            ("root", str(self.root)),
            ("current_path", current),
            ("namespaces", self.namespaces),
            ("variables", self.variables),
            ("policy", self.policy),
        )
        skipped = (None, {}, "")
        return {name: item for name, item in candidates if item not in skipped}
@dataclass(frozen=True)
class ProcessorRequest:
    """Immutable pairing of one fenced block with the context it runs in."""

    # The fenced block to process.
    block: FencedProcessorBlock
    # The execution context supplied for this invocation.
    context: ProcessorContext
@dataclass(frozen=True)
class ProcessorOutputFile:
    """Immutable description of a file a processor wants generated."""

    # Path of the requested file.
    path: str
    # Text content of the requested file.
    content: str

    def to_dict(self) -> dict[str, Any]:
        """Return ``path`` and ``content`` as a plain dict."""
        serialized: dict[str, Any] = asdict(self)
        return serialized
@dataclass(frozen=True)
class ProcessorResult:
    """Immutable envelope holding everything a processor produced."""

    content: str | None = None
    files: list[ProcessorOutputFile] = field(default_factory=list)
    diagnostics: list[Diagnostic] = field(default_factory=list)
    dependencies: list[str] = field(default_factory=list)
    provenance: list[OperationProvenance] = field(default_factory=list)

    @property
    def valid(self) -> bool:
        """``True`` unless at least one diagnostic has severity ``"error"``."""
        for diagnostic in self.diagnostics:
            if diagnostic.severity == "error":
                return False
        return True

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict view, skipping entries equal to ``None``, ``[]`` or ``{}``."""
        candidates = (
            ("valid", self.valid),
            ("content", self.content),
            ("files", [item.to_dict() for item in self.files]),
            ("diagnostics", [item.to_dict() for item in self.diagnostics]),
            ("dependencies", self.dependencies),
            ("provenance", [item.to_dict() for item in self.provenance]),
        )
        skipped = (None, [], {})
        return {name: item for name, item in candidates if item not in skipped}
@dataclass(frozen=True)
class ProcessorRun:
    """Immutable record of the blocks found in one document and their results."""

    source_path: str | None
    blocks: list[FencedProcessorBlock]
    results: list[ProcessorResult]

    @property
    def valid(self) -> bool:
        """``True`` when every result is valid (also for an empty run)."""
        for result in self.results:
            if not result.valid:
                return False
        return True

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict view; ``count`` is the number of results."""
        serialized_blocks = [block.to_dict() for block in self.blocks]
        serialized_results = [result.to_dict() for result in self.results]
        summary: dict[str, Any] = {}
        summary["valid"] = self.valid
        summary["source_path"] = self.source_path
        summary["count"] = len(self.results)
        summary["blocks"] = serialized_blocks
        summary["results"] = serialized_results
        return summary
class ProcessorRegistry:
    """Name-to-callable table of fenced-block processors, filled by explicit registration."""

    def __init__(self) -> None:
        # Keys are slugged processor names.
        self._processors: dict[str, ProcessorCallable] = {}

    def register(self, name: str, processor: ProcessorCallable) -> None:
        """Store ``processor`` under the slug of ``name``.

        An existing entry with the same slug is replaced.

        Raises:
            ValueError: if ``name`` slugs to an empty string.
        """
        normalized = _slug(name)
        if not normalized:
            raise ValueError("Processor name cannot be empty")
        self._processors[normalized] = processor

    def names(self) -> list[str]:
        """Return the registered (slugged) names in sorted order."""
        return sorted(self._processors.keys())

    def run(self, request: ProcessorRequest) -> ProcessorResult:
        """Invoke the processor named by ``request.block`` and return its result.

        When no processor is registered under that name, the result carries a
        single ``processor.unknown`` error diagnostic located at the block's
        source path and start line.
        """
        handler = self._processors.get(_slug(request.block.processor))
        if handler is not None:
            return handler(request)
        location = SourceLocation(
            path=request.block.source_path,
            line=request.block.line_start,
        )
        unknown = Diagnostic(
            severity="error",
            code="processor.unknown",
            message=f"Unknown processor `{request.block.processor}`",
            source=location,
        )
        return ProcessorResult(diagnostics=[unknown])
def default_processor_registry() -> ProcessorRegistry:
    """Return a new registry holding the ``identity``, ``uppercase`` and ``include`` processors."""
    registry = ProcessorRegistry()
    builtin = (
        ("identity", _identity_processor),
        ("uppercase", _uppercase_processor),
        ("include", _include_processor),
    )
    for name, handler in builtin:
        registry.register(name, handler)
    return registry
def discover_fenced_processors(
    markdown: str,
    *,
    source_path: str | Path | None = None,
) -> list[FencedProcessorBlock]:
    """Collect the fenced blocks in ``markdown`` that name a processor.

    Fences whose info string yields no processor name are skipped. Blocks are
    returned in token order.

    Args:
        markdown: Markdown text to scan.
        source_path: Optional origin of the text; stored on each block as a string.

    Returns:
        One ``FencedProcessorBlock`` per processor-marked fence.
    """
    parser = MarkdownIt("commonmark", {"tables": True}).enable("table")
    origin = str(source_path) if source_path else None
    # These keys live in dedicated fields (or are consumed here), so they are
    # dropped from the block's ``attrs``.
    reserved = {"id", "language", "processor"}
    seen_ids: dict[str, int] = {}
    found: list[FencedProcessorBlock] = []

    for index, token in enumerate(parser.parse(markdown)):
        if token.type != "fence":
            continue
        fence_attrs = _parse_fence_info(token.info)
        processor = _processor_name(fence_attrs)
        if not processor:
            continue

        # Without an explicit id, derive one from the processor name and token index.
        requested_id = fence_attrs.get("id") or f"{processor}-{index}"
        unit_id = _dedupe_id(_slug(requested_id), seen_ids)

        if token.map:
            # Start is shifted by one; the end value is used as-is.
            first_line = token.map[0] + 1
            last_line = token.map[1]
        else:
            first_line = None
            last_line = None

        extra_attrs = {
            key: value for key, value in fence_attrs.items() if key not in reserved
        }
        found.append(
            FencedProcessorBlock(
                processor=processor,
                content=token.content,
                unit_id=unit_id,
                attrs=extra_attrs,
                language=fence_attrs.get("language"),
                source_path=origin,
                line_start=first_line,
                line_end=last_line,
                content_hash=_hash_text(token.content),
            )
        )
    return found
def run_fenced_processors(
    markdown: str,
    *,
    context: ProcessorContext,
    registry: ProcessorRegistry | None = None,
    source_path: str | Path | None = None,
) -> ProcessorRun:
    """Discover the processor-marked fences in ``markdown`` and run each in order.

    Args:
        markdown: Markdown text to process.
        context: Execution context passed to every processor.
        registry: Registry to dispatch through; the default registry is used when omitted.
        source_path: Origin of the text; falls back to ``context.current_path``.

    Returns:
        A ``ProcessorRun`` pairing the discovered blocks with their results.
    """
    active_registry = registry if registry else default_processor_registry()
    origin = source_path or context.current_path
    blocks = discover_fenced_processors(markdown, source_path=origin)

    results = []
    for block in blocks:
        request = ProcessorRequest(block=block, context=context)
        results.append(active_registry.run(request))

    return ProcessorRun(
        source_path=str(origin) if origin else None,
        blocks=blocks,
        results=results,
    )
def _identity_processor(request: ProcessorRequest) -> ProcessorResult:
    """Return the block's content unchanged, with one provenance event."""
    block = request.block
    event = OperationProvenance(
        operation="processor.identity",
        source_path=block.source_path,
        line_start=block.line_start,
        line_end=block.line_end,
        metadata={"unit_id": block.unit_id},
    )
    return ProcessorResult(content=block.content, provenance=[event])
def _uppercase_processor(request: ProcessorRequest) -> ProcessorResult:
    """Return the block's content upper-cased, with one provenance event."""
    block = request.block
    event = OperationProvenance(
        operation="processor.uppercase",
        source_path=block.source_path,
        line_start=block.line_start,
        line_end=block.line_end,
        metadata={"unit_id": block.unit_id},
    )
    return ProcessorResult(content=block.content.upper(), provenance=[event])
def _include_processor(request: ProcessorRequest) -> ProcessorResult:
    """Replace the block with the text of the units its ``ref`` attribute resolves to.

    The unit texts are joined with a blank line between them. A missing
    ``ref`` attribute or a ``ReferenceResolutionError`` produces a result
    carrying a single error diagnostic instead of content.
    """
    block = request.block

    def _failure(code: str, message: str) -> ProcessorResult:
        # Single-error result located at the block's source path and start line.
        location = SourceLocation(path=block.source_path, line=block.line_start)
        problem = Diagnostic(
            severity="error",
            code=code,
            message=message,
            source=location,
        )
        return ProcessorResult(diagnostics=[problem])

    reference = block.attrs.get("ref")
    if not reference:
        return _failure(
            "processor.include.missing_ref",
            "Include processor requires a `ref` attribute",
        )

    try:
        resolution = resolve_reference(reference, context=request.context.reference_context())
    except ReferenceResolutionError as exc:
        return _failure("processor.include.reference_error", str(exc))

    content = "\n\n".join(unit.text for unit in resolution.units)
    target = resolution.target_path
    event = OperationProvenance(
        operation="processor.include",
        source_path=block.source_path,
        line_start=block.line_start,
        line_end=block.line_end,
        target_path=target,
        dependencies=[target],
        metadata={"ref": reference, "unit_ids": [unit.unit_id for unit in resolution.units]},
    )
    return ProcessorResult(content=content, dependencies=[target], provenance=[event])
def _processor_name(attrs: dict[str, str]) -> str | None:
    """Return the processor a fence asks for, or ``None`` when it names none.

    Checked in order: an explicit ``processor`` attribute, a language of the
    form ``mkt-<name>``, then language ``mkt`` together with a ``type`` attribute.
    """
    try:
        return attrs["processor"]
    except KeyError:
        pass

    language = attrs.get("language", "")
    prefix = "mkt-"
    if language.startswith(prefix):
        return language.removeprefix(prefix)
    if language != "mkt":
        return None
    return attrs.get("type")
def _parse_fence_info(info: str) -> dict[str, str]:
    """Split a fence info string into a ``language`` entry plus ``{...}`` attributes.

    Accepted shapes are ``lang``, ``lang {attrs}``, ``lang{attrs}`` and
    ``{attrs}``. Anything else is returned whole as the ``language`` value.
    An empty info string yields an empty dict.

    Args:
        info: Raw info string of a fenced code block.

    Returns:
        The parsed attributes; ``language`` is set only when a language word is present.
    """
    text = info.strip()
    # ``\s*`` (not ``\s+``) before the brace: the text is already stripped, so
    # requiring whitespace there made attribute-only info strings such as
    # ``{processor=x}`` impossible to match.
    match = re.match(r"^(?P<language>[^\s{]+)?(?:\s*\{(?P<attrs>.*)\})?\s*$", text)
    if not match:
        return {"language": text} if text else {}
    attrs = _parse_attrs(match.group("attrs") or "")
    language = match.group("language")
    if language:
        # The language word wins over a ``language=...`` attribute.
        attrs["language"] = language
    return attrs


def _parse_attrs(raw: str) -> dict[str, str]:
    """Parse the inside of a ``{...}`` attribute group into a dict.

    ``#name`` sets ``id``, ``key=value`` sets ``key``, and a bare word becomes
    a flag with the value ``"true"``. Quoting follows ``shlex`` rules.
    """
    try:
        parts = shlex.split(raw)
    except ValueError:
        # Unbalanced quote (e.g. a stray apostrophe): fall back to plain
        # whitespace splitting rather than aborting the whole document.
        parts = raw.split()

    attrs: dict[str, str] = {}
    for part in parts:
        if part.startswith("#") and len(part) > 1:
            attrs["id"] = part[1:]
            continue
        if "=" not in part:
            attrs[part] = "true"
            continue
        key, value = part.split("=", 1)
        attrs[key.strip()] = value.strip()
    return attrs
def _dedupe_id(unit_id: str, used_ids: dict[str, int]) -> str:
    """Return ``unit_id``, or a ``-N`` suffixed variant if it was already issued.

    Every id returned is recorded in ``used_ids``, including generated ones.
    This keeps a generated id such as ``a-2`` from colliding with a later
    explicit ``a-2``.

    Args:
        unit_id: Requested identifier.
        used_ids: Mutable bookkeeping shared across calls for one document.
            It maps each issued id to the highest suffix tried for it.

    Returns:
        An identifier not returned by any earlier call using the same ``used_ids``.
    """
    if unit_id not in used_ids:
        used_ids[unit_id] = 1
        return unit_id

    count = used_ids[unit_id]
    while True:
        count += 1
        candidate = f"{unit_id}-{count}"
        if candidate not in used_ids:
            break
    used_ids[unit_id] = count
    # Reserve the generated id so a later request for the same text is deduped too.
    used_ids[candidate] = 1
    return candidate
def _slug(value: str) -> str:
    """Normalise ``value`` into a lowercase identifier.

    Runs of characters outside ``a-z0-9_.:-`` become a single ``-``, repeated
    dashes are collapsed, and leading/trailing dashes are removed.
    """
    lowered = value.strip().lower()
    dashed = re.sub(r"[^a-z0-9_.:-]+", "-", lowered)
    collapsed = re.sub(r"-+", "-", dashed)
    return collapsed.strip("-")
def _hash_text(text: str) -> str:
    """Return the SHA-256 hex digest of ``text`` (UTF-8 encoded), prefixed with ``sha256:``."""
    digest = hashlib.sha256()
    digest.update(text.encode("utf-8"))
    return "sha256:" + digest.hexdigest()