generated from coulomb/repo-seed
extension for ref resolve, explode, implode, weave, tangle
This commit is contained in:
27
src/markitect_tool/processor/__init__.py
Normal file
27
src/markitect_tool/processor/__init__.py
Normal file
@@ -0,0 +1,27 @@
|
||||
"""Deterministic fenced-block processor registry."""
|
||||
|
||||
from markitect_tool.processor.engine import (
|
||||
FencedProcessorBlock,
|
||||
ProcessorContext,
|
||||
ProcessorOutputFile,
|
||||
ProcessorRegistry,
|
||||
ProcessorRequest,
|
||||
ProcessorResult,
|
||||
ProcessorRun,
|
||||
default_processor_registry,
|
||||
discover_fenced_processors,
|
||||
run_fenced_processors,
|
||||
)
|
||||
|
||||
# Public surface of the package: every name listed here is imported above
# from ``markitect_tool.processor.engine``.
__all__ = [
    # Data classes and the registry type.
    "FencedProcessorBlock",
    "ProcessorContext",
    "ProcessorOutputFile",
    "ProcessorRegistry",
    "ProcessorRequest",
    "ProcessorResult",
    "ProcessorRun",
    # Entry-point functions.
    "default_processor_registry",
    "discover_fenced_processors",
    "run_fenced_processors",
]
|
||||
374
src/markitect_tool/processor/engine.py
Normal file
374
src/markitect_tool/processor/engine.py
Normal file
@@ -0,0 +1,374 @@
|
||||
"""Processor API for deterministic fenced-block workflows."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import re
|
||||
import shlex
|
||||
from dataclasses import asdict, dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable
|
||||
|
||||
from markdown_it import MarkdownIt
|
||||
|
||||
from markitect_tool.diagnostics import Diagnostic, SourceLocation
|
||||
from markitect_tool.ops import OperationProvenance
|
||||
from markitect_tool.reference import (
|
||||
ReferenceContext,
|
||||
ReferenceResolutionError,
|
||||
resolve_reference,
|
||||
)
|
||||
|
||||
|
||||
# Call signature stored in ``ProcessorRegistry``: a processor receives one
# ``ProcessorRequest`` and returns one ``ProcessorResult``. The names are
# quoted because both classes are defined further down in this module.
ProcessorCallable = Callable[["ProcessorRequest"], "ProcessorResult"]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class FencedProcessorBlock:
    """Immutable description of one fenced block selected for a processor.

    Instances are built by ``discover_fenced_processors`` and handed to a
    processor inside a ``ProcessorRequest``.
    """

    # Name of the processor that should handle the block.
    processor: str
    # Text of the fenced block.
    content: str
    # Identifier for the block; discovery slugs and de-duplicates it.
    unit_id: str
    # Fence attributes; discovery strips ``id``, ``language`` and ``processor``.
    attrs: dict[str, str]
    # Language token of the fence info string, when one was given.
    language: str | None = None
    # Path of the document the block came from, as a string.
    source_path: str | None = None
    # Line span of the fence in the source document, when known.
    line_start: int | None = None
    line_end: int | None = None
    # Digest of ``content``; discovery fills this with a ``sha256:`` value.
    content_hash: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a dict, omitting ``None``, ``{}`` and ``""`` values."""
        compact: dict[str, Any] = {}
        for name, value in asdict(self).items():
            if value in (None, {}, ""):
                continue
            compact[name] = value
        return compact
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ProcessorContext:
    """Immutable settings shared by every processor invocation in a run."""

    # Base directory; forwarded to ``ReferenceContext`` as ``root``.
    root: Path = Path(".")
    # Path of the document being processed, if any.
    current_path: Path | None = None
    # Namespace mapping forwarded to ``ReferenceContext``.
    namespaces: dict[str, str] = field(default_factory=dict)
    # Free-form values carried alongside the request.
    variables: dict[str, Any] = field(default_factory=dict)
    # Free-form policy settings carried alongside the request.
    policy: dict[str, Any] = field(default_factory=dict)

    def reference_context(self) -> ReferenceContext:
        """Build a ``ReferenceContext`` from root, current path and namespaces."""
        return ReferenceContext(
            root=self.root,
            current_path=self.current_path,
            namespaces=self.namespaces,
        )

    def to_dict(self) -> dict[str, Any]:
        """Return a dict view with paths as strings, omitting empty entries."""
        current = str(self.current_path) if self.current_path else None
        entries = (
            ("root", str(self.root)),
            ("current_path", current),
            ("namespaces", self.namespaces),
            ("variables", self.variables),
            ("policy", self.policy),
        )
        # Drop None, empty dicts and empty strings to keep the output compact.
        return {name: value for name, value in entries if value not in (None, {}, "")}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ProcessorRequest:
    """Immutable pairing of a block with the context it should run in.

    This is the single argument passed to every ``ProcessorCallable``.
    """

    # The fenced block to process.
    block: FencedProcessorBlock
    # Execution settings for the invocation.
    context: ProcessorContext
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ProcessorOutputFile:
    """Immutable record of a file a processor asks to have generated."""

    # Destination path of the generated file.
    path: str
    # Text content of the generated file.
    content: str

    def to_dict(self) -> dict[str, Any]:
        """Return the record's fields as a plain dict."""
        payload: dict[str, Any] = asdict(self)
        return payload
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ProcessorResult:
    """Immutable envelope for everything one processor invocation produced."""

    # Replacement text produced by the processor, if any.
    content: str | None = None
    # Files the processor wants generated.
    files: list[ProcessorOutputFile] = field(default_factory=list)
    # Diagnostics raised while processing; an "error" severity marks failure.
    diagnostics: list[Diagnostic] = field(default_factory=list)
    # Dependencies the result was derived from.
    dependencies: list[str] = field(default_factory=list)
    # Provenance events describing how the result was produced.
    provenance: list[OperationProvenance] = field(default_factory=list)

    @property
    def valid(self) -> bool:
        """Return ``True`` unless some diagnostic has severity ``"error"``."""
        for diagnostic in self.diagnostics:
            if diagnostic.severity == "error":
                return False
        return True

    def to_dict(self) -> dict[str, Any]:
        """Return a dict view, omitting ``None`` values and empty lists/dicts."""
        entries = (
            ("valid", self.valid),
            ("content", self.content),
            ("files", [item.to_dict() for item in self.files]),
            ("diagnostics", [item.to_dict() for item in self.diagnostics]),
            ("dependencies", self.dependencies),
            ("provenance", [item.to_dict() for item in self.provenance]),
        )
        compact: dict[str, Any] = {}
        for name, value in entries:
            # ``False`` and ``""`` are kept; only None / [] / {} are dropped.
            if value not in (None, [], {}):
                compact[name] = value
        return compact
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ProcessorRun:
    """Immutable summary of running every processor block of one document."""

    # Path of the processed document as a string, or None when unknown.
    source_path: str | None
    # Blocks that were discovered.
    blocks: list[FencedProcessorBlock]
    # Results produced for the discovered blocks.
    results: list[ProcessorResult]

    @property
    def valid(self) -> bool:
        """Return ``True`` when every result is valid (also when there are none)."""
        for result in self.results:
            if not result.valid:
                return False
        return True

    def to_dict(self) -> dict[str, Any]:
        """Return a dict view of the run; ``count`` is the number of results."""
        serialized_blocks = [block.to_dict() for block in self.blocks]
        serialized_results = [result.to_dict() for result in self.results]
        return {
            "valid": self.valid,
            "source_path": self.source_path,
            "count": len(self.results),
            "blocks": serialized_blocks,
            "results": serialized_results,
        }
|
||||
|
||||
|
||||
class ProcessorRegistry:
    """Mapping from slugged processor names to processor callables."""

    def __init__(self) -> None:
        """Start with no processors registered."""
        self._processors: dict[str, ProcessorCallable] = {}

    def register(self, name: str, processor: ProcessorCallable) -> None:
        """Store ``processor`` under the slug of ``name``.

        A later registration under the same slug replaces the earlier one.

        Raises:
            ValueError: if ``name`` slugs to an empty string.
        """
        key = _slug(name)
        if key:
            self._processors[key] = processor
            return
        raise ValueError("Processor name cannot be empty")

    def names(self) -> list[str]:
        """Return the registered slugs in sorted order."""
        return sorted(self._processors)

    def run(self, request: ProcessorRequest) -> ProcessorResult:
        """Dispatch ``request`` to the processor named by its block.

        When no processor is registered under the slugged name, a result
        carrying a single ``processor.unknown`` error diagnostic is returned
        instead of raising.
        """
        block = request.block
        handler = self._processors.get(_slug(block.processor))
        if handler is not None:
            return handler(request)

        location = SourceLocation(path=block.source_path, line=block.line_start)
        unknown = Diagnostic(
            severity="error",
            code="processor.unknown",
            message=f"Unknown processor `{block.processor}`",
            source=location,
        )
        return ProcessorResult(diagnostics=[unknown])
|
||||
|
||||
|
||||
def default_processor_registry() -> ProcessorRegistry:
    """Return a new registry holding the built-in processors.

    The built-ins are ``identity``, ``uppercase`` and ``include``.
    """
    builtins = (
        ("identity", _identity_processor),
        ("uppercase", _uppercase_processor),
        ("include", _include_processor),
    )
    registry = ProcessorRegistry()
    for name, processor in builtins:
        registry.register(name, processor)
    return registry
|
||||
|
||||
|
||||
def discover_fenced_processors(
    markdown: str,
    *,
    source_path: str | Path | None = None,
) -> list[FencedProcessorBlock]:
    """Collect the fenced blocks of ``markdown`` that name a processor.

    Args:
        markdown: Markdown source text to scan.
        source_path: Optional origin of the text; stored on each block as a
            string.

    Returns:
        One ``FencedProcessorBlock`` per opted-in fence, in the order the
        parser yields them. Fences for which ``_processor_name`` finds no
        processor are skipped.
    """
    parser = MarkdownIt("commonmark", {"tables": True}).enable("table")
    origin = str(source_path) if source_path else None
    # These keys are surfaced as dedicated fields, so they are removed from
    # the block's generic ``attrs`` mapping.
    reserved = {"id", "language", "processor"}
    seen_ids: dict[str, int] = {}
    found: list[FencedProcessorBlock] = []

    for position, token in enumerate(parser.parse(markdown)):
        if token.type != "fence":
            continue
        info = _parse_fence_info(token.info)
        name = _processor_name(info)
        if not name:
            continue

        # Without an explicit id, fall back to processor name + token index.
        requested_id = info.get("id") or f"{name}-{position}"
        unit_id = _dedupe_id(_slug(requested_id), seen_ids)

        if token.map:
            # NOTE(review): presumably converts the parser's 0-based start
            # line to 1-based; confirm against markdown-it-py's Token.map.
            first_line = token.map[0] + 1
            last_line = token.map[1]
        else:
            first_line = None
            last_line = None

        extra_attrs = {key: value for key, value in info.items() if key not in reserved}
        found.append(
            FencedProcessorBlock(
                processor=name,
                content=token.content,
                unit_id=unit_id,
                attrs=extra_attrs,
                language=info.get("language"),
                source_path=origin,
                line_start=first_line,
                line_end=last_line,
                content_hash=_hash_text(token.content),
            )
        )
    return found
|
||||
|
||||
|
||||
def run_fenced_processors(
    markdown: str,
    *,
    context: ProcessorContext,
    registry: ProcessorRegistry | None = None,
    source_path: str | Path | None = None,
) -> ProcessorRun:
    """Discover the processor blocks of ``markdown`` and run each in order.

    Args:
        markdown: Markdown source text.
        context: Execution context shared by every invocation.
        registry: Registry to dispatch through; the default registry is
            created when this is omitted.
        source_path: Origin of the text; falls back to
            ``context.current_path`` when not given.

    Returns:
        A ``ProcessorRun`` holding the discovered blocks and one result per
        block.
    """
    dispatcher = registry or default_processor_registry()
    # An explicit source path wins over the context's current path.
    origin = source_path or context.current_path
    blocks = discover_fenced_processors(markdown, source_path=origin)

    results = []
    for block in blocks:
        request = ProcessorRequest(block=block, context=context)
        results.append(dispatcher.run(request))

    return ProcessorRun(
        source_path=str(origin) if origin else None,
        blocks=blocks,
        results=results,
    )
|
||||
|
||||
|
||||
def _identity_processor(request: ProcessorRequest) -> ProcessorResult:
    """Return the block's content unchanged, with one provenance event."""
    block = request.block
    event = OperationProvenance(
        operation="processor.identity",
        source_path=block.source_path,
        line_start=block.line_start,
        line_end=block.line_end,
        metadata={"unit_id": block.unit_id},
    )
    return ProcessorResult(content=block.content, provenance=[event])
|
||||
|
||||
|
||||
def _uppercase_processor(request: ProcessorRequest) -> ProcessorResult:
    """Return the block's content upper-cased, with one provenance event."""
    block = request.block
    event = OperationProvenance(
        operation="processor.uppercase",
        source_path=block.source_path,
        line_start=block.line_start,
        line_end=block.line_end,
        metadata={"unit_id": block.unit_id},
    )
    return ProcessorResult(content=block.content.upper(), provenance=[event])
|
||||
|
||||
|
||||
def _include_processor(request: ProcessorRequest) -> ProcessorResult:
    """Replace the block with the text of the units its ``ref`` resolves to.

    Returns an error result (no content) when the ``ref`` attribute is
    missing or empty, or when resolution raises ``ReferenceResolutionError``.
    On success the resolved units' texts are joined with blank lines and the
    target path is recorded as a dependency.
    """
    block = request.block

    def _failure(code: str, message: str) -> ProcessorResult:
        # Both failure paths report one error located at the block's start.
        return ProcessorResult(
            diagnostics=[
                Diagnostic(
                    severity="error",
                    code=code,
                    message=message,
                    source=SourceLocation(
                        path=block.source_path,
                        line=block.line_start,
                    ),
                )
            ]
        )

    reference = block.attrs.get("ref")
    if not reference:
        return _failure(
            "processor.include.missing_ref",
            "Include processor requires a `ref` attribute",
        )

    try:
        resolution = resolve_reference(reference, context=request.context.reference_context())
    except ReferenceResolutionError as exc:
        return _failure("processor.include.reference_error", str(exc))

    included_text = "\n\n".join(unit.text for unit in resolution.units)
    included_ids = [unit.unit_id for unit in resolution.units]
    event = OperationProvenance(
        operation="processor.include",
        source_path=block.source_path,
        line_start=block.line_start,
        line_end=block.line_end,
        target_path=resolution.target_path,
        dependencies=[resolution.target_path],
        metadata={"ref": reference, "unit_ids": included_ids},
    )
    return ProcessorResult(
        content=included_text,
        dependencies=[resolution.target_path],
        provenance=[event],
    )
|
||||
|
||||
|
||||
def _processor_name(attrs: dict[str, str]) -> str | None:
    """Work out which processor, if any, a fence's attributes select.

    Precedence: an explicit ``processor`` attribute; then a ``mkt-<name>``
    language; then language ``mkt`` combined with a ``type`` attribute.
    Returns ``None`` when none of these apply.
    """
    try:
        return attrs["processor"]
    except KeyError:
        pass

    language = attrs.get("language", "")
    if language == "mkt":
        # Yields None when no ``type`` attribute accompanies the bare ``mkt``.
        return attrs.get("type")
    prefix = "mkt-"
    if language.startswith(prefix):
        return language[len(prefix):]
    return None
|
||||
|
||||
|
||||
def _parse_fence_info(info: str) -> dict[str, str]:
    """Split a fence info string into a language and ``{...}`` attributes.

    Accepted shapes are ``lang``, ``lang {attrs}``, ``lang{attrs}`` and
    ``{attrs}``. The attribute text is parsed by ``_parse_attrs``; the
    leading language token, when present, is stored under ``"language"``
    and overrides any ``language`` key given inside the braces.

    Args:
        info: Raw info string of the fence.

    Returns:
        The parsed attributes. An info string that does not fit the accepted
        shapes is returned whole as ``{"language": info}``, and an empty one
        gives ``{}``.
    """
    text = info.strip()
    # ``\s*`` before the brace lets the attribute group match even when no
    # language precedes it (``{processor=x}``) or when it is glued to the
    # language (``mkt{type=x}``); requiring whitespace there made such info
    # strings fall through to the whole-string-as-language fallback.
    match = re.match(r"^(?P<language>[^\s{]+)?(?:\s*\{(?P<attrs>.*)\})?\s*$", text)
    if not match:
        return {"language": text} if text else {}
    attrs = _parse_attrs(match.group("attrs") or "")
    language = match.group("language")
    if language:
        attrs["language"] = language
    return attrs
|
||||
|
||||
|
||||
def _parse_attrs(raw: str) -> dict[str, str]:
    """Parse the text between a fence's braces into an attribute mapping.

    Tokens are split shell-style, so quoted values may contain spaces.
    ``#name`` sets ``id``; ``key=value`` sets ``key``; a bare word is a flag
    stored as ``"true"``.

    Args:
        raw: Attribute text without the surrounding braces.

    Returns:
        Mapping of attribute names to string values; later tokens overwrite
        earlier ones with the same key.
    """
    try:
        parts = shlex.split(raw)
    except ValueError:
        # shlex rejects unbalanced quotes. Fall back to plain whitespace
        # splitting so one malformed fence yields best-effort attributes
        # instead of aborting discovery for the whole document.
        parts = raw.split()

    attrs: dict[str, str] = {}
    for part in parts:
        if part.startswith("#") and len(part) > 1:
            attrs["id"] = part[1:]
            continue
        key, separator, value = part.partition("=")
        if not separator:
            attrs[part] = "true"
            continue
        attrs[key.strip()] = value.strip()
    return attrs
|
||||
|
||||
|
||||
def _dedupe_id(unit_id: str, used_ids: dict[str, int]) -> str:
    """Return an id that is unique among those handed out via ``used_ids``.

    The first request for an id returns it unchanged. Repeats get a numeric
    suffix (``name-2``, ``name-3``, ...), skipping any suffix that is already
    taken.

    Args:
        unit_id: Requested identifier.
        used_ids: Bookkeeping dict shared across calls; it is mutated to
            record both requested and generated ids.

    Returns:
        ``unit_id`` itself, or a suffixed variant not returned before.
    """
    count = used_ids.get(unit_id, 0) + 1
    used_ids[unit_id] = count
    if count == 1:
        return unit_id

    candidate = f"{unit_id}-{count}"
    while candidate in used_ids:
        # The suffixed form was already handed out (for example as an
        # explicit id), so advance until a free one is found.
        count += 1
        candidate = f"{unit_id}-{count}"
    used_ids[unit_id] = count
    # Reserve the generated id so a later explicit request for the same
    # string is de-duplicated instead of colliding with it.
    used_ids[candidate] = 1
    return candidate
|
||||
|
||||
|
||||
def _slug(value: str) -> str:
    """Normalize ``value`` into a lowercase, dash-separated token.

    Runs of characters outside ``a-z 0-9 _ . : -`` become a single dash, and
    leading and trailing dashes are removed. The result may be empty.
    """
    lowered = value.strip().lower()
    dashed = re.sub(r"[^a-z0-9_.:-]+", "-", lowered)
    collapsed = re.sub(r"-+", "-", dashed)
    return collapsed.strip("-")
|
||||
|
||||
|
||||
def _hash_text(text: str) -> str:
    """Return the SHA-256 hex digest of ``text`` (UTF-8), prefixed ``sha256:``."""
    hasher = hashlib.sha256()
    hasher.update(text.encode("utf-8"))
    return f"sha256:{hasher.hexdigest()}"
|
||||
Reference in New Issue
Block a user