extension for ref resolve, explode, implode, weave, tangle

This commit is contained in:
2026-05-04 02:25:49 +02:00
parent 8203f50fd5
commit 65bfc1aebf
39 changed files with 3959 additions and 25 deletions

View File

@@ -32,7 +32,26 @@ from markitect_tool.cache import (
save_cache,
scan_markdown_files,
)
from markitect_tool.content_class import (
ClassCompositionResult,
ContentClass,
ContentClassRegistry,
ContentClassResolutionError,
load_content_class_file,
load_content_classes,
)
from markitect_tool.diagnostics import Diagnostic, SourceLocation
from markitect_tool.explode import (
EXPLODE_MANIFEST_NAME,
ExplodeEntry,
ExplodeError,
ExplodeManifest,
ExplodeResult,
ImplodeResult,
explode_markdown_file,
implode_markdown_directory,
load_explode_manifest,
)
from markitect_tool.generation import (
GeneratedDocument,
GenerationHookRequest,
@@ -44,21 +63,55 @@ from markitect_tool.generation import (
load_generation_plan_file,
run_generation_plan,
)
from markitect_tool.literate import (
CodeChunk,
LiterateFile,
TangleResult,
WeaveResult,
discover_code_chunks,
tangle_markdown,
weave_markdown,
write_tangle_files,
)
from markitect_tool.ops import (
ComposeResult,
IncludeError,
IncludeResult,
OperationProvenance,
TransformResult,
compose_files,
resolve_includes,
transform_markdown,
)
from markitect_tool.processor import (
FencedProcessorBlock,
ProcessorContext,
ProcessorOutputFile,
ProcessorRegistry,
ProcessorRequest,
ProcessorResult,
ProcessorRun,
default_processor_registry,
discover_fenced_processors,
run_fenced_processors,
)
from markitect_tool.query import (
InvalidQueryError,
QueryMatch,
extract_document,
query_document,
)
from markitect_tool.reference import (
ContentUnit,
ReferenceAddress,
ReferenceContext,
ReferenceResolution,
ReferenceResolutionError,
SourceSpan as ReferenceSourceSpan,
load_namespaces,
parse_reference,
resolve_reference,
)
from markitect_tool.schema import (
MarkdownSchema,
SchemaValidationResult,
@@ -109,8 +162,23 @@ __all__ = [
"load_cache",
"save_cache",
"scan_markdown_files",
"ClassCompositionResult",
"ContentClass",
"ContentClassRegistry",
"ContentClassResolutionError",
"load_content_class_file",
"load_content_classes",
"Diagnostic",
"SourceLocation",
"EXPLODE_MANIFEST_NAME",
"ExplodeEntry",
"ExplodeError",
"ExplodeManifest",
"ExplodeResult",
"ImplodeResult",
"explode_markdown_file",
"implode_markdown_directory",
"load_explode_manifest",
"GeneratedDocument",
"GenerationHookRequest",
"GenerationHookResult",
@@ -120,17 +188,45 @@ __all__ = [
"generate_with_hook",
"load_generation_plan_file",
"run_generation_plan",
"CodeChunk",
"LiterateFile",
"TangleResult",
"WeaveResult",
"discover_code_chunks",
"tangle_markdown",
"weave_markdown",
"write_tangle_files",
"ComposeResult",
"IncludeError",
"IncludeResult",
"OperationProvenance",
"TransformResult",
"compose_files",
"resolve_includes",
"transform_markdown",
"FencedProcessorBlock",
"ProcessorContext",
"ProcessorOutputFile",
"ProcessorRegistry",
"ProcessorRequest",
"ProcessorResult",
"ProcessorRun",
"default_processor_registry",
"discover_fenced_processors",
"run_fenced_processors",
"InvalidQueryError",
"QueryMatch",
"extract_document",
"query_document",
"ContentUnit",
"ReferenceAddress",
"ReferenceContext",
"ReferenceResolution",
"ReferenceResolutionError",
"ReferenceSourceSpan",
"load_namespaces",
"parse_reference",
"resolve_reference",
"MissingTemplateVariable",
"TemplateAnalysis",
"TemplateError",

View File

@@ -16,6 +16,10 @@ from markitect_tool.cache import (
load_cache,
save_cache,
)
from markitect_tool.content_class import (
ContentClassResolutionError,
load_content_class_file,
)
from markitect_tool.core import parse_markdown_file
from markitect_tool.contract import (
ContractLoaderError,
@@ -24,6 +28,11 @@ from markitect_tool.contract import (
load_contract_file,
validate_contract,
)
from markitect_tool.explode import (
ExplodeError,
explode_markdown_file,
implode_markdown_directory,
)
from markitect_tool.generation import (
GenerationPlanError,
generate_stub_from_contract,
@@ -31,8 +40,16 @@ from markitect_tool.generation import (
load_generation_plan_file,
run_generation_plan,
)
from markitect_tool.literate import tangle_markdown, weave_markdown, write_tangle_files
from markitect_tool.ops import IncludeError, compose_files, resolve_includes, transform_markdown
from markitect_tool.processor import ProcessorContext, run_fenced_processors
from markitect_tool.query import InvalidQueryError, extract_document, query_document
from markitect_tool.reference import (
ReferenceContext,
ReferenceResolutionError,
load_namespaces,
resolve_reference,
)
from markitect_tool.schema import load_schema_file, validate_markdown_file, validate_schema
from markitect_tool.template import (
MissingTemplateVariable,
@@ -296,6 +313,224 @@ def include(
_emit_markdown_result(result.to_dict(), output_format, output)
@main.command()
@click.argument("file", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.option(
    "--output-dir",
    required=True,
    type=click.Path(file_okay=False, path_type=Path),
    help="Directory to write exploded Markdown files and manifest into.",
)
@click.option(
    "--variant",
    type=click.Choice(["flat", "hierarchical"], case_sensitive=False),
    default="flat",
    show_default=True,
)
@click.option("--force", is_flag=True, help="Allow writing into a non-empty output directory.")
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["json", "yaml", "text"], case_sensitive=False),
    default="text",
    show_default=True,
)
def explode(
    file: Path,
    output_dir: Path,
    variant: str,
    force: bool,
    output_format: str,
) -> None:
    """Explode a Markdown file into reversible section files."""
    # `--force` maps onto the engine's `overwrite` flag; engine failures are
    # surfaced as regular CLI errors instead of tracebacks.
    try:
        exploded = explode_markdown_file(file, output_dir, variant=variant, overwrite=force)
    except ExplodeError as exc:
        raise click.ClickException(str(exc)) from exc
    else:
        _emit_explode_result(exploded.to_dict(), output_format)
@main.command()
@click.argument("directory", type=click.Path(exists=True, file_okay=False, path_type=Path))
@click.option(
    "--manifest",
    "manifest_path",
    type=click.Path(exists=True, dir_okay=False, path_type=Path),
    help="Manifest path. Defaults to markitect-explode.yaml in the input directory.",
)
@click.option(
    "--output",
    type=click.Path(dir_okay=False, path_type=Path),
    help="Write imploded Markdown to a file.",
)
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["markdown", "json", "yaml"], case_sensitive=False),
    default="markdown",
    show_default=True,
)
def implode(
    directory: Path,
    manifest_path: Path | None,
    output: Path | None,
    output_format: str,
) -> None:
    """Implode a Markdown directory created by `mkt explode`."""
    # Engine failures become regular CLI errors; the rebuilt document goes
    # through the shared Markdown emitter together with the chosen format
    # and the optional output path.
    try:
        imploded = implode_markdown_directory(directory, manifest_path=manifest_path)
    except ExplodeError as exc:
        raise click.ClickException(str(exc)) from exc
    else:
        _emit_markdown_result(imploded.to_dict(), output_format, output)
@main.group("ref")
def ref_group() -> None:
    """Resolve namespaced Markdown content references."""
    # Container group only: subcommands (e.g. `resolve`) attach themselves
    # through `@ref_group.command(...)`.
@ref_group.command("resolve")
@click.argument("context_file", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.argument("reference")
@click.option(
    "--root",
    type=click.Path(exists=True, file_okay=False, path_type=Path),
    default=Path("."),
    show_default=True,
    help="Root that relative paths and namespaces must stay within.",
)
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["json", "yaml", "text"], case_sensitive=False),
    default="text",
    show_default=True,
)
def ref_resolve(context_file: Path, reference: str, root: Path, output_format: str) -> None:
    """Resolve a content reference using a Markdown document as context."""
    # The context document supplies the resolution context; the context file
    # itself is passed as the current path.
    context = ReferenceContext.from_document(
        parse_markdown_file(context_file),
        root=root,
        current_path=context_file,
    )
    try:
        resolved = resolve_reference(reference, context=context)
    except ReferenceResolutionError as exc:
        raise click.ClickException(str(exc)) from exc
    else:
        _emit_reference_result(resolved.to_dict(), output_format)
@main.command("process")
@click.argument("file", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.option(
    "--root",
    type=click.Path(exists=True, file_okay=False, path_type=Path),
    default=Path("."),
    show_default=True,
    help="Root used for relative processor references.",
)
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["json", "yaml", "text"], case_sensitive=False),
    default="text",
    show_default=True,
)
def process(file: Path, root: Path, output_format: str) -> None:
    """Run deterministic fenced-block processors in a Markdown file."""
    # Namespaces come from the parsed document's frontmatter.
    parsed = parse_markdown_file(file)
    namespaces = load_namespaces(parsed.frontmatter)
    processor_context = ProcessorContext(root=root, current_path=file, namespaces=namespaces)

    source_text = file.read_text(encoding="utf-8")
    run = run_fenced_processors(source_text, context=processor_context, source_path=file)
    _emit_processor_run(run.to_dict(), output_format)

    # The exit status mirrors validity: 0 when valid, 1 otherwise.
    exit_code = 0 if run.valid else 1
    raise click.exceptions.Exit(exit_code)
@main.group("class")
def class_group() -> None:
    """Resolve deterministic content classes."""
    # Container group only: subcommands (e.g. `resolve`) attach themselves
    # through `@class_group.command(...)`.
@class_group.command("resolve")
@click.argument("class_file", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.argument("class_name")
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["json", "yaml", "text"], case_sensitive=False),
    default="text",
    show_default=True,
)
def class_resolve(class_file: Path, class_name: str, output_format: str) -> None:
    """Resolve content class inheritance and merged slots."""
    # Loading and composing share one handler: either step may raise a
    # ContentClassResolutionError, which is reported as a CLI error.
    try:
        composition = load_content_class_file(class_file).compose(class_name)
    except ContentClassResolutionError as exc:
        raise click.ClickException(str(exc)) from exc

    _emit_content_class_result(composition.to_dict(), output_format)
    # The exit status mirrors validity: 0 when valid, 1 otherwise.
    exit_code = 0 if composition.valid else 1
    raise click.exceptions.Exit(exit_code)
@main.command()
@click.argument("file", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.option(
    "--output-dir",
    type=click.Path(file_okay=False, path_type=Path),
    help="Write tangled files under this directory. Omit for dry JSON/YAML/text output.",
)
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["json", "yaml", "text"], case_sensitive=False),
    default="text",
    show_default=True,
)
def tangle(file: Path, output_dir: Path | None, output_format: str) -> None:
    """Tangle named Markdown code chunks into target files."""
    source_text = file.read_text(encoding="utf-8")
    tangled = tangle_markdown(source_text, source_path=file)
    payload = tangled.to_dict()

    # Files are only written when a directory was requested AND tangling
    # produced no error diagnostics; otherwise this is a dry run.
    should_write = bool(output_dir) and tangled.valid
    if should_write:
        payload["written_files"] = write_tangle_files(tangled, output_dir)

    _emit_tangle_result(payload, output_format)
    exit_code = 0 if tangled.valid else 1
    raise click.exceptions.Exit(exit_code)
@main.command()
@click.argument("file", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.option(
    "--output",
    type=click.Path(dir_okay=False, path_type=Path),
    help="Write woven Markdown to a file.",
)
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["markdown", "json", "yaml"], case_sensitive=False),
    default="markdown",
    show_default=True,
)
def weave(file: Path, output: Path | None, output_format: str) -> None:
    """Weave Markdown documentation with a deterministic chunk index."""
    # Read the source once, weave it, and hand the serialised result to the
    # shared Markdown emitter along with the format and optional output path.
    source_text = file.read_text(encoding="utf-8")
    woven = weave_markdown(source_text, source_path=file)
    _emit_markdown_result(woven.to_dict(), output_format, output)
@main.group()
def cache() -> None:
    """Fingerprint Markdown files and detect changed inputs."""
    # Container group only. NOTE(review): its subcommands are registered
    # outside this excerpt - presumably via `@cache.command(...)`; confirm.
@@ -788,6 +1023,83 @@ def _emit_cache_data(data: dict, output_format: str) -> None:
click.echo(f"written: {data['written']}")
def _emit_reference_result(data: dict, output_format: str) -> None:
    """Print a serialised reference resolution as JSON, YAML, or plain text.

    The text form prints the unit count, the target path, and one line per
    unit (kind, id, source path and optional start line), followed by the
    unit name when present.
    """
    if output_format == "json":
        click.echo(json.dumps(data, indent=2, ensure_ascii=False))
        return
    if output_format == "yaml":
        click.echo(yaml.safe_dump(data, sort_keys=False))
        return

    click.echo(f"{data['count']} unit(s)")
    click.echo(f"target: {data['target_path']}")
    for unit in data["units"]:
        span = unit.get("span", {})
        # Only append ":<line>" when the span carries a truthy start line.
        if span.get("line_start"):
            line = f":{span['line_start']}"
        else:
            line = ""
        click.echo(f"- {unit['kind']} {unit['unit_id']} {unit['source_path']}{line}")
        if unit.get("name"):
            click.echo(f" {unit['name']}")
def _emit_explode_result(data: dict, output_format: str) -> None:
    """Print a serialised explode result as JSON, YAML, or plain text.

    The text form prints the manifest path, the variant, the entry count,
    and one ``- <kind> <file>`` line per manifest entry.
    """
    if output_format == "json":
        click.echo(json.dumps(data, indent=2, ensure_ascii=False))
        return
    if output_format == "yaml":
        click.echo(yaml.safe_dump(data, sort_keys=False))
        return

    manifest = data["manifest"]
    entries = manifest["entries"]
    click.echo(f"manifest: {data['manifest_path']}")
    click.echo(f"variant: {manifest['variant']}")
    click.echo(f"entries: {len(entries)}")
    for entry in entries:
        click.echo(f"- {entry['kind']} {entry['file']}")
def _emit_processor_run(data: dict, output_format: str) -> None:
    """Print a serialised processor run as JSON, YAML, or plain text.

    The text form prints overall validity, the processor count, and for
    each block/result pair the processor name, unit id, optional start
    line, the first line of any content, and every diagnostic.
    """
    if output_format == "json":
        click.echo(json.dumps(data, indent=2, ensure_ascii=False))
        return
    if output_format == "yaml":
        click.echo(yaml.safe_dump(data, sort_keys=False))
        return

    click.echo("valid" if data["valid"] else "invalid")
    click.echo(f"processors: {data['count']}")
    # Blocks and results are paired positionally; a length mismatch is
    # tolerated (strict=False) and simply truncates to the shorter list.
    for block, result in zip(data["blocks"], data["results"], strict=False):
        line = f":{block['line_start']}" if block.get("line_start") else ""
        click.echo(f"- {block['processor']} {block['unit_id']}{line}")
        content = result.get("content")
        if content:
            click.echo(f" content: {content.splitlines()[0]}")
        for diagnostic in result.get("diagnostics", []):
            click.echo(f" [{diagnostic['severity']}] {diagnostic['code']}: {diagnostic['message']}")
def _emit_content_class_result(data: dict, output_format: str) -> None:
    """Print a serialised class composition as JSON, YAML, or plain text.

    The text form prints validity, the linearization joined with ``->``,
    each merged slot, and each diagnostic prefixed with ``!``.
    """
    if output_format == "json":
        click.echo(json.dumps(data, indent=2, ensure_ascii=False))
        return
    if output_format == "yaml":
        click.echo(yaml.safe_dump(data, sort_keys=False))
        return

    status = "valid" if data["valid"] else "invalid"
    click.echo(status)
    click.echo("linearization: " + " -> ".join(data["linearization"]))
    for slot, value in data.get("slots", {}).items():
        click.echo(f"- {slot}: {value}")
    for diagnostic in data.get("diagnostics", []):
        click.echo(f"! [{diagnostic['severity']}] {diagnostic['code']}: {diagnostic['message']}")
def _emit_tangle_result(data: dict, output_format: str) -> None:
    """Print a serialised tangle result as JSON, YAML, or plain text.

    The text form prints validity, the file count, each file with its
    contributing chunk ids, each diagnostic prefixed with ``!``, and any
    paths listed under ``written_files``.
    """
    if output_format == "json":
        click.echo(json.dumps(data, indent=2, ensure_ascii=False))
        return
    if output_format == "yaml":
        click.echo(yaml.safe_dump(data, sort_keys=False))
        return

    files = data["files"]
    click.echo("valid" if data["valid"] else "invalid")
    click.echo(f"files: {len(files)}")
    for file in files:
        click.echo(f"- {file['path']}: {', '.join(file['chunk_ids'])}")
    for diagnostic in data.get("diagnostics", []):
        click.echo(f"! [{diagnostic['severity']}] {diagnostic['code']}: {diagnostic['message']}")
    for written in data.get("written_files", []):
        click.echo(f"written: {written}")
def _emit_jsonish(data: dict, output_format: str) -> None:
if output_format == "yaml":
click.echo(yaml.safe_dump(data, sort_keys=False))

View File

@@ -0,0 +1,19 @@
"""Deterministic content class composition."""
from markitect_tool.content_class.engine import (
ClassCompositionResult,
ContentClass,
ContentClassRegistry,
ContentClassResolutionError,
load_content_class_file,
load_content_classes,
)
__all__ = [
"ClassCompositionResult",
"ContentClass",
"ContentClassRegistry",
"ContentClassResolutionError",
"load_content_class_file",
"load_content_classes",
]

View File

@@ -0,0 +1,225 @@
"""Small deterministic content class resolver."""
from __future__ import annotations
from copy import deepcopy
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any
import yaml
from markitect_tool.diagnostics import Diagnostic
class ContentClassResolutionError(ValueError):
    """Signal that content class definitions could not be loaded or resolved.

    Raised by the loaders for malformed definitions and by the registry for
    unknown classes, cyclic inheritance, inconsistent precedence, and slot
    merge failures.
    """
@dataclass(frozen=True)
class ContentClass:
    """One content class as declared in data.

    ``extends`` lists parent class names, ``slots`` holds the class's own
    slot values, and ``merge_policies`` maps a slot name to the policy used
    when this class's value meets an inherited one.
    """

    name: str
    extends: list[str] = field(default_factory=list)
    slots: dict[str, Any] = field(default_factory=dict)
    merge_policies: dict[str, str] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Serialise to a plain dict, leaving out empty dicts, empty lists and ``None``."""
        serialised: dict[str, Any] = {}
        for key, value in asdict(self).items():
            if value in ({}, [], None):
                continue
            serialised[key] = value
        return serialised
@dataclass(frozen=True)
class ClassCompositionResult:
    """Outcome of composing one content class: merged slots and diagnostics."""

    class_name: str
    linearization: list[str]
    slots: dict[str, Any]
    diagnostics: list[Diagnostic] = field(default_factory=list)

    @property
    def valid(self) -> bool:
        """Return ``True`` when no diagnostic has severity ``"error"``."""
        return all(diagnostic.severity != "error" for diagnostic in self.diagnostics)

    def to_dict(self) -> dict[str, Any]:
        """Serialise the result, including the derived ``valid`` flag."""
        serialised_diagnostics = [diagnostic.to_dict() for diagnostic in self.diagnostics]
        return dict(
            valid=self.valid,
            class_name=self.class_name,
            linearization=self.linearization,
            slots=self.slots,
            diagnostics=serialised_diagnostics,
        )
class ContentClassRegistry:
    """Holds content classes by name and resolves their inheritance."""

    def __init__(self, classes: dict[str, ContentClass] | None = None) -> None:
        self.classes = classes or {}

    def add(self, content_class: ContentClass) -> None:
        """Register ``content_class`` under its name, replacing any previous entry."""
        self.classes[content_class.name] = content_class

    def linearize(self, class_name: str) -> list[str]:
        """Return the resolution order for ``class_name``, most specific first.

        Raises:
            ContentClassResolutionError: if the class is unknown, extends an
                unknown class, inherits cyclically, or has an inconsistent
                precedence order.
        """
        if class_name in self.classes:
            return self._linearize(class_name, [])
        raise ContentClassResolutionError(f"Unknown content class `{class_name}`")

    def compose(self, class_name: str) -> ClassCompositionResult:
        """Merge the slots of ``class_name`` and its ancestors.

        Resolution failures and merge conflicts are reported as error
        diagnostics on the result rather than raised.
        """
        try:
            linearization = self.linearize(class_name)
        except ContentClassResolutionError as exc:
            resolution_failure = Diagnostic(
                severity="error",
                code="content_class.resolution_error",
                message=str(exc),
            )
            return ClassCompositionResult(
                class_name=class_name,
                linearization=[],
                slots={},
                diagnostics=[resolution_failure],
            )

        diagnostics: list[Diagnostic] = []
        slots: dict[str, Any] = {}
        # Walk from the least specific ancestor to the class itself so more
        # specific classes are merged on top of inherited values.
        for name in linearization[::-1]:
            content_class = self.classes[name]
            for slot, value in content_class.slots.items():
                # The policy is taken from the class contributing the value.
                policy = content_class.merge_policies.get(slot, "replace")
                try:
                    merged_value = _merge_slot(slots.get(slot), value, policy)
                except ContentClassResolutionError as exc:
                    conflict = Diagnostic(
                        severity="error",
                        code="content_class.merge_conflict",
                        message=str(exc),
                        details={"class": name, "slot": slot, "policy": policy},
                    )
                    diagnostics.append(conflict)
                else:
                    slots[slot] = merged_value

        return ClassCompositionResult(
            class_name=class_name,
            linearization=linearization,
            slots=slots,
            diagnostics=diagnostics,
        )

    def _linearize(self, class_name: str, stack: list[str]) -> list[str]:
        """Recursively linearize ``class_name``; ``stack`` is the path walked so far."""
        path = stack + [class_name]
        if class_name in stack:
            raise ContentClassResolutionError(
                "Cyclic content class inheritance: " + " -> ".join(path)
            )

        content_class = self.classes[class_name]
        # Known parents are linearized first; unknown ones are reported
        # afterwards, once the recursion has had a chance to fail.
        parent_mros: list[list[str]] = []
        for parent in content_class.extends:
            if _known_parent(parent, self.classes):
                parent_mros.append(self._linearize(parent, path))

        missing = [parent for parent in content_class.extends if parent not in self.classes]
        if missing:
            raise ContentClassResolutionError(
                f"Content class `{class_name}` extends unknown class(es): {', '.join(missing)}"
            )

        merge_inputs = parent_mros + [list(content_class.extends)]
        return [class_name] + _c3_merge(merge_inputs)
def load_content_class_file(path: str | Path) -> ContentClassRegistry:
    """Load content class definitions from a YAML file.

    Args:
        path: Location of the YAML file; it is read as UTF-8.

    Returns:
        A registry built from the file's top-level mapping.

    Raises:
        ContentClassResolutionError: if the file is not valid YAML, its top
            level is not a mapping, or the definitions are malformed.
        OSError: if the file cannot be read.
    """
    text = Path(path).read_text(encoding="utf-8")
    try:
        data = yaml.safe_load(text)
    except yaml.YAMLError as exc:
        # Syntax errors are a "cannot be loaded" condition; report them with
        # the module's own exception so callers need only one handler.
        raise ContentClassResolutionError(
            f"Content class file is not valid YAML: {exc}"
        ) from exc
    if not isinstance(data, dict):
        raise ContentClassResolutionError("Content class file must be a mapping")
    return load_content_classes(data)
def load_content_classes(data: dict[str, Any]) -> ContentClassRegistry:
    """Build a registry from a mapping of class definitions.

    The definitions are taken from ``data["classes"]`` when that key exists,
    otherwise from ``data`` itself. A string ``extends`` is treated as a
    one-element list.

    Raises:
        ContentClassResolutionError: if any part of the structure has the
            wrong type.
    """
    raw_classes = data.get("classes", data)
    if not isinstance(raw_classes, dict):
        raise ContentClassResolutionError("Content classes must be a mapping")

    registry_entries: dict[str, ContentClass] = {}
    for name, raw_class in raw_classes.items():
        if not isinstance(raw_class, dict):
            raise ContentClassResolutionError(f"Content class `{name}` must be a mapping")

        declared_parents = raw_class.get("extends", [])
        parents = [declared_parents] if isinstance(declared_parents, str) else declared_parents
        if not isinstance(parents, list):
            raise ContentClassResolutionError(f"Content class `{name}` extends must be a list")

        slots = raw_class.get("slots", {})
        policies = raw_class.get("merge_policies", {})
        if not (isinstance(slots, dict) and isinstance(policies, dict)):
            raise ContentClassResolutionError(
                f"Content class `{name}` slots and merge_policies must be mappings"
            )

        class_name = str(name)
        registry_entries[class_name] = ContentClass(
            name=class_name,
            extends=[str(parent) for parent in parents],
            slots=slots,
            merge_policies={str(key): str(value) for key, value in policies.items()},
        )
    return ContentClassRegistry(registry_entries)
def _c3_merge(sequences: list[list[str]]) -> list[str]:
result: list[str] = []
sequences = [list(sequence) for sequence in sequences if sequence]
while sequences:
candidate = None
for sequence in sequences:
head = sequence[0]
if not any(head in other[1:] for other in sequences):
candidate = head
break
if candidate is None:
raise ContentClassResolutionError("Inconsistent content class precedence order")
result.append(candidate)
sequences = [
[item for item in sequence if item != candidate]
for sequence in sequences
]
sequences = [sequence for sequence in sequences if sequence]
return result
def _merge_slot(existing: Any, value: Any, policy: str) -> Any:
incoming = deepcopy(value)
if existing is None:
return incoming
if policy == "replace":
return incoming
if policy == "append":
return _as_list(existing) + _as_list(incoming)
if policy == "prepend":
return _as_list(incoming) + _as_list(existing)
if policy == "deep_merge":
if not isinstance(existing, dict) or not isinstance(incoming, dict):
raise ContentClassResolutionError("deep_merge requires mapping values")
return _deep_merge(existing, incoming)
if policy == "error_on_conflict":
if existing != incoming:
raise ContentClassResolutionError("slot conflict")
return existing
raise ContentClassResolutionError(f"Unknown merge policy `{policy}`")
def _deep_merge(left: dict[str, Any], right: dict[str, Any]) -> dict[str, Any]:
merged = deepcopy(left)
for key, value in right.items():
if isinstance(merged.get(key), dict) and isinstance(value, dict):
merged[key] = _deep_merge(merged[key], value)
else:
merged[key] = deepcopy(value)
return merged
def _as_list(value: Any) -> list[Any]:
return value if isinstance(value, list) else [value]
def _known_parent(parent: str, classes: dict[str, ContentClass]) -> bool:
    """Return whether ``parent`` is a key of the ``classes`` mapping."""
    return parent in classes

View File

@@ -0,0 +1,25 @@
"""Reversible explode/implode operations for Markdown documents."""
from markitect_tool.explode.engine import (
EXPLODE_MANIFEST_NAME,
ExplodeEntry,
ExplodeError,
ExplodeManifest,
ExplodeResult,
ImplodeResult,
explode_markdown_file,
implode_markdown_directory,
load_explode_manifest,
)
__all__ = [
"EXPLODE_MANIFEST_NAME",
"ExplodeEntry",
"ExplodeError",
"ExplodeManifest",
"ExplodeResult",
"ImplodeResult",
"explode_markdown_file",
"implode_markdown_directory",
"load_explode_manifest",
]

View File

@@ -0,0 +1,324 @@
"""Manifest-first reversible explode/implode for Markdown files."""
from __future__ import annotations
import hashlib
import re
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any
import yaml
from markitect_tool.core import Heading, parse_markdown
# File name of the manifest written into the explode output directory and
# looked up there by default when imploding.
EXPLODE_MANIFEST_NAME = "markitect-explode.yaml"
class ExplodeError(ValueError):
    """Signal that an explode or implode step cannot keep the roundtrip safe.

    Used for invalid variants, non-empty output directories, malformed
    manifests, missing entry files, and entry paths that are absolute or
    escape the target directory.
    """
@dataclass(frozen=True)
class ExplodeEntry:
"""One file entry in an exploded Markdown directory."""
kind: str
file: str
order: int
unit_id: str
line_start: int
line_end: int
heading_level: int | None = None
heading_text: str | None = None
content_hash: str = ""
def to_dict(self) -> dict[str, Any]:
return {key: value for key, value in asdict(self).items() if value is not None}
@dataclass(frozen=True)
class ExplodeManifest:
    """Everything needed to rebuild a document from its exploded files."""

    version: int
    source_path: str
    source_hash: str
    variant: str
    frontmatter_raw: str = ""
    entries: list[ExplodeEntry] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Serialise the manifest, with each entry converted via its ``to_dict``."""
        serialised_entries = [entry.to_dict() for entry in self.entries]
        return dict(
            version=self.version,
            source_path=self.source_path,
            source_hash=self.source_hash,
            variant=self.variant,
            frontmatter_raw=self.frontmatter_raw,
            entries=serialised_entries,
        )
@dataclass(frozen=True)
class ExplodeResult:
    """What an explode run produced: manifest, output directory, written paths."""

    manifest_path: str
    output_dir: str
    manifest: ExplodeManifest
    written_files: list[str]

    def to_dict(self) -> dict[str, Any]:
        """Serialise the result, with the manifest converted via its ``to_dict``."""
        return dict(
            manifest_path=self.manifest_path,
            output_dir=self.output_dir,
            manifest=self.manifest.to_dict(),
            written_files=self.written_files,
        )
@dataclass(frozen=True)
class ImplodeResult:
    """Rebuilt Markdown plus the hashes recorded in and computed from it.

    ``source_hash`` is the value stored in the manifest; ``current_hash`` is
    the hash of the rebuilt ``markdown``.
    """

    markdown: str
    manifest_path: str
    source_hash: str
    current_hash: str
    entries: list[str]

    def to_dict(self) -> dict[str, Any]:
        """Serialise every field into a plain dict."""
        serialised = asdict(self)
        return serialised
def explode_markdown_file(
    path: str | Path,
    output_dir: str | Path,
    *,
    variant: str = "flat",
    overwrite: bool = False,
) -> ExplodeResult:
    """Split a Markdown file into per-section files and write a manifest.

    Args:
        path: Markdown file to explode (read as UTF-8).
        output_dir: Directory receiving the section files and the manifest;
            created if missing.
        variant: ``"flat"`` or ``"hierarchical"`` file layout.
        overwrite: Allow writing into a directory that already has content.

    Returns:
        The manifest, its path, the output directory, and every written
        path (section files first, the manifest last).

    Raises:
        ExplodeError: for an unknown variant, or a non-empty output
            directory when ``overwrite`` is false.
    """
    if variant not in {"flat", "hierarchical"}:
        raise ExplodeError("Explode variant must be `flat` or `hierarchical`")

    source_path = Path(path)
    target_dir = Path(output_dir)
    markdown = source_path.read_text(encoding="utf-8")

    # Refuse to mix new output with existing files unless explicitly allowed.
    directory_has_content = target_dir.exists() and any(target_dir.iterdir())
    if directory_has_content and not overwrite:
        raise ExplodeError(f"Output directory is not empty: {target_dir}")
    target_dir.mkdir(parents=True, exist_ok=True)

    frontmatter_raw, body_start_line = _split_frontmatter_raw(markdown)
    written_files: list[str] = []
    entries: list[ExplodeEntry] = []
    for entry, text in _explode_entries(markdown, body_start_line, variant):
        destination = _safe_entry_path(target_dir, entry.file)
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(text, encoding="utf-8")
        written_files.append(str(destination))
        entries.append(entry)

    manifest = ExplodeManifest(
        version=1,
        source_path=str(source_path),
        source_hash=_hash_text(markdown),
        variant=variant,
        frontmatter_raw=frontmatter_raw,
        entries=entries,
    )
    manifest_path = target_dir / EXPLODE_MANIFEST_NAME
    manifest_yaml = yaml.safe_dump(manifest.to_dict(), sort_keys=False)
    manifest_path.write_text(manifest_yaml, encoding="utf-8")

    return ExplodeResult(
        manifest_path=str(manifest_path),
        output_dir=str(target_dir),
        manifest=manifest,
        written_files=[*written_files, str(manifest_path)],
    )
def implode_markdown_directory(
    directory: str | Path,
    *,
    manifest_path: str | Path | None = None,
) -> ImplodeResult:
    """Rebuild Markdown from a directory written by :func:`explode_markdown_file`.

    The raw frontmatter from the manifest is followed by the content of each
    entry file, in manifest order. The hash of the rebuilt text is returned
    next to the hash recorded in the manifest; the two are not compared here.

    Args:
        directory: Directory containing the exploded files.
        manifest_path: Manifest to use; defaults to the standard manifest
            name inside ``directory``.

    Raises:
        ExplodeError: if the manifest is malformed, an entry path is unsafe,
            or an entry file is missing.
    """
    root = Path(directory)
    if manifest_path:
        manifest_file = Path(manifest_path)
    else:
        manifest_file = root / EXPLODE_MANIFEST_NAME
    manifest = load_explode_manifest(manifest_file)

    pieces = [manifest.frontmatter_raw]
    entry_files: list[str] = []
    for entry in manifest.entries:
        entry_path = _safe_entry_path(root, entry.file)
        # is_file() is already False for a path that does not exist.
        if not entry_path.is_file():
            raise ExplodeError(f"Exploded entry file not found: {entry.file}")
        pieces.append(entry_path.read_text(encoding="utf-8"))
        entry_files.append(str(entry_path))

    markdown = "".join(pieces)
    return ImplodeResult(
        markdown=markdown,
        manifest_path=str(manifest_file),
        source_hash=manifest.source_hash,
        current_hash=_hash_text(markdown),
        entries=entry_files,
    )
def load_explode_manifest(path: str | Path) -> ExplodeManifest:
    """Load an explode manifest from a YAML file.

    Args:
        path: Location of the manifest; it is read as UTF-8.

    Returns:
        The parsed manifest. Missing top-level fields fall back to defaults
        (version ``1``, variant ``"flat"``, empty strings, no entries).

    Raises:
        ExplodeError: if the manifest file does not exist, is not valid
            YAML, is not a mapping, has a non-list ``entries`` value, has a
            non-integer ``version``, or contains a malformed entry.
    """
    manifest_path = Path(path)
    # Report a missing manifest the same way a missing entry file is
    # reported, so callers only need to handle ExplodeError.
    if not manifest_path.is_file():
        raise ExplodeError(f"Explode manifest not found: {manifest_path}")

    text = manifest_path.read_text(encoding="utf-8")
    try:
        data = yaml.safe_load(text)
    except yaml.YAMLError as exc:
        raise ExplodeError(f"Explode manifest is not valid YAML: {exc}") from exc
    if not isinstance(data, dict):
        raise ExplodeError("Explode manifest must be a mapping")

    entries = data.get("entries", [])
    if not isinstance(entries, list):
        raise ExplodeError("Explode manifest entries must be a list")

    try:
        version = int(data.get("version", 1))
    except (TypeError, ValueError) as exc:
        raise ExplodeError("Explode manifest version must be an integer") from exc

    return ExplodeManifest(
        version=version,
        source_path=str(data.get("source_path", "")),
        source_hash=str(data.get("source_hash", "")),
        variant=str(data.get("variant", "flat")),
        frontmatter_raw=str(data.get("frontmatter_raw", "")),
        entries=[_entry_from_mapping(entry) for entry in entries],
    )
def _explode_entries(
    markdown: str,
    body_start_line: int,
    variant: str,
) -> list[tuple[ExplodeEntry, str]]:
    """Cut the document body into a preamble and one chunk per heading.

    Each returned pair holds the manifest entry and the exact text for that
    entry. A section runs from its heading line up to the line before the
    next heading (or the end of the document). A preamble entry is emitted
    when there is text before the first heading, or when the document has
    no headings at all.
    """
    lines = markdown.splitlines(keepends=True)
    total_lines = len(lines)
    headings = parse_markdown(markdown).headings
    entries: list[tuple[ExplodeEntry, str]] = []
    used_ids: dict[str, int] = {}

    # Heading line numbers are used as 1-based indexes into `lines`.
    first_heading_line = headings[0].line if headings else total_lines + 1
    preamble_text = "".join(lines[body_start_line - 1:first_heading_line - 1])
    if preamble_text or not headings:
        preamble = ExplodeEntry(
            kind="preamble",
            file="00-preamble.md",
            order=len(entries),
            unit_id="preamble",
            line_start=body_start_line,
            line_end=max(first_heading_line - 1, body_start_line),
            content_hash=_hash_text(preamble_text),
        )
        entries.append((preamble, preamble_text))

    hierarchy: dict[int, str] = {}
    for position, heading in enumerate(headings, start=1):
        start = heading.line
        is_last = position >= len(headings)
        end = total_lines if is_last else headings[position].line - 1
        text = "".join(lines[start - 1:end])
        unit_id = _dedupe_id(_slug(_heading_title(heading)), used_ids)
        file_path = _entry_file_for_heading(heading, position, unit_id, variant, hierarchy)
        section = ExplodeEntry(
            kind="section",
            file=file_path,
            # `order` is the running position across preamble and sections.
            order=len(entries),
            unit_id=unit_id,
            line_start=start,
            line_end=end,
            heading_level=heading.level,
            heading_text=heading.text,
            content_hash=_hash_text(text),
        )
        entries.append((section, text))
    return entries
def _entry_file_for_heading(
heading: Heading,
index: int,
unit_id: str,
variant: str,
hierarchy: dict[int, str],
) -> str:
filename = f"{index:02d}-{unit_id}.md"
if variant == "flat":
return f"sections/{filename}"
for level in list(hierarchy):
if level >= heading.level:
del hierarchy[level]
parents = [hierarchy[level] for level in sorted(hierarchy) if level < heading.level]
hierarchy[heading.level] = f"{index:02d}-{unit_id}"
return str(Path(*parents, filename)) if parents else filename
def _entry_from_mapping(data: Any) -> ExplodeEntry:
    """Build an :class:`ExplodeEntry` from one parsed manifest entry.

    ``kind``, ``file``, ``order``, ``unit_id``, ``line_start`` and
    ``line_end`` are required; ``heading_level``, ``heading_text`` and
    ``content_hash`` are optional.

    Raises:
        ExplodeError: if ``data`` is not a mapping, a required field is
            missing, or a numeric field cannot be converted to ``int``.
    """
    if not isinstance(data, dict):
        raise ExplodeError("Explode manifest entry must be a mapping")

    heading_level = data.get("heading_level")
    heading_text = data.get("heading_text")
    try:
        return ExplodeEntry(
            kind=str(data["kind"]),
            file=str(data["file"]),
            order=int(data["order"]),
            unit_id=str(data["unit_id"]),
            line_start=int(data["line_start"]),
            line_end=int(data["line_end"]),
            heading_level=int(heading_level) if heading_level is not None else None,
            heading_text=str(heading_text) if heading_text is not None else None,
            content_hash=str(data.get("content_hash", "")),
        )
    except KeyError as exc:
        # A hand-edited manifest must fail with the module's own error type
        # so the CLI can report it instead of showing a traceback.
        raise ExplodeError(
            f"Explode manifest entry is missing required field: {exc.args[0]}"
        ) from exc
    except (TypeError, ValueError) as exc:
        raise ExplodeError(f"Explode manifest entry has an invalid value: {exc}") from exc
def _safe_entry_path(root: Path, relative_path: str) -> Path:
path = Path(relative_path)
if path.is_absolute():
raise ExplodeError(f"Exploded entry path must be relative: {relative_path}")
resolved = (root / path).resolve()
try:
resolved.relative_to(root.resolve())
except ValueError as exc:
raise ExplodeError(f"Exploded entry path escapes directory: {relative_path}") from exc
return resolved
def _split_frontmatter_raw(markdown: str) -> tuple[str, int]:
if not markdown.startswith("---\n"):
return "", 1
end = markdown.find("\n---", 4)
if end == -1:
return "", 1
closing_end = markdown.find("\n", end + 4)
if closing_end == -1:
closing_end = len(markdown)
else:
closing_end += 1
frontmatter_raw = markdown[:closing_end]
return frontmatter_raw, frontmatter_raw.count("\n") + 1
def _heading_title(heading: Heading) -> str:
text = re.sub(r"\s+\{#[A-Za-z0-9_.:-]+\}\s*$", "", heading.text.strip())
return text or "section"
def _dedupe_id(unit_id: str, used_ids: dict[str, int]) -> str:
count = used_ids.get(unit_id, 0) + 1
used_ids[unit_id] = count
return unit_id if count == 1 else f"{unit_id}-{count}"
def _slug(value: str) -> str:
slug = re.sub(r"[^a-z0-9_.:-]+", "-", value.strip().lower())
slug = re.sub(r"-+", "-", slug).strip("-")
return slug or "section"
def _hash_text(text: str) -> str:
return "sha256:" + hashlib.sha256(text.encode("utf-8")).hexdigest()

View File

@@ -0,0 +1,23 @@
"""Markdown-native literate weave/tangle workflows."""
from markitect_tool.literate.engine import (
CodeChunk,
LiterateFile,
TangleResult,
WeaveResult,
discover_code_chunks,
tangle_markdown,
weave_markdown,
write_tangle_files,
)
__all__ = [
"CodeChunk",
"LiterateFile",
"TangleResult",
"WeaveResult",
"discover_code_chunks",
"tangle_markdown",
"weave_markdown",
"write_tangle_files",
]

View File

@@ -0,0 +1,317 @@
"""Literate programming helpers for Markdown fenced code chunks."""
from __future__ import annotations
import hashlib
import re
import shlex
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any
from markdown_it import MarkdownIt
from markitect_tool.diagnostics import Diagnostic, SourceLocation
from markitect_tool.ops import OperationProvenance
@dataclass(frozen=True)
class CodeChunk:
    """A fenced code block that carries an explicit chunk id."""

    chunk_id: str
    content: str
    language: str | None = None
    target_path: str | None = None
    references: list[str] = field(default_factory=list)
    source_path: str | None = None
    line_start: int | None = None
    line_end: int | None = None
    content_hash: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dict, omitting ``None``, ``[]`` and ``""`` values."""
        serialized: dict[str, Any] = {}
        for name, item in asdict(self).items():
            if item in (None, [], ""):
                continue
            serialized[name] = item
        return serialized
@dataclass(frozen=True)
class LiterateFile:
    """A single output file produced by tangling."""

    path: str
    content: str
    chunk_ids: list[str]

    def to_dict(self) -> dict[str, Any]:
        """Serialize every field into a plain dict."""
        serialized: dict[str, Any] = asdict(self)
        return serialized
@dataclass(frozen=True)
class TangleResult:
    """Outcome of tangling: generated files, chunks, diagnostics, provenance."""

    files: list[LiterateFile]
    chunks: list[CodeChunk]
    diagnostics: list[Diagnostic] = field(default_factory=list)
    provenance: list[OperationProvenance] = field(default_factory=list)

    @property
    def valid(self) -> bool:
        """``True`` when no diagnostic has severity ``"error"``."""
        return all(entry.severity != "error" for entry in self.diagnostics)

    def to_dict(self) -> dict[str, Any]:
        """Serialize the result, including the computed ``valid`` flag."""
        serialized: dict[str, Any] = {"valid": self.valid}
        serialized["files"] = [item.to_dict() for item in self.files]
        serialized["chunks"] = [item.to_dict() for item in self.chunks]
        serialized["diagnostics"] = [item.to_dict() for item in self.diagnostics]
        serialized["provenance"] = [item.to_dict() for item in self.provenance]
        return serialized
@dataclass(frozen=True)
class WeaveResult:
    """Woven Markdown text together with the chunks found in it."""

    markdown: str
    chunks: list[CodeChunk]

    def to_dict(self) -> dict[str, Any]:
        """Serialize the woven Markdown and each chunk's dict form."""
        chunk_dicts = [item.to_dict() for item in self.chunks]
        return {"markdown": self.markdown, "chunks": chunk_dicts}
# Matches a ``<<chunk-id>>`` reference anywhere in chunk content; the id is
# captured in the ``id`` group.
_CHUNK_REF_RE = re.compile(r"<<(?P<id>[A-Za-z0-9_.:-]+)>>")
# Matches a reference that is alone on its line (only spaces/tabs around it),
# additionally capturing the leading whitespace in the ``indent`` group.
_CHUNK_LINE_REF_RE = re.compile(r"^(?P<indent>[ \t]*)<<(?P<id>[A-Za-z0-9_.:-]+)>>[ \t]*$", re.MULTILINE)
def discover_code_chunks(
    markdown: str,
    *,
    source_path: str | Path | None = None,
) -> list[CodeChunk]:
    """Collect fenced code blocks that declare an id, in document order.

    Fences without an ``id`` attribute are ignored. Ids are slug-normalized
    and de-duplicated; line numbers are 1-based and taken from the token map
    when the parser provides one.
    """
    md = MarkdownIt("commonmark", {"tables": True}).enable("table")
    origin = str(source_path) if source_path else None
    seen: dict[str, int] = {}
    found: list[CodeChunk] = []

    fences = (tok for tok in md.parse(markdown) if tok.type == "fence")
    for fence in fences:
        info = _parse_fence_info(fence.info)
        raw_id = info.get("id")
        if not raw_id:
            continue
        unique_id = _dedupe_id(_slug(raw_id), seen)
        if fence.map:
            first_line, last_line = fence.map[0] + 1, fence.map[1]
        else:
            first_line = last_line = None
        body = fence.content
        found.append(
            CodeChunk(
                chunk_id=unique_id,
                content=body,
                language=info.get("language"),
                target_path=info.get("tangle") or info.get("target"),
                references=_chunk_references(body),
                source_path=origin,
                line_start=first_line,
                line_end=last_line,
                content_hash=_hash_text(body),
            )
        )
    return found
def tangle_markdown(
    markdown: str,
    *,
    source_path: str | Path | None = None,
) -> TangleResult:
    """Expand chunks that name a target path and group them into output files.

    Chunks sharing a target are concatenated in document order. Each
    contributing chunk yields one ``literate.tangle`` provenance event, and
    problems found during expansion are collected as diagnostics.
    """
    all_chunks = discover_code_chunks(markdown, source_path=source_path)
    lookup = {item.chunk_id: item for item in all_chunks}
    problems: list[Diagnostic] = []
    events: list[OperationProvenance] = []

    # Group by target while preserving first-seen target order.
    by_target: dict[str, list[CodeChunk]] = {}
    for item in all_chunks:
        if not item.target_path:
            continue
        by_target.setdefault(item.target_path, []).append(item)

    outputs: list[LiterateFile] = []
    for destination, members in by_target.items():
        expanded: list[str] = []
        for member in members:
            expanded.append(_expand_chunk(member, lookup, problems, []))
            event = OperationProvenance(
                operation="literate.tangle",
                source_path=member.source_path,
                line_start=member.line_start,
                line_end=member.line_end,
                target_path=destination,
                dependencies=[member.source_path] if member.source_path else [],
                metadata={"chunk_id": member.chunk_id, "references": member.references},
            )
            events.append(event)
        outputs.append(
            LiterateFile(
                path=destination,
                content=_join_tangled_parts(expanded),
                chunk_ids=[member.chunk_id for member in members],
            )
        )

    return TangleResult(
        files=outputs,
        chunks=all_chunks,
        diagnostics=problems,
        provenance=events,
    )
def weave_markdown(
    markdown: str,
    *,
    source_path: str | Path | None = None,
) -> WeaveResult:
    """Return the Markdown with a ``## Code Chunk Index`` section appended.

    When the document has no named chunks the input text is returned
    unchanged with an empty chunk list.
    """
    chunks = discover_code_chunks(markdown, source_path=source_path)
    if not chunks:
        return WeaveResult(markdown=markdown, chunks=[])

    def describe(chunk: CodeChunk) -> str:
        # One bullet: id, optional target, optional references, optional line.
        pieces = [f"- `{chunk.chunk_id}`"]
        if chunk.target_path:
            pieces.append(f" -> `{chunk.target_path}`")
        if chunk.references:
            quoted = ", ".join(f"`{ref}`" for ref in chunk.references)
            pieces.append(f"; refs: {quoted}")
        if chunk.line_start:
            pieces.append(f" line {chunk.line_start}")
        return "".join(pieces)

    index = [markdown.rstrip(), "", "## Code Chunk Index", ""]
    index.extend(describe(chunk) for chunk in chunks)
    woven = "\n".join(index).rstrip() + "\n"
    return WeaveResult(markdown=woven, chunks=chunks)
def write_tangle_files(result: TangleResult, output_dir: str | Path) -> list[str]:
    """Write every tangled file below ``output_dir`` and return the paths written.

    The output directory and any intermediate directories are created as
    needed. Each target is validated with ``_safe_output_path`` before it is
    written as UTF-8 text.
    """
    base = Path(output_dir)
    base.mkdir(parents=True, exist_ok=True)

    paths: list[str] = []
    for entry in result.files:
        destination = _safe_output_path(base, entry.path)
        destination.parent.mkdir(parents=True, exist_ok=True)
        destination.write_text(entry.content, encoding="utf-8")
        paths.append(str(destination))
    return paths
def _expand_chunk(
    chunk: CodeChunk,
    chunks_by_id: dict[str, CodeChunk],
    diagnostics: list[Diagnostic],
    stack: list[str],
) -> str:
    """Return ``chunk.content`` with every ``<<id>>`` reference expanded.

    Args:
        chunk: The chunk being rendered.
        chunks_by_id: Lookup of all discovered chunks.
        diagnostics: Collector that receives cycle and missing-chunk errors.
        stack: Ids of the chunks currently being expanded (outermost first).

    A reference that stands alone on its line is replaced by the referenced
    content with the line's indentation applied to every non-empty line.
    Any other reference is replaced inline. When ``chunk`` is already on the
    stack a ``literate.chunk_cycle`` error is recorded and the literal
    ``<<id>>`` placeholder is returned.

    Each source line is processed exactly once, so text produced by an
    expansion is never scanned again. This keeps placeholders left behind for
    missing or cyclic references from being reported a second time.
    """
    if chunk.chunk_id in stack:
        diagnostics.append(
            Diagnostic(
                severity="error",
                code="literate.chunk_cycle",
                message="Cyclic chunk reference: " + " -> ".join(stack + [chunk.chunk_id]),
                source=SourceLocation(path=chunk.source_path, line=chunk.line_start),
            )
        )
        return f"<<{chunk.chunk_id}>>"

    nested_stack = stack + [chunk.chunk_id]

    def expand(reference_id: str) -> str:
        return _expand_reference(reference_id, chunks_by_id, diagnostics, nested_stack, chunk)

    def expand_inline(match: re.Match[str]) -> str:
        return expand(match.group("id"))

    rendered_lines: list[str] = []
    # Split on "\n" only, matching the line boundaries the MULTILINE regex uses.
    for line in chunk.content.split("\n"):
        whole_line = _CHUNK_LINE_REF_RE.fullmatch(line)
        if whole_line is None:
            rendered_lines.append(_CHUNK_REF_RE.sub(expand_inline, line))
            continue
        indent = whole_line.group("indent")
        expanded = expand(whole_line.group("id"))
        rendered_lines.append(
            "\n".join(f"{indent}{part}" if part else part for part in expanded.splitlines())
        )
    return "\n".join(rendered_lines)
def _expand_reference(
    chunk_id: str,
    chunks_by_id: dict[str, CodeChunk],
    diagnostics: list[Diagnostic],
    stack: list[str],
    source_chunk: CodeChunk,
) -> str:
    """Expand one ``<<chunk_id>>`` reference found inside ``source_chunk``.

    Chunk ids are slug-normalized when chunks are discovered, so the lookup
    first tries the reference text as written and then its slug. This lets a
    reference such as ``<<Setup>>`` resolve to a chunk declared as
    ``{#Setup}``.

    Returns the expanded content of the referenced chunk. When no chunk
    matches, a ``literate.missing_chunk`` error is recorded against the
    referencing chunk and the literal ``<<chunk_id>>`` placeholder is returned.
    """
    referenced = chunks_by_id.get(chunk_id)
    if referenced is None:
        # Retry with the same normalization that discovery applied to ids.
        referenced = chunks_by_id.get(_slug(chunk_id))
    if referenced is None:
        diagnostics.append(
            Diagnostic(
                severity="error",
                code="literate.missing_chunk",
                message=f"Missing chunk reference `{chunk_id}`",
                source=SourceLocation(path=source_chunk.source_path, line=source_chunk.line_start),
            )
        )
        return f"<<{chunk_id}>>"
    return _expand_chunk(referenced, chunks_by_id, diagnostics, stack)
def _join_tangled_parts(parts: list[str]) -> str:
    """Join rendered chunks with single newlines and end with one newline.

    Trailing newlines are removed from each part before joining. An empty
    joined result yields ``""``; otherwise trailing whitespace is stripped
    and a single ``"\\n"`` is appended.
    """
    trimmed = [part.rstrip("\n") for part in parts if part is not None]
    joined = "\n".join(trimmed)
    if not joined:
        return ""
    return joined.rstrip() + "\n"
def _safe_output_path(root: Path, relative_path: str) -> Path:
    """Resolve a tangle target and confirm it stays inside ``root``.

    Raises:
        ValueError: if ``relative_path`` is absolute, or if the resolved
            location is not located under the resolved ``root``.
    """
    candidate = Path(relative_path)
    if candidate.is_absolute():
        raise ValueError(f"Tangle target must be relative: {relative_path}")

    target = (root / candidate).resolve()
    try:
        # relative_to() raises ValueError when target is outside the root.
        target.relative_to(root.resolve())
    except ValueError as exc:
        raise ValueError(f"Tangle target escapes output directory: {relative_path}") from exc
    else:
        return target
def _parse_fence_info(info: str) -> dict[str, str]:
    """Parse a fence info string of the form ``language {attrs}``.

    Returns the parsed attributes, plus a ``"language"`` entry when a
    language token is present. If the info string does not fit the expected
    shape, the whole stripped string is returned as the language (or an
    empty dict when it is blank).
    """
    stripped = info.strip()
    parsed = re.match(r"^(?P<language>[^\s{]+)?(?:\s+\{(?P<attrs>.*)\})?\s*$", stripped)
    if parsed is None:
        return {"language": stripped} if stripped else {}

    result = _parse_attrs(parsed.group("attrs") or "")
    fence_language = parsed.group("language")
    if fence_language:
        result["language"] = fence_language
    return result
def _parse_attrs(raw: str) -> dict[str, str]:
    """Parse the text inside a fence attribute block into a dict.

    Supported tokens:

    * ``#name`` sets ``id`` to ``name``
    * ``key=value`` (value may be shell-quoted) sets ``key``
    * a bare ``flag`` is stored as ``"true"``

    Tokens are split with ``shlex``. If the text contains an unbalanced
    quote, ``shlex`` raises ``ValueError``; instead of aborting discovery of
    the whole document, the text is then split on whitespace and stray quote
    characters are trimmed from values.
    """
    try:
        tokens = shlex.split(raw)
        lenient = False
    except ValueError:
        # Unterminated quote: degrade to plain whitespace splitting.
        tokens = raw.split()
        lenient = True

    attrs: dict[str, str] = {}
    for token in tokens:
        if token.startswith("#") and len(token) > 1:
            attrs["id"] = token[1:]
            continue
        key, separator, value = token.partition("=")
        if not separator:
            attrs[token] = "true"
            continue
        value = value.strip()
        if lenient:
            value = value.strip("\"'")
        attrs[key.strip()] = value
    return attrs
def _chunk_references(content: str) -> list[str]:
    """List the ids of all ``<<id>>`` references in ``content``, in order."""
    # The pattern has a single capturing group, so findall yields the ids.
    return _CHUNK_REF_RE.findall(content)
def _dedupe_id(unit_id: str, used_ids: dict[str, int]) -> str:
    """Return an id derived from ``unit_id`` that has not been issued before.

    ``used_ids`` is updated in place. It records how often each base id was
    requested and also every suffixed id that was handed out, so a generated
    ``name-2`` can never collide with a literal ``name-2`` seen earlier or
    later in the same document.

    The first request for an id returns it unchanged; later requests return
    ``{unit_id}-{n}`` with the smallest unused ``n`` at or above the request
    count.
    """
    count = used_ids.get(unit_id, 0) + 1
    used_ids[unit_id] = count
    if count == 1:
        return unit_id

    candidate = f"{unit_id}-{count}"
    # Skip suffixes that were already issued or that appeared literally.
    while candidate in used_ids:
        count += 1
        candidate = f"{unit_id}-{count}"
    used_ids[unit_id] = count
    used_ids[candidate] = 1
    return candidate
def _slug(value: str) -> str:
    """Normalize ``value`` into a lowercase, dash-separated chunk id.

    Runs of characters outside ``a-z0-9_.:-`` become a single dash, repeated
    dashes are collapsed, and leading/trailing dashes are removed. An empty
    result is replaced by ``"chunk"``.
    """
    lowered = value.strip().lower()
    dashed = re.sub(r"[^a-z0-9_.:-]+", "-", lowered)
    collapsed = re.sub(r"-+", "-", dashed)
    trimmed = collapsed.strip("-")
    return trimmed if trimmed else "chunk"
def _hash_text(text: str) -> str:
    """Return ``"sha256:"`` followed by the hex SHA-256 of ``text`` as UTF-8."""
    digest = hashlib.sha256()
    digest.update(text.encode("utf-8"))
    return f"sha256:{digest.hexdigest()}"

View File

@@ -4,6 +4,7 @@ from markitect_tool.ops.engine import (
ComposeResult,
IncludeError,
IncludeResult,
OperationProvenance,
TransformResult,
compose_files,
resolve_includes,
@@ -14,6 +15,7 @@ __all__ = [
"ComposeResult",
"IncludeError",
"IncludeResult",
"OperationProvenance",
"TransformResult",
"compose_files",
"resolve_includes",

View File

@@ -9,6 +9,7 @@ from pathlib import Path
from typing import Any
import yaml
from markdown_it import MarkdownIt
from markitect_tool.core import parse_markdown
from markitect_tool.query import extract_document
@@ -18,15 +19,46 @@ class IncludeError(ValueError):
"""Raised when include resolution cannot continue."""
@dataclass(frozen=True)
class OperationProvenance:
    """Record describing one deterministic Markdown operation and its inputs."""

    operation: str
    source_path: str | None = None
    line_start: int | None = None
    line_end: int | None = None
    target_path: str | None = None
    dependencies: list[str] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a dict, omitting ``None`` fields and empty collections."""
        candidates = (
            ("operation", self.operation),
            ("source_path", self.source_path),
            ("line_start", self.line_start),
            ("line_end", self.line_end),
            ("target_path", self.target_path),
            # Empty collections collapse to None so they are dropped below.
            ("dependencies", self.dependencies or None),
            ("metadata", self.metadata or None),
        )
        return {name: item for name, item in candidates if item is not None}
@dataclass(frozen=True)
class TransformResult:
"""Result of a deterministic Markdown transform."""
markdown: str
operations: list[str] = field(default_factory=list)
provenance: list[OperationProvenance] = field(default_factory=list)
def to_dict(self) -> dict[str, Any]:
return asdict(self)
data: dict[str, Any] = {
"markdown": self.markdown,
"operations": self.operations,
"provenance": [event.to_dict() for event in self.provenance],
}
return {key: value for key, value in data.items() if value}
@dataclass(frozen=True)
@@ -46,9 +78,15 @@ class IncludeResult:
markdown: str
included_paths: list[str] = field(default_factory=list)
provenance: list[OperationProvenance] = field(default_factory=list)
def to_dict(self) -> dict[str, Any]:
return asdict(self)
data: dict[str, Any] = {
"markdown": self.markdown,
"included_paths": self.included_paths,
"provenance": [event.to_dict() for event in self.provenance],
}
return {key: value for key, value in data.items() if value}
_COMMENT_INCLUDE_RE = re.compile(r"<!--\s*mkt:include\s+(?P<attrs>.*?)\s*-->", re.DOTALL)
@@ -68,15 +106,30 @@ def transform_markdown(
"""Apply deterministic operations to one Markdown document."""
operations: list[str] = []
provenance: list[OperationProvenance] = []
frontmatter, body = _split_frontmatter(markdown)
if set_frontmatter:
frontmatter = _deep_merge(frontmatter, set_frontmatter)
operations.append("set_frontmatter")
provenance.append(
OperationProvenance(
operation="set_frontmatter",
source_path=source_path,
metadata={"keys": sorted(set_frontmatter.keys())},
)
)
if heading_delta:
body = shift_heading_levels(body, heading_delta)
body, affected_lines = _shift_heading_levels(body, heading_delta)
operations.append(f"shift_headings:{heading_delta}")
provenance.append(
OperationProvenance(
operation="shift_headings",
source_path=source_path,
metadata={"delta": heading_delta, "affected_lines": affected_lines},
)
)
if extract_selector:
document_text = _join_frontmatter(frontmatter, body) if frontmatter else body
@@ -84,24 +137,71 @@ def transform_markdown(
body = "\n\n".join(extract_document(document, extract_selector))
frontmatter = {}
operations.append(f"extract:{extract_selector}")
provenance.append(
OperationProvenance(
operation="extract",
source_path=source_path,
metadata={"selector": extract_selector},
)
)
if strip_frontmatter:
frontmatter = {}
operations.append("strip_frontmatter")
provenance.append(
OperationProvenance(
operation="strip_frontmatter",
source_path=source_path,
)
)
return TransformResult(markdown=_join_frontmatter(frontmatter, body), operations=operations)
return TransformResult(
markdown=_join_frontmatter(frontmatter, body),
operations=operations,
provenance=provenance,
)
def shift_heading_levels(markdown: str, delta: int) -> str:
"""Shift ATX heading levels by delta while clamping to levels 1 through 6."""
def replace(match: re.Match[str]) -> str:
shifted, _affected_lines = _shift_heading_levels(markdown, delta)
return shifted
def _shift_heading_levels(markdown: str, delta: int) -> tuple[str, list[int]]:
ignored_lines = _code_line_numbers(markdown)
affected_lines: list[int] = []
rendered_lines: list[str] = []
for line_number, line in enumerate(markdown.splitlines(keepends=True), start=1):
if line_number in ignored_lines:
rendered_lines.append(line)
continue
line_body = line.rstrip("\r\n")
line_ending = line[len(line_body) :]
match = _HEADING_RE.match(line_body)
if not match:
rendered_lines.append(line)
continue
marks = match.group(1)
suffix = match.group(2)
level = min(max(len(marks) + delta, 1), 6)
return f"{'#' * level}{suffix}"
rendered_lines.append(f"{'#' * level}{suffix}{line_ending}")
affected_lines.append(line_number)
return _HEADING_RE.sub(replace, markdown)
return "".join(rendered_lines), affected_lines
def _code_line_numbers(markdown: str) -> set[int]:
    """Return the 1-based line numbers covered by fenced or indented code.

    Only ``fence`` and ``code_block`` tokens that carry a line map contribute.
    """
    md = MarkdownIt("commonmark", {"tables": True}).enable("table")
    code_tokens = [
        tok
        for tok in md.parse(markdown)
        if tok.type in {"fence", "code_block"} and tok.map
    ]
    covered: set[int] = set()
    for tok in code_tokens:
        first, last = tok.map
        # The token map is a 0-based half-open range; shift to 1-based lines.
        covered |= set(range(first + 1, last + 1))
    return covered
def compose_files(
@@ -154,18 +254,22 @@ def resolve_includes(
root = Path(base_dir).resolve()
stack = [Path(current_path).resolve()] if current_path else []
included: list[Path] = []
provenance: list[OperationProvenance] = []
resolved = _resolve_include_text(
markdown,
root=root,
current_dir=Path(current_path).resolve().parent if current_path else root,
source_path=Path(current_path).resolve() if current_path else None,
stack=stack,
included=included,
provenance=provenance,
depth=0,
max_depth=max_depth,
)
return IncludeResult(
markdown=resolved,
included_paths=[str(path) for path in included],
provenance=provenance,
)
@@ -174,34 +278,73 @@ def _resolve_include_text(
*,
root: Path,
current_dir: Path,
source_path: Path | None,
stack: list[Path],
included: list[Path],
provenance: list[OperationProvenance],
depth: int,
max_depth: int,
) -> str:
if depth > max_depth:
raise IncludeError(f"Include depth exceeded max_depth={max_depth}")
def replace_comment(match: re.Match[str]) -> str:
attrs = _parse_include_attrs(match.group("attrs"))
return _render_include(attrs, root, current_dir, stack, included, depth, max_depth)
ignored_lines = _code_line_numbers(markdown)
rendered_lines: list[str] = []
def replace_brace(match: re.Match[str]) -> str:
attrs = {"path": match.group("path").strip()}
return _render_include(attrs, root, current_dir, stack, included, depth, max_depth)
for line_number, line in enumerate(markdown.splitlines(keepends=True), start=1):
if line_number in ignored_lines:
rendered_lines.append(line)
continue
markdown = _COMMENT_INCLUDE_RE.sub(replace_comment, markdown)
return _BRACE_INCLUDE_RE.sub(replace_brace, markdown)
def replace_comment(match: re.Match[str]) -> str:
attrs = _parse_include_attrs(match.group("attrs"))
return _render_include(
attrs,
root,
current_dir,
source_path,
stack,
included,
provenance,
depth,
max_depth,
marker_line=line_number,
)
def replace_brace(match: re.Match[str]) -> str:
attrs = {"path": match.group("path").strip()}
return _render_include(
attrs,
root,
current_dir,
source_path,
stack,
included,
provenance,
depth,
max_depth,
marker_line=line_number,
)
line = _COMMENT_INCLUDE_RE.sub(replace_comment, line)
line = _BRACE_INCLUDE_RE.sub(replace_brace, line)
rendered_lines.append(line)
return "".join(rendered_lines)
def _render_include(
attrs: dict[str, str],
root: Path,
current_dir: Path,
source_path: Path | None,
stack: list[Path],
included: list[Path],
provenance: list[OperationProvenance],
depth: int,
max_depth: int,
*,
marker_line: int,
) -> str:
raw_path = attrs.get("path")
if not raw_path:
@@ -228,12 +371,33 @@ def _render_include(
body = shift_heading_levels(body, heading_delta)
included.append(include_path)
provenance.append(
OperationProvenance(
operation="include",
source_path=str(source_path) if source_path else None,
line_start=marker_line,
line_end=marker_line,
target_path=str(include_path),
dependencies=[str(include_path)],
metadata={
key: value
for key, value in {
"selector": selector,
"heading_delta": heading_delta if heading_delta else None,
"include_frontmatter": attrs.get("include_frontmatter"),
}.items()
if value is not None
},
)
)
return _resolve_include_text(
body.strip(),
root=root,
current_dir=include_path.parent,
source_path=include_path,
stack=stack + [include_path],
included=included,
provenance=provenance,
depth=depth + 1,
max_depth=max_depth,
)

View File

@@ -0,0 +1,27 @@
"""Deterministic fenced-block processor registry."""
from markitect_tool.processor.engine import (
FencedProcessorBlock,
ProcessorContext,
ProcessorOutputFile,
ProcessorRegistry,
ProcessorRequest,
ProcessorResult,
ProcessorRun,
default_processor_registry,
discover_fenced_processors,
run_fenced_processors,
)
__all__ = [
"FencedProcessorBlock",
"ProcessorContext",
"ProcessorOutputFile",
"ProcessorRegistry",
"ProcessorRequest",
"ProcessorResult",
"ProcessorRun",
"default_processor_registry",
"discover_fenced_processors",
"run_fenced_processors",
]

View File

@@ -0,0 +1,374 @@
"""Processor API for deterministic fenced-block workflows."""
from __future__ import annotations
import hashlib
import re
import shlex
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any, Callable
from markdown_it import MarkdownIt
from markitect_tool.diagnostics import Diagnostic, SourceLocation
from markitect_tool.ops import OperationProvenance
from markitect_tool.reference import (
ReferenceContext,
ReferenceResolutionError,
resolve_reference,
)
ProcessorCallable = Callable[["ProcessorRequest"], "ProcessorResult"]
@dataclass(frozen=True)
class FencedProcessorBlock:
    """A fenced block selected for processing, with its parsed attributes."""

    processor: str
    content: str
    unit_id: str
    attrs: dict[str, str]
    language: str | None = None
    source_path: str | None = None
    line_start: int | None = None
    line_end: int | None = None
    content_hash: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dict, omitting ``None``, ``{}`` and ``""`` values."""
        serialized: dict[str, Any] = {}
        for name, item in asdict(self).items():
            if item in (None, {}, ""):
                continue
            serialized[name] = item
        return serialized
@dataclass(frozen=True)
class ProcessorContext:
    """Settings handed to every processor invocation."""

    root: Path = Path(".")
    current_path: Path | None = None
    namespaces: dict[str, str] = field(default_factory=dict)
    variables: dict[str, Any] = field(default_factory=dict)
    policy: dict[str, Any] = field(default_factory=dict)

    def reference_context(self) -> ReferenceContext:
        """Build a ``ReferenceContext`` from the root, current path and namespaces."""
        return ReferenceContext(
            root=self.root,
            current_path=self.current_path,
            namespaces=self.namespaces,
        )

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a dict, omitting ``None``, ``{}`` and ``""`` values."""
        current = str(self.current_path) if self.current_path else None
        candidates = (
            ("root", str(self.root)),
            ("current_path", current),
            ("namespaces", self.namespaces),
            ("variables", self.variables),
            ("policy", self.policy),
        )
        return {name: item for name, item in candidates if item not in (None, {}, "")}
@dataclass(frozen=True)
class ProcessorRequest:
    """Pairs a discovered block with the context it should be processed in."""

    block: FencedProcessorBlock
    context: ProcessorContext
@dataclass(frozen=True)
class ProcessorOutputFile:
    """A file (path plus content) that a processor asks to have generated."""

    path: str
    content: str

    def to_dict(self) -> dict[str, Any]:
        """Serialize both fields into a plain dict."""
        serialized: dict[str, Any] = asdict(self)
        return serialized
@dataclass(frozen=True)
class ProcessorResult:
    """Envelope for what a processor produced: content, files and diagnostics."""

    content: str | None = None
    files: list[ProcessorOutputFile] = field(default_factory=list)
    diagnostics: list[Diagnostic] = field(default_factory=list)
    dependencies: list[str] = field(default_factory=list)
    provenance: list[OperationProvenance] = field(default_factory=list)

    @property
    def valid(self) -> bool:
        """``True`` when no diagnostic has severity ``"error"``."""
        return all(entry.severity != "error" for entry in self.diagnostics)

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a dict, omitting ``None``, ``[]`` and ``{}`` values."""
        candidates = (
            ("valid", self.valid),
            ("content", self.content),
            ("files", [item.to_dict() for item in self.files]),
            ("diagnostics", [item.to_dict() for item in self.diagnostics]),
            ("dependencies", self.dependencies),
            ("provenance", [item.to_dict() for item in self.provenance]),
        )
        return {name: item for name, item in candidates if item not in (None, [], {})}
@dataclass(frozen=True)
class ProcessorRun:
    """All processor blocks of one document together with their results."""

    source_path: str | None
    blocks: list[FencedProcessorBlock]
    results: list[ProcessorResult]

    @property
    def valid(self) -> bool:
        """``True`` when every individual result is valid."""
        return not any(not outcome.valid for outcome in self.results)

    def to_dict(self) -> dict[str, Any]:
        """Serialize the run, including the ``valid`` flag and result count."""
        serialized: dict[str, Any] = {"valid": self.valid}
        serialized["source_path"] = self.source_path
        serialized["count"] = len(self.results)
        serialized["blocks"] = [item.to_dict() for item in self.blocks]
        serialized["results"] = [item.to_dict() for item in self.results]
        return serialized
class ProcessorRegistry:
    """Name-to-callable registry for fenced-block processors.

    Names are slug-normalized both when registering and when looking up.
    """

    def __init__(self) -> None:
        self._processors: dict[str, ProcessorCallable] = {}

    def register(self, name: str, processor: ProcessorCallable) -> None:
        """Register ``processor`` under the normalized form of ``name``.

        Raises:
            ValueError: if the normalized name is empty.
        """
        normalized = _slug(name)
        if not normalized:
            raise ValueError("Processor name cannot be empty")
        self._processors[normalized] = processor

    def names(self) -> list[str]:
        """Return the registered processor names in sorted order."""
        return sorted(self._processors.keys())

    def run(self, request: ProcessorRequest) -> ProcessorResult:
        """Invoke the processor named by the request's block.

        An unregistered name produces a result carrying a single
        ``processor.unknown`` error diagnostic instead of raising.
        """
        block = request.block
        handler = self._processors.get(_slug(block.processor))
        if handler is not None:
            return handler(request)

        unknown = Diagnostic(
            severity="error",
            code="processor.unknown",
            message=f"Unknown processor `{request.block.processor}`",
            source=SourceLocation(
                path=block.source_path,
                line=block.line_start,
            ),
        )
        return ProcessorResult(diagnostics=[unknown])
def default_processor_registry() -> ProcessorRegistry:
    """Build a registry holding the built-in identity, uppercase and include processors."""
    builtin = (
        ("identity", _identity_processor),
        ("uppercase", _uppercase_processor),
        ("include", _include_processor),
    )
    registry = ProcessorRegistry()
    for name, handler in builtin:
        registry.register(name, handler)
    return registry
def discover_fenced_processors(
    markdown: str,
    *,
    source_path: str | Path | None = None,
) -> list[FencedProcessorBlock]:
    """Collect fenced blocks that name a processor, in document order.

    A block without an explicit id gets one derived from the processor name
    and the block's position in the parser's token stream. Ids are
    slug-normalized and de-duplicated.
    """
    md = MarkdownIt("commonmark", {"tables": True}).enable("table")
    origin = str(source_path) if source_path else None
    reserved = {"id", "language", "processor"}
    seen: dict[str, int] = {}
    found: list[FencedProcessorBlock] = []

    for position, tok in enumerate(md.parse(markdown)):
        if tok.type != "fence":
            continue
        info = _parse_fence_info(tok.info)
        name = _processor_name(info)
        if not name:
            continue
        requested_id = info.get("id") or f"{name}-{position}"
        unique_id = _dedupe_id(_slug(requested_id), seen)
        if tok.map:
            first_line, last_line = tok.map[0] + 1, tok.map[1]
        else:
            first_line = last_line = None
        # Everything except the structural keys is passed on to the processor.
        extra = {key: value for key, value in info.items() if key not in reserved}
        found.append(
            FencedProcessorBlock(
                processor=name,
                content=tok.content,
                unit_id=unique_id,
                attrs=extra,
                language=info.get("language"),
                source_path=origin,
                line_start=first_line,
                line_end=last_line,
                content_hash=_hash_text(tok.content),
            )
        )
    return found
def run_fenced_processors(
    markdown: str,
    *,
    context: ProcessorContext,
    registry: ProcessorRegistry | None = None,
    source_path: str | Path | None = None,
) -> ProcessorRun:
    """Discover processor blocks in ``markdown`` and run each one in order.

    Falls back to the default registry when none is given, and to
    ``context.current_path`` when ``source_path`` is not provided.
    """
    chosen_registry = registry if registry else default_processor_registry()
    origin = source_path or context.current_path
    blocks = discover_fenced_processors(markdown, source_path=origin)

    results = []
    for block in blocks:
        request = ProcessorRequest(block=block, context=context)
        results.append(chosen_registry.run(request))

    return ProcessorRun(
        source_path=str(origin) if origin else None,
        blocks=blocks,
        results=results,
    )
def _identity_processor(request: ProcessorRequest) -> ProcessorResult:
    """Return the block content unchanged, with one provenance event."""
    block = request.block
    event = OperationProvenance(
        operation="processor.identity",
        source_path=block.source_path,
        line_start=block.line_start,
        line_end=block.line_end,
        metadata={"unit_id": block.unit_id},
    )
    return ProcessorResult(content=block.content, provenance=[event])
def _uppercase_processor(request: ProcessorRequest) -> ProcessorResult:
    """Return the block content upper-cased, with one provenance event."""
    block = request.block
    event = OperationProvenance(
        operation="processor.uppercase",
        source_path=block.source_path,
        line_start=block.line_start,
        line_end=block.line_end,
        metadata={"unit_id": block.unit_id},
    )
    return ProcessorResult(content=block.content.upper(), provenance=[event])
def _include_processor(request: ProcessorRequest) -> ProcessorResult:
    """Resolve the block's ``ref`` attribute and return the referenced text.

    The texts of all resolved units are joined with blank lines. A missing
    ``ref`` attribute or a ``ReferenceResolutionError`` is reported as an
    error diagnostic located at the block's first line.
    """
    block = request.block

    def failure(code: str, message: str) -> ProcessorResult:
        # Build a result carrying one error diagnostic for this block.
        return ProcessorResult(
            diagnostics=[
                Diagnostic(
                    severity="error",
                    code=code,
                    message=message,
                    source=SourceLocation(
                        path=block.source_path,
                        line=block.line_start,
                    ),
                )
            ]
        )

    reference = block.attrs.get("ref")
    if not reference:
        return failure(
            "processor.include.missing_ref",
            "Include processor requires a `ref` attribute",
        )

    try:
        resolution = resolve_reference(reference, context=request.context.reference_context())
    except ReferenceResolutionError as exc:
        return failure("processor.include.reference_error", str(exc))

    texts = [unit.text for unit in resolution.units]
    unit_ids = [unit.unit_id for unit in resolution.units]
    event = OperationProvenance(
        operation="processor.include",
        source_path=block.source_path,
        line_start=block.line_start,
        line_end=block.line_end,
        target_path=resolution.target_path,
        dependencies=[resolution.target_path],
        metadata={"ref": reference, "unit_ids": unit_ids},
    )
    return ProcessorResult(
        content="\n\n".join(texts),
        dependencies=[resolution.target_path],
        provenance=[event],
    )
def _processor_name(attrs: dict[str, str]) -> str | None:
    """Determine which processor, if any, a fence opts into.

    Precedence: an explicit ``processor`` attribute, then a language of the
    form ``mkt-<name>``, then language ``mkt`` combined with a ``type``
    attribute. Returns ``None`` when none of these apply.
    """
    explicit = attrs.get("processor")
    if "processor" in attrs:
        return explicit

    fence_language = attrs.get("language", "")
    prefix = "mkt-"
    if fence_language.startswith(prefix):
        return fence_language[len(prefix):]
    if fence_language == "mkt" and "type" in attrs:
        return attrs["type"]
    return None
def _parse_fence_info(info: str) -> dict[str, str]:
    """Parse a fence info string of the form ``language {attrs}``.

    Returns the parsed attributes, plus a ``"language"`` entry when a
    language token is present. If the info string does not fit the expected
    shape, the whole stripped string is returned as the language (or an
    empty dict when it is blank).
    """
    stripped = info.strip()
    parsed = re.match(r"^(?P<language>[^\s{]+)?(?:\s+\{(?P<attrs>.*)\})?\s*$", stripped)
    if parsed is None:
        return {"language": stripped} if stripped else {}

    result = _parse_attrs(parsed.group("attrs") or "")
    fence_language = parsed.group("language")
    if fence_language:
        result["language"] = fence_language
    return result
def _parse_attrs(raw: str) -> dict[str, str]:
    """Parse the text inside a fence attribute block into a dict.

    Supported tokens:

    * ``#name`` sets ``id`` to ``name``
    * ``key=value`` (value may be shell-quoted) sets ``key``
    * a bare ``flag`` is stored as ``"true"``

    Tokens are split with ``shlex``. If the text contains an unbalanced
    quote, ``shlex`` raises ``ValueError``; instead of aborting discovery of
    the whole document, the text is then split on whitespace and stray quote
    characters are trimmed from values.
    """
    try:
        tokens = shlex.split(raw)
        lenient = False
    except ValueError:
        # Unterminated quote: degrade to plain whitespace splitting.
        tokens = raw.split()
        lenient = True

    attrs: dict[str, str] = {}
    for token in tokens:
        if token.startswith("#") and len(token) > 1:
            attrs["id"] = token[1:]
            continue
        key, separator, value = token.partition("=")
        if not separator:
            attrs[token] = "true"
            continue
        value = value.strip()
        if lenient:
            value = value.strip("\"'")
        attrs[key.strip()] = value
    return attrs
def _dedupe_id(unit_id: str, used_ids: dict[str, int]) -> str:
    """Return an id derived from ``unit_id`` that has not been issued before.

    ``used_ids`` is updated in place. It records how often each base id was
    requested and also every suffixed id that was handed out, so a generated
    ``name-2`` can never collide with a literal ``name-2`` seen earlier or
    later in the same document.

    The first request for an id returns it unchanged; later requests return
    ``{unit_id}-{n}`` with the smallest unused ``n`` at or above the request
    count.
    """
    count = used_ids.get(unit_id, 0) + 1
    used_ids[unit_id] = count
    if count == 1:
        return unit_id

    candidate = f"{unit_id}-{count}"
    # Skip suffixes that were already issued or that appeared literally.
    while candidate in used_ids:
        count += 1
        candidate = f"{unit_id}-{count}"
    used_ids[unit_id] = count
    used_ids[candidate] = 1
    return candidate
def _slug(value: str) -> str:
    """Normalize ``value`` into a lowercase, dash-separated identifier.

    Runs of characters outside ``a-z0-9_.:-`` become a single dash, repeated
    dashes are collapsed, and leading/trailing dashes are removed. The result
    may be an empty string.
    """
    lowered = value.strip().lower()
    dashed = re.sub(r"[^a-z0-9_.:-]+", "-", lowered)
    collapsed = re.sub(r"-+", "-", dashed)
    return collapsed.strip("-")
def _hash_text(text: str) -> str:
    """Return ``"sha256:"`` followed by the hex SHA-256 of ``text`` as UTF-8."""
    digest = hashlib.sha256()
    digest.update(text.encode("utf-8"))
    return f"sha256:{digest.hexdigest()}"

View File

@@ -0,0 +1,25 @@
"""Namespaced content reference resolution for Markdown artifacts."""
from markitect_tool.reference.engine import (
ContentUnit,
ReferenceAddress,
ReferenceContext,
ReferenceResolution,
ReferenceResolutionError,
SourceSpan,
load_namespaces,
parse_reference,
resolve_reference,
)
__all__ = [
"ContentUnit",
"ReferenceAddress",
"ReferenceContext",
"ReferenceResolution",
"ReferenceResolutionError",
"SourceSpan",
"load_namespaces",
"parse_reference",
"resolve_reference",
]

View File

@@ -0,0 +1,626 @@
"""Reference parsing and resolution for Markdown content units."""
from __future__ import annotations
import hashlib
import re
import shlex
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any
from markdown_it import MarkdownIt
from markitect_tool.core import ContentBlock, Document, Heading, Section, parse_markdown
from markitect_tool.query import InvalidQueryError, QueryMatch, query_document
class ReferenceResolutionError(ValueError):
    """Signals that a content reference could not be parsed or resolved."""
@dataclass(frozen=True)
class ReferenceAddress:
    """The parts of a parsed content reference.

    The compact syntax accepts, for example:

    - ``path/to/file.md``
    - ``std:clauses/payment.md``
    - ``std:clauses/payment.md#section:terms``
    - ``std:clauses/payment.md::sections[heading=Terms]``
    - ``#intro`` (a fragment within the current document)
    """

    raw: str
    namespace: str | None = None
    address: str = ""
    fragment: str | None = None
    selector: str | None = None

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a dict, omitting ``None`` and empty-string fields."""
        serialized: dict[str, Any] = {}
        for name, item in asdict(self).items():
            if item is None or item == "":
                continue
            serialized[name] = item
        return serialized
@dataclass(frozen=True)
class ReferenceContext:
    """Root directory, current file and namespace map used during resolution."""

    root: Path = Path(".")
    current_path: Path | None = None
    namespaces: dict[str, str] = field(default_factory=dict)

    @classmethod
    def from_document(
        cls,
        document: Document,
        *,
        root: str | Path = ".",
        current_path: str | Path | None = None,
    ) -> "ReferenceContext":
        """Create a context whose namespaces come from the document frontmatter.

        ``current_path`` defaults to the document's own ``source_path``.
        """
        origin = current_path or document.source_path
        resolved_current = Path(origin) if origin else None
        return cls(
            root=Path(root),
            current_path=resolved_current,
            namespaces=load_namespaces(document.frontmatter),
        )

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a dict, omitting fields that are ``None``."""
        current = str(self.current_path) if self.current_path else None
        candidates = (
            ("root", str(self.root)),
            ("current_path", current),
            ("namespaces", self.namespaces),
        )
        return {name: item for name, item in candidates if item is not None}
@dataclass(frozen=True)
class SourceSpan:
    """First and last line of a resolved unit within its source file."""

    line_start: int | None = None
    line_end: int | None = None

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a dict, omitting fields that are ``None``."""
        serialized: dict[str, Any] = {}
        for name, item in asdict(self).items():
            if item is not None:
                serialized[name] = item
        return serialized
@dataclass(frozen=True)
class ContentUnit:
    """A single addressable piece of Markdown content and where it came from."""

    kind: str
    unit_id: str
    text: str
    source_path: str
    span: SourceSpan | None = None
    name: str | None = None
    content_hash: str = ""
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a dict, omitting ``None`` fields and empty metadata."""
        span_dict = self.span.to_dict() if self.span else None
        candidates = (
            ("kind", self.kind),
            ("unit_id", self.unit_id),
            ("name", self.name),
            ("source_path", self.source_path),
            ("span", span_dict),
            ("content_hash", self.content_hash),
            # Empty metadata collapses to None so it is dropped below.
            ("metadata", self.metadata or None),
            ("text", self.text),
        )
        return {name: item for name, item in candidates if item is not None}
@dataclass(frozen=True)
class ReferenceResolution:
    """A resolved reference: the address, both file paths and the matched units."""

    reference: ReferenceAddress
    source_path: str
    target_path: str
    units: list[ContentUnit]

    def to_dict(self) -> dict[str, Any]:
        """Serialize the resolution, including the number of matched units."""
        serialized: dict[str, Any] = {"reference": self.reference.to_dict()}
        serialized["source_path"] = self.source_path
        serialized["target_path"] = self.target_path
        serialized["count"] = len(self.units)
        serialized["units"] = [item.to_dict() for item in self.units]
        return serialized
# Splits ``namespace:address``; the namespace must start with a letter and may
# contain letters, digits, ``_``, ``.`` and ``-``.
_NAMESPACE_RE = re.compile(r"^(?P<namespace>[A-Za-z][A-Za-z0-9_.-]*):(?P<address>.*)$")
# Splits heading text into its title and an optional trailing ``{#id}`` marker.
_HEADING_ID_RE = re.compile(r"^(?P<title>.*?)(?:\s+\{#(?P<id>[A-Za-z0-9_.:-]+)\})?$")
# Opening region marker comment: ``<!-- mkt:region ATTRS -->``.
_REGION_OPEN_RE = re.compile(r"<!--\s*mkt:region\s+(?P<attrs>.*?)\s*-->")
# Closing region marker comment: ``<!-- /mkt:region -->``.
_REGION_CLOSE_RE = re.compile(r"<!--\s*/mkt:region\s*-->")
# Fence info string: optional language token followed by optional ``{attrs}``.
_FENCE_ATTRS_RE = re.compile(r"^(?P<language>[^\s{]+)?(?:\s+\{(?P<attrs>.*)\})?\s*$")
def parse_reference(reference: str) -> ReferenceAddress:
    """Split a compact reference into namespace, address, fragment and selector.

    The selector (after the first `::`) is removed first, then the fragment
    (after the first `#`), and finally an optional `namespace:` prefix is
    taken off what remains.

    Raises:
        ReferenceResolutionError: if the reference is blank, or if a `::` or
            `#` separator is present but followed by nothing.
    """
    raw = reference.strip()
    if not raw:
        raise ReferenceResolutionError("Reference cannot be empty")

    # Peel the selector off first so a `#` inside it is never read as a fragment.
    remainder, selector_sep, selector_text = raw.partition("::")
    selector: str | None = None
    if selector_sep:
        selector = selector_text.strip()
        if not selector:
            raise ReferenceResolutionError(f"Reference selector is empty in `{reference}`")

    remainder, fragment_sep, fragment_text = remainder.partition("#")
    fragment: str | None = None
    if fragment_sep:
        fragment = fragment_text.strip()
        if not fragment:
            raise ReferenceResolutionError(f"Reference fragment is empty in `{reference}`")

    address = remainder.strip()
    namespace: str | None = None
    prefix = _NAMESPACE_RE.match(address)
    if prefix:
        candidate = prefix.group("namespace")
        if not ("/" in candidate or "\\" in candidate):
            namespace = candidate
            address = prefix.group("address").strip()

    return ReferenceAddress(
        raw=raw,
        namespace=namespace,
        address=address,
        fragment=fragment,
        selector=selector,
    )
def load_namespaces(frontmatter: dict[str, Any]) -> dict[str, str]:
    """Read and validate the `namespaces` mapping from Markdown frontmatter.

    Keys are stripped of surrounding whitespace and trailing colons; values
    are stripped of surrounding whitespace. A missing or null `namespaces`
    entry yields an empty mapping.

    Raises:
        ReferenceResolutionError: if `namespaces` is not a mapping, a key is
            empty or not a valid namespace name, or a value is not a
            non-empty string.
    """
    declared = frontmatter.get("namespaces", {})
    if declared is None:
        return {}
    if not isinstance(declared, dict):
        raise ReferenceResolutionError("Frontmatter `namespaces` must be a mapping")

    validated: dict[str, str] = {}
    for declared_key, declared_target in declared.items():
        name = str(declared_key).strip().rstrip(":")
        if not name:
            raise ReferenceResolutionError("Namespace keys cannot be empty")
        # Validate the key with the same pattern used to parse `name:address`.
        if _NAMESPACE_RE.match(f"{name}:") is None:
            raise ReferenceResolutionError(f"Invalid namespace key `{declared_key}`")
        if not isinstance(declared_target, str):
            raise ReferenceResolutionError(f"Namespace `{name}` must map to a string path")
        target = declared_target.strip()
        if not target:
            raise ReferenceResolutionError(f"Namespace `{name}` cannot map to an empty path")
        validated[name] = target
    return validated
def resolve_reference(
    reference: str | ReferenceAddress,
    *,
    context: ReferenceContext,
) -> ReferenceResolution:
    """Resolve a reference (string or pre-parsed) into its content units.

    The target file is located relative to `context`, read as UTF-8 and
    parsed. A selector, a fragment, or neither (the whole document) then
    picks the units.

    Raises:
        ReferenceResolutionError: if the target is not an existing file, both
            a fragment and a selector are given, or nothing matches.
    """
    if isinstance(reference, str):
        address = parse_reference(reference)
    else:
        address = reference

    root = context.root.resolve()
    source_path = root if not context.current_path else context.current_path.resolve()
    target_path = _resolve_target_path(address, context, root, source_path)
    if not (target_path.exists() and target_path.is_file()):
        raise ReferenceResolutionError(f"Referenced file not found: {target_path}")

    markdown = target_path.read_text(encoding="utf-8")
    document = parse_markdown(markdown, source_path=str(target_path))

    selector, fragment = address.selector, address.fragment
    if selector and fragment:
        raise ReferenceResolutionError("Reference cannot use both fragment and selector")
    if selector:
        units = _units_from_selector(document, selector, target_path)
    elif fragment:
        units = _units_from_fragment(document, fragment, target_path, markdown)
    else:
        units = [_document_unit(document, target_path, markdown)]

    if not units:
        raise ReferenceResolutionError(f"Reference `{address.raw}` did not match any content units")
    return ReferenceResolution(
        reference=address,
        source_path=str(source_path),
        target_path=str(target_path),
        units=units,
    )
def _resolve_target_path(
    address: ReferenceAddress,
    context: ReferenceContext,
    root: Path,
    source_path: Path,
) -> Path:
    """Work out which file a reference points at and confine it to `root`.

    A namespaced address is looked up in `context.namespaces`; when the
    namespace maps to a directory the address is joined onto it, otherwise the
    mapped path itself is the target. A plain address is taken relative to the
    directory of `source_path` (or to `root` when `source_path` is not a
    file). An address-less reference targets the current document.

    Raises:
        ReferenceResolutionError: for an unknown namespace, a pathless
            reference without a current document, or a target outside `root`.
    """
    namespace = address.namespace
    if namespace:
        if namespace not in context.namespaces:
            raise ReferenceResolutionError(f"Unknown namespace `{address.namespace}`")
        mapped = _path_from_namespace(context.namespaces[namespace], root)
        if mapped.is_dir():
            candidate = mapped / address.address
        else:
            candidate = mapped
    elif address.address:
        anchor = source_path.parent if source_path.is_file() else root
        relative = Path(address.address)
        if relative.is_absolute():
            candidate = relative
        else:
            candidate = anchor / relative
    elif context.current_path:
        candidate = context.current_path
    else:
        raise ReferenceResolutionError("Pathless references require a current document")

    resolved = candidate.resolve()
    # `relative_to` raises ValueError when `resolved` does not sit under `root`.
    try:
        resolved.relative_to(root)
    except ValueError as exc:
        raise ReferenceResolutionError(f"Reference escapes root: {address.raw}") from exc
    return resolved
def _path_from_namespace(raw_path: str, root: Path) -> Path:
path = Path(raw_path)
if not path.is_absolute():
path = root / path
return path.resolve()
def _units_from_selector(
    document: Document,
    selector: str,
    target_path: Path,
) -> list[ContentUnit]:
    """Run a query selector against the document and wrap each match as a unit.

    Raises:
        ReferenceResolutionError: if the query layer rejects the selector.
    """
    try:
        matches = query_document(document, selector)
    except InvalidQueryError as exc:
        raise ReferenceResolutionError(str(exc)) from exc

    units: list[ContentUnit] = []
    for match in matches:
        units.append(_unit_from_query_match(match, target_path))
    return units
def _units_from_fragment(
    document: Document,
    fragment: str,
    target_path: Path,
    markdown: str,
) -> list[ContentUnit]:
    """Resolve a `#fragment` into the content units it addresses.

    A fragment is either a bare id (`intro`) or `kind:value`, where kind is
    one of `document`, `id`, `section`, `heading`, `block`, `region`, `fence`,
    `tag` or `line`. An id lookup tries sections, regions, fenced blocks and
    headings in that order and returns the matches of the first group that
    has any. When the text before the first `:` is not a known kind, the whole
    fragment is tried as an id, because explicit heading ids may themselves
    contain `:` (for example `{#sec:intro}`).

    Returns:
        The matching units; empty when a known kind matches nothing.

    Raises:
        ReferenceResolutionError: if the kind is unknown and the fragment does
            not match any unit when read as a plain id. The unit collectors
            called from here may raise it as well.
    """
    kind, _, value = fragment.partition(":")
    if not value:
        # Bare fragment (or nothing after the colon): treat the text as an id.
        kind, value = "id", kind
    lookup = _slug(value)

    def matches_key(unit: ContentUnit, key: str) -> bool:
        return unit.unit_id == key or _slug(unit.name or "") == key

    def units_by_id(key: str) -> list[ContentUnit]:
        # Precedence: sections, regions, fenced blocks, then bare headings.
        for units in [
            _section_units(document, target_path),
            _region_units(markdown, target_path),
            _fenced_block_units(markdown, target_path),
            _heading_units(document, target_path),
        ]:
            matches = [unit for unit in units if matches_key(unit, key)]
            if matches:
                return matches
        return []

    if kind == "document":
        return [_document_unit(document, target_path, markdown)]
    if kind == "id":
        return units_by_id(lookup)
    if kind == "section":
        sections = _section_units(document, target_path)
        return [unit for unit in sections if matches_key(unit, lookup)]
    if kind == "heading":
        headings = _heading_units(document, target_path)
        return [unit for unit in headings if matches_key(unit, lookup)]
    if kind == "block":
        return _block_fragment_units(document, target_path, value)
    if kind == "region":
        return [unit for unit in _region_units(markdown, target_path) if unit.unit_id == lookup]
    if kind == "fence":
        return [unit for unit in _fenced_block_units(markdown, target_path) if unit.unit_id == lookup]
    if kind == "tag":
        return [
            unit
            for unit in _region_units(markdown, target_path) + _fenced_block_units(markdown, target_path)
            if lookup in {_slug(tag) for tag in unit.metadata.get("tags", [])}
        ]
    if kind == "line":
        return _line_range_units(markdown, target_path, value)

    # Unknown prefix: the colon may be part of an id rather than a kind
    # separator, so give the whole fragment one chance as an id before failing.
    matches = units_by_id(_slug(fragment))
    if matches:
        return matches
    raise ReferenceResolutionError(f"Unsupported reference fragment kind `{kind}`")
def _document_unit(document: Document, target_path: Path, markdown: str) -> ContentUnit:
    """Wrap the whole Markdown file as a single `document` unit.

    The id comes from the frontmatter `id` and the name from the frontmatter
    `title`; each falls back to the file stem.
    """
    frontmatter = document.frontmatter
    stem = target_path.stem
    line_count = len(markdown.splitlines())
    return _content_unit(
        kind="document",
        unit_id=_slug(str(frontmatter.get("id") or stem)),
        text=markdown,
        source_path=target_path,
        span=SourceSpan(1, line_count),
        name=str(frontmatter.get("title") or stem),
        metadata={"frontmatter": frontmatter},
    )
def _unit_from_query_match(match: QueryMatch, target_path: Path) -> ContentUnit:
    """Convert one query match into a content unit.

    The id is a slug of the query path with `$.` and brackets flattened. The
    name is the first line of the matched text with leading `#`/space
    characters removed, or the match kind when there is no text.
    """
    flattened_path = match.path.replace("$.", "").replace("[", "-").replace("]", "")
    if match.text:
        first_line = match.text.splitlines()[0]
        name = first_line.lstrip("# ").strip()
    else:
        name = match.kind
    # Only a missing text (None) falls back to the stringified value.
    text = str(match.value) if match.text is None else match.text
    return _content_unit(
        kind=match.kind,
        unit_id=_slug(flattened_path),
        text=text,
        source_path=target_path,
        span=SourceSpan(match.line, None),
        name=name,
        metadata={"query_path": match.path, "value": match.value},
    )
def _section_units(document: Document, target_path: Path) -> list[ContentUnit]:
    """Build one unit per document section, de-duplicating ids in document order."""
    seen_ids: dict[str, int] = {}
    units: list[ContentUnit] = []
    for section in document.sections:
        units.append(_section_unit(section, target_path, seen_ids))
    return units
def _section_unit(
    section: Section,
    target_path: Path,
    used_ids: dict[str, int],
) -> ContentUnit:
    """Build the unit for one section: its heading line plus its non-empty blocks.

    The id is the explicit heading id when present, otherwise a slug of the
    title, de-duplicated through `used_ids`. The span ends at the last
    block's end line, or at the heading line for a section without blocks.
    """
    heading = section.heading
    title, explicit_id = _heading_title_and_id(heading)
    unit_id = _dedupe_id(_slug(explicit_id or title), used_ids)
    last_line = section.blocks[-1].line_end if section.blocks else heading.line

    # Heading and block texts are separated by a single blank line.
    heading_line = f"{'#' * heading.level} {heading.text}"
    block_texts = [block.text for block in section.blocks if block.text]
    text = "\n\n".join([heading_line, *block_texts]).strip()

    return _content_unit(
        kind="section",
        unit_id=unit_id,
        text=text,
        source_path=target_path,
        span=SourceSpan(heading.line, last_line),
        name=title,
        metadata={"heading_level": heading.level},
    )
def _heading_units(document: Document, target_path: Path) -> list[ContentUnit]:
    """Build one single-line unit per heading, de-duplicating ids in document order."""
    seen_ids: dict[str, int] = {}

    def as_unit(heading: Heading) -> ContentUnit:
        title, explicit_id = _heading_title_and_id(heading)
        return _content_unit(
            kind="heading",
            unit_id=_dedupe_id(_slug(explicit_id or title), seen_ids),
            text=f"{'#' * heading.level} {heading.text}",
            source_path=target_path,
            span=SourceSpan(heading.line, heading.line),
            name=title,
            metadata={"heading_level": heading.level},
        )

    return [as_unit(heading) for heading in document.headings]
def _block_fragment_units(
    document: Document,
    target_path: Path,
    value: str,
) -> list[ContentUnit]:
    """Select block units by zero-based position or by unit id.

    A value made only of ASCII digits is an index into the document's block
    list; an out-of-range index matches nothing. Any other value is slugged
    and compared with the block unit ids.
    """
    blocks = _block_units(document.blocks, target_path)
    # `str.isdigit()` alone also accepts characters such as "²" that `int()`
    # rejects with ValueError, so restrict the index form to ASCII digits.
    if value.isascii() and value.isdigit():
        index = int(value)
        return [blocks[index]] if 0 <= index < len(blocks) else []
    lookup = _slug(value)
    return [unit for unit in blocks if unit.unit_id == lookup]
def _block_units(blocks: list[ContentBlock], target_path: Path) -> list[ContentUnit]:
    """Build one unit per content block.

    Ids have the form `<type>-<line_start>`, using the block's position
    instead when `line_start` is falsy, and are de-duplicated in order.
    """
    seen_ids: dict[str, int] = {}
    return [
        _content_unit(
            kind=block.type,
            unit_id=_dedupe_id(_slug(f"{block.type}-{block.line_start or position}"), seen_ids),
            text=block.text,
            source_path=target_path,
            span=SourceSpan(block.line_start, block.line_end),
            name=block.type,
            metadata={"block_index": position},
        )
        for position, block in enumerate(blocks)
    ]
def _region_units(markdown: str, target_path: Path) -> list[ContentUnit]:
    """Collect regions delimited by `mkt:region` comment markers as units.

    Every line is searched for an open and a close marker. A unit's text is
    the lines strictly between the two marker lines, while its span runs from
    the open marker line to the close marker line. Regions cannot nest, and a
    close marker on a line where no region is open yet (including the line
    that carries its own open marker) is rejected as unmatched.

    Raises:
        ReferenceResolutionError: on nested, unmatched or id-less markers.
    """
    source_lines = markdown.splitlines()
    regions: list[ContentUnit] = []
    pending: tuple[int, str, list[str]] | None = None  # (open line, id, tags)

    for line_no, source_line in enumerate(source_lines, start=1):
        opener = _REGION_OPEN_RE.search(source_line)
        closer = _REGION_CLOSE_RE.search(source_line)
        if opener and pending is not None:
            raise ReferenceResolutionError("Nested mkt:region blocks are not supported")
        if closer:
            if pending is None:
                raise ReferenceResolutionError("Region close marker has no matching open marker")
            opened_at, region_id, tags = pending
            pending = None
            # `opened_at` is 1-based, so used as a 0-based index it is the line
            # right after the open marker; the slice stops before the close line.
            body = "\n".join(source_lines[opened_at:line_no - 1]).strip()
            regions.append(
                _content_unit(
                    kind="region",
                    unit_id=_slug(region_id),
                    text=body,
                    source_path=target_path,
                    span=SourceSpan(opened_at, line_no),
                    name=region_id,
                    metadata={"tags": tags},
                )
            )
        elif opener:
            attrs = _parse_attrs(opener.group("attrs"))
            region_id = attrs.get("id")
            if not region_id:
                raise ReferenceResolutionError("Region marker requires an id attribute")
            pending = (line_no, region_id, _tags_from_attrs(attrs))

    if pending is not None:
        raise ReferenceResolutionError("Region open marker has no matching close marker")
    return regions
def _fenced_block_units(markdown: str, target_path: Path) -> list[ContentUnit]:
    """Collect fenced code blocks that declare an id as content units.

    Fences whose info string yields no `id` are skipped. Metadata carries the
    language, the tags and any remaining attributes. `block_index` is the
    fence's position in the parser's token list, not a count of fences.
    """
    tokens = MarkdownIt("commonmark", {"tables": True}).enable("table").parse(markdown)
    seen_ids: dict[str, int] = {}
    fenced: list[ContentUnit] = []
    reserved_keys = {"id", "language", "tag", "tags"}

    for position, token in enumerate(tokens):
        if token.type != "fence":
            continue
        attrs = _parse_fence_info(token.info)
        declared_id = attrs.get("id")
        if not declared_id:
            continue

        if token.map:
            # NOTE(review): assumes `token.map` is a 0-based [start, end) line pair — confirm.
            first_line, last_line = token.map[0] + 1, token.map[1]
        else:
            first_line = last_line = None

        extra_attrs = {key: value for key, value in attrs.items() if key not in reserved_keys}
        fenced.append(
            _content_unit(
                kind="fenced_block",
                unit_id=_dedupe_id(_slug(declared_id), seen_ids),
                text=token.content,
                source_path=target_path,
                span=SourceSpan(first_line, last_line),
                name=declared_id,
                metadata={
                    "language": attrs.get("language"),
                    "tags": _tags_from_attrs(attrs),
                    "attrs": extra_attrs,
                    "block_index": position,
                },
            )
        )
    return fenced
def _line_range_units(markdown: str, target_path: Path, value: str) -> list[ContentUnit]:
    """Return the unit for a 1-based line range given as `start` or `start-end`.

    A range that starts below 1, ends before it starts, or runs past the last
    line matches nothing.

    Raises:
        ReferenceResolutionError: if `value` is not `start` or `start-end`.
    """
    parsed = re.match(r"^(?P<start>\d+)(?:-(?P<end>\d+))?$", value)
    if parsed is None:
        raise ReferenceResolutionError("Line fragments must use `line:start` or `line:start-end`")

    first = int(parsed.group("start"))
    # A single number selects just that one line.
    last = int(parsed.group("end")) if parsed.group("end") else first

    all_lines = markdown.splitlines()
    if not (1 <= first <= last <= len(all_lines)):
        return []

    selected = "\n".join(all_lines[first - 1:last])
    unit = _content_unit(
        kind="line_range",
        unit_id=f"line-{first}-{last}",
        text=selected,
        source_path=target_path,
        span=SourceSpan(first, last),
        name=f"lines {first}-{last}",
        metadata={},
    )
    return [unit]
def _parse_fence_info(info: str) -> dict[str, str]:
    """Parse a fence info string into an attribute dict.

    When the info string does not fit the `language {attrs}` pattern, the
    whole stripped string is returned as the language (or an empty dict for a
    blank string). Otherwise the brace attributes are parsed and the leading
    language token, if any, is stored under `language`, overriding a
    `language` attribute of the same name.
    """
    stripped = info.strip()
    parsed = _FENCE_ATTRS_RE.match(stripped)
    if parsed is None:
        return {"language": stripped} if stripped else {}

    attrs = _parse_attrs(parsed.group("attrs") or "")
    if parsed.group("language"):
        attrs["language"] = parsed.group("language")

    if attrs and "id" not in attrs:
        # Promote the first key that still starts with `#` to the id.
        hash_key = next((key for key in attrs if key.startswith("#")), None)
        if hash_key is not None:
            attrs["id"] = hash_key[1:]
            del attrs[hash_key]
    return attrs
def _parse_attrs(raw: str) -> dict[str, str]:
attrs: dict[str, str] = {}
for part in shlex.split(raw):
if part.startswith("#") and len(part) > 1:
attrs["id"] = part[1:]
continue
if "=" not in part:
attrs[part] = "true"
continue
key, value = part.split("=", 1)
attrs[key.strip()] = value.strip()
return attrs
def _tags_from_attrs(attrs: dict[str, str]) -> list[str]:
raw = attrs.get("tags") or attrs.get("tag") or ""
return [tag.strip() for tag in re.split(r"[, ]+", raw) if tag.strip()]
def _content_unit(
    *,
    kind: str,
    unit_id: str,
    text: str,
    source_path: Path,
    span: SourceSpan | None,
    name: str | None,
    metadata: dict[str, Any] | None = None,
) -> ContentUnit:
    """Build a `ContentUnit`, stringifying the path and hashing the text.

    The content hash is `sha256:` followed by the hex digest of the UTF-8
    encoded text. Missing or empty metadata becomes a fresh empty dict.
    """
    digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
    return ContentUnit(
        kind=kind,
        unit_id=unit_id,
        text=text,
        source_path=str(source_path),
        span=span,
        name=name,
        content_hash=f"sha256:{digest}",
        metadata=metadata if metadata else {},
    )
def _heading_title_and_id(heading: Heading) -> tuple[str, str | None]:
    """Split heading text into its title and optional trailing `{#id}`.

    Returns the stripped title and the explicit id, or None for the id when
    the heading declares none.
    """
    stripped = heading.text.strip()
    parsed = _HEADING_ID_RE.match(stripped)
    if parsed is None:
        return stripped, None
    return parsed.group("title").strip(), parsed.group("id")
def _dedupe_id(unit_id: str, used_ids: dict[str, int]) -> str:
count = used_ids.get(unit_id, 0) + 1
used_ids[unit_id] = count
return unit_id if count == 1 else f"{unit_id}-{count}"
def _slug(value: str) -> str:
slug = re.sub(r"[^a-z0-9_.:-]+", "-", value.strip().lower())
slug = re.sub(r"-+", "-", slug).strip("-")
return slug or "unit"