Deterministic ops layer and cli

This commit is contained in:
2026-05-04 00:23:04 +02:00
parent 6f0facd744
commit 274a7fcdd6
7 changed files with 778 additions and 2 deletions

View File

@@ -21,6 +21,15 @@ from markitect_tool.contract import (
validate_contract_file,
)
from markitect_tool.diagnostics import Diagnostic, SourceLocation
from markitect_tool.ops import (
ComposeResult,
IncludeError,
IncludeResult,
TransformResult,
compose_files,
resolve_includes,
transform_markdown,
)
from markitect_tool.query import (
InvalidQueryError,
QueryMatch,
@@ -61,6 +70,13 @@ __all__ = [
"validate_contract_file",
"Diagnostic",
"SourceLocation",
"ComposeResult",
"IncludeError",
"IncludeResult",
"TransformResult",
"compose_files",
"resolve_includes",
"transform_markdown",
"InvalidQueryError",
"QueryMatch",
"extract_document",

View File

@@ -16,6 +16,7 @@ from markitect_tool.contract import (
load_contract_file,
validate_contract,
)
from markitect_tool.ops import IncludeError, compose_files, resolve_includes, transform_markdown
from markitect_tool.query import InvalidQueryError, extract_document, query_document
from markitect_tool.schema import load_schema_file, validate_markdown_file, validate_schema
@@ -120,6 +121,160 @@ def extract(file: Path, selector: str, output_format: str) -> None:
_emit_extract(data, output_format)
@main.command()
@click.argument("file", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.option("--strip-frontmatter", is_flag=True, help="Remove YAML frontmatter.")
@click.option(
    "--set",
    "set_values",
    multiple=True,
    metavar="KEY=VALUE",
    help="Set a frontmatter value. Dot paths create nested mappings.",
)
@click.option(
    "--heading-delta",
    type=int,
    default=0,
    show_default=True,
    help="Shift ATX heading levels, clamped to 1..6.",
)
@click.option("--extract", "extract_selector", help="Replace content with selector output.")
@click.option(
    "--output",
    type=click.Path(dir_okay=False, path_type=Path),
    help="Write transformed Markdown to a file.",
)
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["markdown", "json", "yaml"], case_sensitive=False),
    default="markdown",
    show_default=True,
)
def transform(
    file: Path,
    strip_frontmatter: bool,
    set_values: tuple[str, ...],
    heading_delta: int,
    extract_selector: str | None,
    output: Path | None,
    output_format: str,
) -> None:
    """Apply deterministic transforms to a Markdown file."""
    # NOTE: the docstring above is what click prints for `--help`, so the
    # maintainer-facing notes live in comments instead.
    #
    # Flow: parse the repeated --set options into a nested mapping, run
    # transform_markdown() on the file's text, then hand the result to
    # _emit_markdown_result() for printing or writing.
    try:
        frontmatter_updates = _parse_key_value_options(set_values)
        result = transform_markdown(
            file.read_text(encoding="utf-8"),
            strip_frontmatter=strip_frontmatter,
            set_frontmatter=frontmatter_updates,
            heading_delta=heading_delta,
            extract_selector=extract_selector,
            source_path=str(file),
        )
    except (InvalidQueryError, ValueError, yaml.YAMLError) as exc:
        # yaml.YAMLError is not a ValueError: without it, malformed YAML
        # (in the file's frontmatter or in a --set value) would surface as a
        # raw traceback instead of a clean CLI error.
        raise click.ClickException(str(exc)) from exc
    _emit_markdown_result(result.to_dict(), output_format, output)
@main.command()
@click.argument(
    "files",
    nargs=-1,
    required=True,
    type=click.Path(exists=True, dir_okay=False, path_type=Path),
)
@click.option("--title", help="Add a top-level title before composed files.")
@click.option(
    "--heading-delta",
    type=int,
    default=0,
    show_default=True,
    help="Shift heading levels in each input before composing.",
)
@click.option(
    "--include-frontmatter",
    is_flag=True,
    help="Keep each input file's frontmatter in the composed body.",
)
@click.option(
    "--output",
    type=click.Path(dir_okay=False, path_type=Path),
    help="Write composed Markdown to a file.",
)
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["markdown", "json", "yaml"], case_sensitive=False),
    default="markdown",
    show_default=True,
)
def compose(
    files: tuple[Path, ...],
    title: str | None,
    heading_delta: int,
    include_frontmatter: bool,
    output: Path | None,
    output_format: str,
) -> None:
    """Compose multiple Markdown files into one document."""
    # NOTE: the docstring above is what click prints for `--help`, so the
    # maintainer-facing notes live in comments instead.
    try:
        result = compose_files(
            list(files),
            title=title,
            heading_delta=heading_delta,
            include_frontmatter=include_frontmatter,
        )
    except (ValueError, yaml.YAMLError) as exc:
        # Report bad input as a CLI error, the way `transform` does, rather
        # than a traceback: ValueError covers undecodable (non-UTF-8) files,
        # yaml.YAMLError covers malformed frontmatter in any input file.
        raise click.ClickException(str(exc)) from exc
    _emit_markdown_result(result.to_dict(), output_format, output)
@main.command()
@click.argument("file", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.option(
    "--base-dir",
    type=click.Path(exists=True, file_okay=False, path_type=Path),
    help="Directory includes must stay within. Defaults to the input file directory.",
)
@click.option(
    "--max-depth",
    type=int,
    default=10,
    show_default=True,
    help="Maximum recursive include depth.",
)
@click.option(
    "--output",
    type=click.Path(dir_okay=False, path_type=Path),
    help="Write resolved Markdown to a file.",
)
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["markdown", "json", "yaml"], case_sensitive=False),
    default="markdown",
    show_default=True,
)
def include(
    file: Path,
    base_dir: Path | None,
    max_depth: int,
    output: Path | None,
    output_format: str,
) -> None:
    """Resolve Markdown include markers in a document."""
    # NOTE: the docstring above is what click prints for `--help`, so the
    # maintainer-facing notes live in comments instead.
    try:
        result = resolve_includes(
            file.read_text(encoding="utf-8"),
            # Without --base-dir, includes are confined to the input file's directory.
            base_dir=base_dir or file.parent,
            current_path=file,
            max_depth=max_depth,
        )
    except (IncludeError, InvalidQueryError, ValueError, yaml.YAMLError) as exc:
        # IncludeError alone is not enough:
        #   * InvalidQueryError - a `selector=` attribute on an include marker
        #     goes through the same extraction call that `transform` guards;
        #   * ValueError - undecodable input or a malformed attribute value
        #     (IncludeError is itself a ValueError; it is listed for clarity);
        #   * yaml.YAMLError - malformed frontmatter in an included file.
        raise click.ClickException(str(exc)) from exc
    _emit_markdown_result(result.to_dict(), output_format, output)
@main.command()
@click.argument("file", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.option(
@@ -292,5 +447,42 @@ def _emit_extract(data: dict, output_format: str) -> None:
click.echo("\n\n".join(data["items"]))
def _emit_markdown_result(data: dict, output_format: str, output: Path | None) -> None:
if output_format == "json":
click.echo(json.dumps(data, indent=2, ensure_ascii=False))
return
if output_format == "yaml":
click.echo(yaml.safe_dump(data, sort_keys=False))
return
markdown = data["markdown"]
if output:
output.write_text(markdown, encoding="utf-8")
else:
click.echo(markdown, nl=False)
def _parse_key_value_options(items: tuple[str, ...]) -> dict[str, object]:
    """Turn repeated ``KEY=VALUE`` options into a nested mapping.

    The key is split on ``.`` to build nested mappings, and the value is
    parsed as YAML, so ``a.b=1`` yields ``{"a": {"b": 1}}``.

    Args:
        items: Raw option strings, each expected to look like ``KEY=VALUE``.

    Returns:
        The mapping built from all items, applied in order.

    Raises:
        ValueError: If an item has no ``=``, has an empty key, carries a value
            that is not valid YAML, or addresses a path through a non-mapping
            value set by an earlier item.
    """
    values: dict[str, object] = {}
    for item in items:
        # Split on the first "=" only, so values may themselves contain "=".
        key, separator, raw_value = item.partition("=")
        if not separator:
            raise ValueError(f"Expected KEY=VALUE, got `{item}`")
        key = key.strip()
        if not key:
            raise ValueError(f"Expected non-empty key in `{item}`")
        try:
            value = yaml.safe_load(raw_value)
        except yaml.YAMLError as exc:
            # yaml.YAMLError is not a ValueError; convert it so callers that
            # handle ValueError report the bad value instead of crashing.
            raise ValueError(f"Invalid YAML value in `{item}`: {exc}") from exc
        _set_path(values, key.split("."), value)
    return values
def _set_path(mapping: dict[str, object], path: list[str], value: object) -> None:
    """Store ``value`` in ``mapping`` at the nested location named by ``path``.

    Intermediate mappings are created as needed. ``mapping`` is modified in
    place, including any intermediates created before a failure.

    Raises:
        ValueError: If a non-final path element already holds a non-mapping value.
    """
    parents, leaf = path[:-1], path[-1]
    node = mapping
    for segment in parents:
        child = node.setdefault(segment, {})
        if isinstance(child, dict):
            node = child
            continue
        raise ValueError(f"Cannot set nested frontmatter path through scalar `{segment}`")
    node[leaf] = value
# Run the CLI when this module is executed directly as a script.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,21 @@
"""Deterministic Markdown document operations."""
from markitect_tool.ops.engine import (
ComposeResult,
IncludeError,
IncludeResult,
TransformResult,
compose_files,
resolve_includes,
transform_markdown,
)
# Names exported by ``from markitect_tool.ops import *``; each one is
# imported above from ``markitect_tool.ops.engine``.
__all__ = [
    "ComposeResult",
    "IncludeError",
    "IncludeResult",
    "TransformResult",
    "compose_files",
    "resolve_includes",
    "transform_markdown",
]

View File

@@ -0,0 +1,300 @@
"""Deterministic transform, compose, and include operations."""
from __future__ import annotations
import re
import shlex
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any
import yaml
from markitect_tool.core import parse_markdown
from markitect_tool.query import extract_document
class IncludeError(ValueError):
    """Signal that an include marker could not be resolved.

    Derives from ``ValueError``, so code that already handles ``ValueError``
    also handles include failures.
    """
@dataclass(frozen=True)
class TransformResult:
    """Outcome of transforming a single Markdown document."""

    # The transformed Markdown text.
    markdown: str
    # Labels of the operations that were applied, in the order they ran.
    operations: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a plain dictionary, e.g. for JSON/YAML output."""
        payload: dict[str, Any] = asdict(self)
        return payload
@dataclass(frozen=True)
class ComposeResult:
    """Outcome of composing several Markdown sources into one document."""

    # The composed Markdown text.
    markdown: str
    # Paths of the inputs that contributed content, as strings.
    sources: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a plain dictionary, e.g. for JSON/YAML output."""
        payload: dict[str, Any] = asdict(self)
        return payload
@dataclass(frozen=True)
class IncludeResult:
    """Outcome of resolving include markers in a Markdown document."""

    # The Markdown text after include markers were replaced.
    markdown: str
    # Paths of the files that were included, as strings.
    included_paths: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a plain dictionary, e.g. for JSON/YAML output."""
        payload: dict[str, Any] = asdict(self)
        return payload
# HTML-comment include marker, e.g. ``<!-- mkt:include path="file.md" -->``.
# The ``attrs`` group captures the attribute text between ``mkt:include`` and
# ``-->``; DOTALL lets that text span several lines.
_COMMENT_INCLUDE_RE = re.compile(r"<!--\s*mkt:include\s+(?P<attrs>.*?)\s*-->", re.DOTALL)
# Brace shorthand, e.g. ``{{include:file.md}}``. The ``path`` group captures
# everything after ``include:`` up to the closing braces.
_BRACE_INCLUDE_RE = re.compile(r"\{\{\s*include:(?P<path>[^}]+?)\s*\}\}")
# ATX heading at the start of a line: group 1 is the run of one to six ``#``
# marks, group 2 is the whitespace and heading text that follow.
_HEADING_RE = re.compile(r"^(#{1,6})(\s+.+)$", re.MULTILINE)
def transform_markdown(
    markdown: str,
    *,
    strip_frontmatter: bool = False,
    set_frontmatter: dict[str, Any] | None = None,
    heading_delta: int = 0,
    extract_selector: str | None = None,
    source_path: str | None = None,
) -> TransformResult:
    """Run the requested operations over a single Markdown document.

    Operations run in a fixed order, and each one that runs is recorded in the
    result's ``operations`` list:

    1. ``set_frontmatter``: deep-merge the given values into the frontmatter.
    2. ``heading_delta``: shift the heading levels of the body.
    3. ``extract_selector``: replace the body with the selector's matches,
       joined by blank lines, and discard the frontmatter.
    4. ``strip_frontmatter``: discard the frontmatter.

    Args:
        markdown: Source document text.
        strip_frontmatter: Drop the frontmatter from the output.
        set_frontmatter: Values to merge into the frontmatter.
        heading_delta: Amount to shift heading levels by; ``0`` disables it.
        extract_selector: Selector passed to ``extract_document``.
        source_path: Passed to ``parse_markdown`` when extracting.

    Returns:
        The transformed text together with the list of applied operations.
    """
    frontmatter, body = _split_frontmatter(markdown)
    applied: list[str] = []

    if set_frontmatter:
        frontmatter = _deep_merge(frontmatter, set_frontmatter)
        applied.append("set_frontmatter")

    if heading_delta:
        body = shift_heading_levels(body, heading_delta)
        applied.append(f"shift_headings:{heading_delta}")

    if extract_selector:
        # The selector runs against the document as transformed so far.
        if frontmatter:
            source_text = _join_frontmatter(frontmatter, body)
        else:
            source_text = body
        parsed = parse_markdown(source_text, source_path=source_path)
        fragments = extract_document(parsed, extract_selector)
        body = "\n\n".join(fragments)
        frontmatter = {}
        applied.append(f"extract:{extract_selector}")

    if strip_frontmatter:
        frontmatter = {}
        applied.append("strip_frontmatter")

    rendered = _join_frontmatter(frontmatter, body)
    return TransformResult(markdown=rendered, operations=applied)
# Opening or closing marker of a fenced code block: up to three spaces of
# indentation followed by a run of at least three backticks or tildes.
# Group 1 is the marker run, group 2 the rest of the line (the info string).
_CODE_FENCE_RE = re.compile(r"^ {0,3}(`{3,}|~{3,})(.*)$")


def _fenced_line_starts(markdown: str) -> set[int]:
    """Return the offsets at which lines belonging to fenced code blocks begin."""
    starts: set[int] = set()
    fence_char = ""
    fence_len = 0  # 0 means "currently outside any fence"
    offset = 0
    for line in markdown.split("\n"):
        match = _CODE_FENCE_RE.match(line.rstrip("\r"))
        if fence_len:
            starts.add(offset)
            # A fence closes on a run of the same character that is at least
            # as long as the opening run and carries no info string.
            if (
                match
                and match.group(1)[0] == fence_char
                and len(match.group(1)) >= fence_len
                and not match.group(2).strip()
            ):
                fence_len = 0
        elif match and not (match.group(1)[0] == "`" and "`" in match.group(2)):
            # Backtick fences may not have backticks in their info string;
            # such a line is inline code, not a fence.
            fence_char = match.group(1)[0]
            fence_len = len(match.group(1))
            starts.add(offset)
        offset += len(line) + 1  # +1 for the "\n" removed by split()
    # An unclosed fence runs to the end of the document, so nothing to undo.
    return starts


def shift_heading_levels(markdown: str, delta: int) -> str:
    """Shift ATX heading levels by delta while clamping to levels 1 through 6.

    Lines inside fenced code blocks (backtick or tilde fences) are left
    untouched, so ``# comment`` lines in code samples are not mistaken for
    headings.

    Args:
        markdown: Markdown text to process.
        delta: Number of levels to add (positive) or remove (negative).

    Returns:
        The text with every heading outside code fences moved by ``delta``.
    """
    protected = _fenced_line_starts(markdown)

    def replace(match: re.Match[str]) -> str:
        # Heading matches always begin at a line start, so the offset lookup
        # is enough to tell whether the line sits inside a fence.
        if match.start() in protected:
            return match.group(0)
        marks = match.group(1)
        suffix = match.group(2)
        level = min(max(len(marks) + delta, 1), 6)
        return f"{'#' * level}{suffix}"

    return _HEADING_RE.sub(replace, markdown)
def compose_files(
    paths: list[str | Path],
    *,
    title: str | None = None,
    heading_delta: int = 0,
    include_frontmatter: bool = False,
    separator: str = "\n\n---\n\n",
) -> ComposeResult:
    """Join several Markdown files into a single Markdown document.

    Args:
        paths: Files to read, in output order (decoded as UTF-8).
        title: Optional text emitted first as a level-1 heading.
        heading_delta: Amount to shift each input's heading levels by.
        include_frontmatter: Keep each input's frontmatter in its section
            instead of dropping it.
        separator: Text placed between consecutive parts.

    Returns:
        The composed text, always ending in a single newline, and the paths of
        the inputs that contributed content. Inputs whose body is empty after
        stripping whitespace are skipped.
    """
    chunks: list[str] = [f"# {title.strip()}"] if title else []
    used: list[str] = []

    for entry in paths:
        source = Path(entry)
        frontmatter, content = _split_frontmatter(source.read_text(encoding="utf-8"))
        if frontmatter and include_frontmatter:
            content = _join_frontmatter(frontmatter, content)
        if heading_delta:
            content = shift_heading_levels(content, heading_delta)
        content = content.strip()
        if not content:
            continue
        chunks.append(content)
        used.append(str(source))

    composed = separator.join(chunks).strip() + "\n"
    return ComposeResult(markdown=composed, sources=used)
def resolve_includes(
    markdown: str,
    *,
    base_dir: str | Path,
    current_path: str | Path | None = None,
    max_depth: int = 10,
) -> IncludeResult:
    """Expand include markers in ``markdown``, following nested includes.

    Recognized markers:

    - ``<!-- mkt:include path="relative/file.md" -->``
    - ``<!-- mkt:include path="relative/file.md" selector="sections[heading=Intro]" heading_delta="1" -->``
    - ``{{include:relative/file.md}}``, a compact shorthand.

    Args:
        markdown: Text containing the markers.
        base_dir: Directory every included file must resolve inside.
        current_path: Path of the document ``markdown`` came from. When given,
            relative includes resolve against its directory and it takes part
            in circular-include detection; otherwise ``base_dir`` is used.
        max_depth: Maximum nesting depth of includes.

    Returns:
        The resolved text plus the included files' paths, in the order they
        were included.
    """
    root = Path(base_dir).resolve()
    if current_path:
        origin = Path(current_path).resolve()
        start_dir = origin.parent
        stack = [origin]
    else:
        start_dir = root
        stack = []

    visited: list[Path] = []
    text = _resolve_include_text(
        markdown,
        root=root,
        current_dir=start_dir,
        stack=stack,
        included=visited,
        depth=0,
        max_depth=max_depth,
    )
    return IncludeResult(markdown=text, included_paths=[str(item) for item in visited])
def _resolve_include_text(
    markdown: str,
    *,
    root: Path,
    current_dir: Path,
    stack: list[Path],
    included: list[Path],
    depth: int,
    max_depth: int,
) -> str:
    """Replace every include marker in ``markdown`` with the included content.

    Comment-style markers are expanded first, then brace-style markers.

    Raises:
        IncludeError: If ``depth`` is greater than ``max_depth``, or if
            rendering any include fails.
    """
    if depth > max_depth:
        raise IncludeError(f"Include depth exceeded max_depth={max_depth}")

    def expand(attrs: dict[str, str]) -> str:
        # Both marker styles render identically once reduced to attributes.
        return _render_include(attrs, root, current_dir, stack, included, depth, max_depth)

    def replace_comment(match: re.Match[str]) -> str:
        return expand(_parse_include_attrs(match.group("attrs")))

    def replace_brace(match: re.Match[str]) -> str:
        return expand({"path": match.group("path").strip()})

    after_comments = _COMMENT_INCLUDE_RE.sub(replace_comment, markdown)
    return _BRACE_INCLUDE_RE.sub(replace_brace, after_comments)
def _render_include(
    attrs: dict[str, str],
    root: Path,
    current_dir: Path,
    stack: list[Path],
    included: list[Path],
    depth: int,
    max_depth: int,
) -> str:
    """Load one included file and return its fully resolved content.

    Args:
        attrs: Marker attributes. ``path`` is required; ``selector``,
            ``include_frontmatter`` and ``heading_delta`` are optional.
        root: Directory the included file must resolve inside.
        current_dir: Directory relative paths are resolved against.
        stack: Files currently being expanded, used to detect cycles.
        included: Accumulator that receives every included file's path.
        depth: Nesting depth of the marker being rendered.
        max_depth: Maximum allowed nesting depth.

    Returns:
        The included content, stripped of surrounding whitespace, with its own
        include markers resolved.

    Raises:
        IncludeError: If ``path`` is missing, escapes ``root``, creates a
            cycle, or is not an existing file, or if ``heading_delta`` is not
            an integer.
    """
    raw_path = attrs.get("path")
    if not raw_path:
        raise IncludeError("Include marker requires a path attribute")
    include_path = _resolve_safe_path(raw_path, root, current_dir)
    if include_path in stack:
        cycle = " -> ".join(str(path) for path in [*stack, include_path])
        raise IncludeError(f"Circular include detected: {cycle}")
    # is_file() is already False for a path that does not exist.
    if not include_path.is_file():
        raise IncludeError(f"Included file not found: {include_path}")

    text = include_path.read_text(encoding="utf-8")
    frontmatter, body = _split_frontmatter(text)
    selector = attrs.get("selector")
    if selector:
        # A selector replaces the body with its matches; frontmatter is not kept.
        document = parse_markdown(text, source_path=str(include_path))
        body = "\n\n".join(extract_document(document, selector))
    elif attrs.get("include_frontmatter", "").lower() in {"1", "true", "yes"}:
        body = _join_frontmatter(frontmatter, body)

    raw_delta = attrs.get("heading_delta", "0")
    try:
        heading_delta = int(raw_delta)
    except ValueError as exc:
        # Report a bad attribute as an include failure, not a bare ValueError,
        # so callers that handle IncludeError see it.
        raise IncludeError(
            f"Invalid heading_delta `{raw_delta}` for include `{raw_path}`: expected an integer"
        ) from exc
    if heading_delta:
        body = shift_heading_levels(body, heading_delta)

    included.append(include_path)
    return _resolve_include_text(
        body.strip(),
        root=root,
        # Nested relative includes resolve against the included file's folder.
        current_dir=include_path.parent,
        stack=stack + [include_path],
        included=included,
        depth=depth + 1,
        max_depth=max_depth,
    )
def _parse_include_attrs(raw: str) -> dict[str, str]:
    """Parse the ``key="value"`` attribute text of a comment include marker.

    The text is tokenized with shell-like quoting rules, so quoted values may
    contain spaces.

    Args:
        raw: Attribute text, e.g. ``path="a.md" heading_delta="1"``.

    Returns:
        Mapping of attribute names to values, both stripped of whitespace.

    Raises:
        IncludeError: If the text cannot be tokenized (for example an
            unbalanced quote) or a token has no ``=``.
    """
    try:
        parts = shlex.split(raw)
    except ValueError as exc:
        # shlex reports unbalanced quotes with a bare ValueError; surface it
        # as an include failure like the other marker problems.
        raise IncludeError(f"Malformed include attributes `{raw}`: {exc}") from exc

    attrs: dict[str, str] = {}
    for part in parts:
        if "=" not in part:
            raise IncludeError(f"Invalid include attribute `{part}`")
        # Split on the first "=" only; selector values contain "=" themselves.
        key, value = part.split("=", 1)
        attrs[key.strip()] = value.strip()
    return attrs
def _resolve_safe_path(raw_path: str, root: Path, current_dir: Path) -> Path:
    """Resolve an include path and ensure it stays inside ``root``.

    Args:
        raw_path: Path as written in the include marker.
        root: Directory the result must be located inside.
        current_dir: Directory that relative paths are resolved against.

    Returns:
        The resolved path.

    Raises:
        IncludeError: If the resolved path is not inside ``root``.
    """
    requested = Path(raw_path)
    anchored = requested if requested.is_absolute() else current_dir / requested
    target = anchored.resolve()
    try:
        # relative_to() raises ValueError when target is not under root.
        target.relative_to(root)
    except ValueError as exc:
        raise IncludeError(f"Included path escapes base directory: {raw_path}") from exc
    return target
# Closing frontmatter delimiter: a line holding only ``---`` (trailing blanks
# or a carriage return are tolerated).
_FRONTMATTER_CLOSE_RE = re.compile(r"^---[ \t\r]*$", re.MULTILINE)


def _split_frontmatter(markdown: str) -> tuple[dict[str, Any], str]:
    """Separate leading YAML frontmatter from the Markdown body.

    Frontmatter is recognized only when the text starts with a ``---`` line
    and a later line consists solely of ``---``. An empty block
    (``---`` immediately followed by ``---``) is valid and yields ``{}``.

    Args:
        markdown: Full document text.

    Returns:
        ``(frontmatter, body)``. When there is no frontmatter, no closing
        delimiter, or the YAML is not a mapping, the result is
        ``({}, markdown)`` with the text unchanged.

    Raises:
        yaml.YAMLError: If the frontmatter block is not valid YAML.
    """
    if not markdown.startswith("---\n"):
        return {}, markdown
    # Start right after the opening line so an immediately following closing
    # delimiter (empty frontmatter) is found as well.
    closing = _FRONTMATTER_CLOSE_RE.search(markdown, 4)
    if closing is None:
        return {}, markdown
    raw_frontmatter = markdown[4 : closing.start()]
    data = yaml.safe_load(raw_frontmatter) if raw_frontmatter.strip() else {}
    if data is None:
        data = {}
    if not isinstance(data, dict):
        # Not a mapping (e.g. a list or scalar): treat the text as body only.
        return {}, markdown
    body_start = closing.end()
    if markdown.startswith("\n", body_start):
        body_start += 1  # drop the newline that ends the delimiter line
    return data, markdown[body_start:]
def _join_frontmatter(frontmatter: dict[str, Any], body: str) -> str:
    """Render ``frontmatter`` as a YAML block in front of ``body``.

    Args:
        frontmatter: Mapping to serialize; key order is preserved.
        body: Markdown body; leading newlines are removed.

    Returns:
        ``body`` alone (minus leading newlines) when ``frontmatter`` is empty,
        otherwise the ``---``-delimited YAML block, a blank line, then the body.
    """
    body = body.lstrip("\n")
    if not frontmatter:
        return body
    # allow_unicode keeps non-ASCII values (accents, CJK, ...) readable rather
    # than rewriting them as escape sequences.
    rendered = yaml.safe_dump(frontmatter, sort_keys=False, allow_unicode=True).strip()
    return f"---\n{rendered}\n---\n\n{body}"
def _deep_merge(left: dict[str, Any], right: dict[str, Any]) -> dict[str, Any]:
    """Return a new mapping with ``right`` merged over ``left``.

    Where both sides hold a mapping under the same key, the two are merged
    recursively; in every other case the value from ``right`` wins. Neither
    input is modified.
    """
    result = dict(left)
    for name, incoming in right.items():
        existing = result.get(name)
        both_mappings = isinstance(existing, dict) and isinstance(incoming, dict)
        result[name] = _deep_merge(existing, incoming) if both_mappings else incoming
    return result