extension for ref resolve, explode, implode, weave, tangle

This commit is contained in:
2026-05-04 02:25:49 +02:00
parent 8203f50fd5
commit 65bfc1aebf
39 changed files with 3959 additions and 25 deletions

View File

@@ -0,0 +1,25 @@
"""Namespaced content reference resolution for Markdown artifacts."""
from markitect_tool.reference.engine import (
ContentUnit,
ReferenceAddress,
ReferenceContext,
ReferenceResolution,
ReferenceResolutionError,
SourceSpan,
load_namespaces,
parse_reference,
resolve_reference,
)
# Public API of the package: exactly the names imported from
# ``markitect_tool.reference.engine`` above, listed so that
# ``from <package> import *`` exposes them and nothing else.
__all__ = [
"ContentUnit",
"ReferenceAddress",
"ReferenceContext",
"ReferenceResolution",
"ReferenceResolutionError",
"SourceSpan",
"load_namespaces",
"parse_reference",
"resolve_reference",
]

View File

@@ -0,0 +1,626 @@
"""Reference parsing and resolution for Markdown content units."""
from __future__ import annotations
import hashlib
import re
import shlex
from dataclasses import asdict, dataclass, field
from pathlib import Path
from typing import Any
from markdown_it import MarkdownIt
from markitect_tool.core import ContentBlock, Document, Heading, Section, parse_markdown
from markitect_tool.query import InvalidQueryError, QueryMatch, query_document
class ReferenceResolutionError(ValueError):
    """Signal that a content reference could not be parsed or resolved.

    The class derives from :class:`ValueError`, so code that already catches
    ``ValueError`` also catches this error.
    """
@dataclass(frozen=True)
class ReferenceAddress:
    """Immutable, parsed form of a content reference string.

    The accepted syntax is compact and Markdown-friendly:

    - ``path/to/file.md``
    - ``std:clauses/payment.md``
    - ``std:clauses/payment.md#section:terms``
    - ``std:clauses/payment.md::sections[heading=Terms]``
    - ``#intro`` for a fragment in the current document
    """

    # The reference text this address was built from.
    raw: str
    # Namespace prefix (the part before ``:``), when one was given.
    namespace: str | None = None
    # Path-like part of the reference; empty for pathless references.
    address: str = ""
    # Text after ``#``, when present.
    fragment: str | None = None
    # Text after ``::``, when present.
    selector: str | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return the fields as a dict, leaving out ``None`` and ``""`` values."""
        compact: dict[str, Any] = {}
        for field_name, field_value in asdict(self).items():
            if field_value is None or field_value == "":
                continue
            compact[field_name] = field_value
        return compact
@dataclass(frozen=True)
class ReferenceContext:
    """Resolution inputs for namespaced and relative content references."""

    # Directory that resolved references are checked against.
    root: Path = Path(".")
    # Path of the document that contains the reference, when known.
    current_path: Path | None = None
    # Namespace name -> path string, as loaded from frontmatter.
    namespaces: dict[str, str] = field(default_factory=dict)

    @classmethod
    def from_document(
        cls,
        document: Document,
        *,
        root: str | Path = ".",
        current_path: str | Path | None = None,
    ) -> "ReferenceContext":
        """Create a context whose namespaces come from the document frontmatter.

        ``current_path`` wins over ``document.source_path`` when it is truthy;
        if neither is set the context has no current path.

        Raises:
            ReferenceResolutionError: propagated from ``load_namespaces`` when
                the frontmatter namespace mapping is invalid.
        """
        root_dir = Path(root)
        effective_path = current_path or document.source_path
        document_path = Path(effective_path) if effective_path else None
        namespace_map = load_namespaces(document.frontmatter)
        return cls(root=root_dir, current_path=document_path, namespaces=namespace_map)

    def to_dict(self) -> dict[str, Any]:
        """Return a dict view with paths as strings; ``None`` values are dropped."""
        current = str(self.current_path) if self.current_path else None
        ordered_items = (
            ("root", str(self.root)),
            ("current_path", current),
            ("namespaces", self.namespaces),
        )
        return {name: item for name, item in ordered_items if item is not None}
@dataclass(frozen=True)
class SourceSpan:
    """Start and end line of a resolved unit within its source file."""

    # Either bound may be ``None`` when it is not known.
    line_start: int | None = None
    line_end: int | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return only the bounds that are set (not ``None``)."""
        known_bounds = filter(lambda item: item[1] is not None, asdict(self).items())
        return dict(known_bounds)
@dataclass(frozen=True)
class ContentUnit:
    """A single addressable piece of content resolved from Markdown."""

    kind: str
    unit_id: str
    text: str
    source_path: str
    span: SourceSpan | None = None
    name: str | None = None
    content_hash: str = ""
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return a dict view of the unit.

        Entries whose value is ``None`` are left out; empty metadata is
        treated as ``None`` and therefore omitted as well.
        """
        span_data = self.span.to_dict() if self.span else None
        ordered_items = (
            ("kind", self.kind),
            ("unit_id", self.unit_id),
            ("name", self.name),
            ("source_path", self.source_path),
            ("span", span_data),
            ("content_hash", self.content_hash),
            ("metadata", self.metadata or None),
            ("text", self.text),
        )
        return {label: item for label, item in ordered_items if item is not None}
@dataclass(frozen=True)
class ReferenceResolution:
    """Outcome of resolving one reference: the edge from source to target plus its units."""

    reference: ReferenceAddress
    source_path: str
    target_path: str
    units: list[ContentUnit]

    def to_dict(self) -> dict[str, Any]:
        """Return a dict view, including a ``count`` of the resolved units."""
        payload: dict[str, Any] = {"reference": self.reference.to_dict()}
        payload["source_path"] = self.source_path
        payload["target_path"] = self.target_path
        payload["count"] = len(self.units)
        unit_dicts = []
        for unit in self.units:
            unit_dicts.append(unit.to_dict())
        payload["units"] = unit_dicts
        return payload
# Splits ``namespace:address``. A namespace starts with a letter and continues
# with letters, digits, ``_``, ``.`` or ``-``; everything after the colon that
# ends the namespace is the address.
_NAMESPACE_RE = re.compile(r"^(?P<namespace>[A-Za-z][A-Za-z0-9_.-]*):(?P<address>.*)$")
# Splits heading text into a title and an optional trailing ``{#id}`` marker.
_HEADING_ID_RE = re.compile(r"^(?P<title>.*?)(?:\s+\{#(?P<id>[A-Za-z0-9_.:-]+)\})?$")
# Opening region marker: ``<!-- mkt:region ATTRS -->``.
_REGION_OPEN_RE = re.compile(r"<!--\s*mkt:region\s+(?P<attrs>.*?)\s*-->")
# Closing region marker: ``<!-- /mkt:region -->``.
_REGION_CLOSE_RE = re.compile(r"<!--\s*/mkt:region\s*-->")
# Fence info string: an optional language followed by an optional ``{attrs}``
# group. Whitespace before ``{`` is optional so that an attribute-only info
# string such as ``{#example}`` is recognised as attributes; when whitespace
# was mandatory such a string could not match at all.
_FENCE_ATTRS_RE = re.compile(r"^(?P<language>[^\s{]+)?(?:\s*\{(?P<attrs>.*)\})?\s*$")
def parse_reference(reference: str) -> ReferenceAddress:
    """Split a compact Markitect reference into its components.

    The text is taken apart in a fixed order: the selector after the first
    ``::``, then the fragment after the first ``#`` of what remains, then an
    optional ``namespace:`` prefix of the remaining address.

    Raises:
        ReferenceResolutionError: if the reference is blank, or if a ``::`` or
            ``#`` separator is present but followed by nothing.
    """
    raw = reference.strip()
    if not raw:
        raise ReferenceResolutionError("Reference cannot be empty")

    head, selector_sep, selector_text = raw.partition("::")
    selector: str | None = None
    if selector_sep:
        selector = selector_text.strip()
        if not selector:
            raise ReferenceResolutionError(f"Reference selector is empty in `{reference}`")

    location, fragment_sep, fragment_text = head.partition("#")
    fragment: str | None = None
    if fragment_sep:
        fragment = fragment_text.strip()
        if not fragment:
            raise ReferenceResolutionError(f"Reference fragment is empty in `{reference}`")

    address = location.strip()
    namespace: str | None = None
    # The namespace group of the pattern only admits letters, digits, ``_``,
    # ``.`` and ``-``, so a match can never contain a path separator.
    prefixed = _NAMESPACE_RE.match(address)
    if prefixed is not None:
        namespace = prefixed.group("namespace")
        address = prefixed.group("address").strip()

    return ReferenceAddress(
        raw=raw,
        namespace=namespace,
        address=address,
        fragment=fragment,
        selector=selector,
    )
def load_namespaces(frontmatter: dict[str, Any]) -> dict[str, str]:
    """Load and validate the ``namespaces`` mapping from Markdown frontmatter.

    Keys are stringified, stripped, and have trailing colons removed, so
    ``std`` and ``std:`` name the same namespace. Values are stripped path
    strings.

    Args:
        frontmatter: Parsed frontmatter of a document.

    Returns:
        Namespace name -> path string. Empty when ``namespaces`` is missing
        or ``None``.

    Raises:
        ReferenceResolutionError: if ``namespaces`` is not a mapping, a key is
            empty or not a valid namespace name, or a value is not a
            non-empty string.
    """
    raw_namespaces = frontmatter.get("namespaces", {})
    if raw_namespaces is None:
        return {}
    if not isinstance(raw_namespaces, dict):
        raise ReferenceResolutionError("Frontmatter `namespaces` must be a mapping")

    namespaces: dict[str, str] = {}
    for raw_key, raw_value in raw_namespaces.items():
        key = str(raw_key).strip().rstrip(":")
        if not key:
            raise ReferenceResolutionError("Namespace keys cannot be empty")
        # The pattern's address group accepts anything, so a key such as
        # ``a:b`` would match with namespace ``a``. Require the matched
        # namespace to be the whole key; otherwise the mapping could hold
        # names that no parsed reference can ever carry.
        key_match = _NAMESPACE_RE.match(f"{key}:")
        if key_match is None or key_match.group("namespace") != key:
            raise ReferenceResolutionError(f"Invalid namespace key `{raw_key}`")
        if not isinstance(raw_value, str):
            raise ReferenceResolutionError(f"Namespace `{key}` must map to a string path")
        value = raw_value.strip()
        if not value:
            raise ReferenceResolutionError(f"Namespace `{key}` cannot map to an empty path")
        namespaces[key] = value
    return namespaces
def resolve_reference(
    reference: str | ReferenceAddress,
    *,
    context: ReferenceContext,
) -> ReferenceResolution:
    """Resolve a reference to the content units it addresses.

    Args:
        reference: Reference text, or an already parsed address.
        context: Root directory, current document path and namespaces.

    Returns:
        The resolution holding the parsed reference, the source and target
        paths, and at least one content unit.

    Raises:
        ReferenceResolutionError: if the target is not an existing file, the
            reference uses both a fragment and a selector, or nothing matches.
            Errors from parsing and path resolution propagate as well.
    """
    if isinstance(reference, str):
        address = parse_reference(reference)
    else:
        address = reference

    root = context.root.resolve()
    if context.current_path:
        source_path = context.current_path.resolve()
    else:
        # Without a current document the root stands in as the source.
        source_path = root

    target_path = _resolve_target_path(address, context, root, source_path)
    if not (target_path.exists() and target_path.is_file()):
        raise ReferenceResolutionError(f"Referenced file not found: {target_path}")

    markdown = target_path.read_text(encoding="utf-8")
    document = parse_markdown(markdown, source_path=str(target_path))

    selector, fragment = address.selector, address.fragment
    if selector and fragment:
        raise ReferenceResolutionError("Reference cannot use both fragment and selector")

    if selector:
        units = _units_from_selector(document, selector, target_path)
    elif fragment:
        units = _units_from_fragment(document, fragment, target_path, markdown)
    else:
        # A bare path addresses the whole document.
        units = [_document_unit(document, target_path, markdown)]

    if not units:
        raise ReferenceResolutionError(f"Reference `{address.raw}` did not match any content units")

    return ReferenceResolution(
        reference=address,
        source_path=str(source_path),
        target_path=str(target_path),
        units=units,
    )
def _resolve_target_path(
    address: ReferenceAddress,
    context: ReferenceContext,
    root: Path,
    source_path: Path,
) -> Path:
    """Return the resolved file path an address points at, confined to ``root``.

    Three cases are handled in order: a namespaced address, a plain path
    (absolute, or relative to the source file's directory), and a pathless
    address that targets the current document.

    Raises:
        ReferenceResolutionError: for an unknown namespace, a pathless address
            without a current document, or a path that resolves outside ``root``.
    """
    if address.namespace:
        known_namespaces = context.namespaces
        if address.namespace not in known_namespaces:
            raise ReferenceResolutionError(f"Unknown namespace `{address.namespace}`")
        namespace_target = _path_from_namespace(known_namespaces[address.namespace], root)
        # A namespace mapped to a directory is the base for the address;
        # otherwise the mapped path itself is the target and the address part
        # is not used.
        if namespace_target.is_dir():
            candidate = namespace_target / address.address
        else:
            candidate = namespace_target
    elif address.address:
        anchor = source_path.parent if source_path.is_file() else root
        requested = Path(address.address)
        if requested.is_absolute():
            candidate = requested
        else:
            candidate = anchor / requested
    elif context.current_path:
        candidate = context.current_path
    else:
        raise ReferenceResolutionError("Pathless references require a current document")

    resolved = candidate.resolve()
    try:
        # ``relative_to`` raises ValueError when ``resolved`` is not under ``root``.
        resolved.relative_to(root)
    except ValueError as exc:
        raise ReferenceResolutionError(f"Reference escapes root: {address.raw}") from exc
    return resolved
def _path_from_namespace(raw_path: str, root: Path) -> Path:
    """Return the resolved path for a namespace mapping value.

    Absolute values are used as given; relative values are joined onto ``root``.
    """
    mapped = Path(raw_path)
    anchored = mapped if mapped.is_absolute() else root / mapped
    return anchored.resolve()
def _units_from_selector(
    document: Document,
    selector: str,
    target_path: Path,
) -> list[ContentUnit]:
    """Run ``selector`` as a document query and wrap every match as a content unit.

    Raises:
        ReferenceResolutionError: if the query is rejected as invalid; the
            original ``InvalidQueryError`` is kept as the cause.
    """
    try:
        matches = query_document(document, selector)
    except InvalidQueryError as exc:
        raise ReferenceResolutionError(str(exc)) from exc

    units: list[ContentUnit] = []
    for match in matches:
        units.append(_unit_from_query_match(match, target_path))
    return units
def _units_from_fragment(
    document: Document,
    fragment: str,
    target_path: Path,
    markdown: str,
) -> list[ContentUnit]:
    """Resolve a ``#fragment`` to the content units it addresses.

    A fragment is either ``kind:value`` or a bare value, which is treated as
    ``id:value``. The fragment is split at its first colon, so an id that
    itself contains a colon has to be written with the explicit ``id:`` prefix.

    Supported kinds: ``document``, ``id``, ``section``, ``heading``, ``block``,
    ``region``, ``fence``, ``tag`` and ``line``.

    Returns:
        The matching units; an empty list when nothing matches.

    Raises:
        ReferenceResolutionError: for an unsupported kind. Errors raised while
            building the unit lists propagate too.
    """
    kind, _, value = fragment.partition(":")
    if not value:
        kind, value = "id", kind
    lookup = _slug(value)

    def is_match(unit: ContentUnit) -> bool:
        # A unit matches on its id or on the slug of its name.
        return unit.unit_id == lookup or _slug(unit.name or "") == lookup

    if kind == "document":
        return [_document_unit(document, target_path, markdown)]
    if kind == "id":
        # Sources are tried in priority order and built lazily, so the
        # lower-priority ones (including the fence parse) are never built once
        # an earlier source has produced a match.
        id_sources = (
            lambda: _section_units(document, target_path),
            lambda: _region_units(markdown, target_path),
            lambda: _fenced_block_units(markdown, target_path),
            lambda: _heading_units(document, target_path),
        )
        for build_units in id_sources:
            matches = [unit for unit in build_units() if is_match(unit)]
            if matches:
                return matches
        return []
    if kind == "section":
        return [unit for unit in _section_units(document, target_path) if is_match(unit)]
    if kind == "heading":
        return [unit for unit in _heading_units(document, target_path) if is_match(unit)]
    if kind == "block":
        return _block_fragment_units(document, target_path, value)
    if kind == "region":
        return [unit for unit in _region_units(markdown, target_path) if unit.unit_id == lookup]
    if kind == "fence":
        return [unit for unit in _fenced_block_units(markdown, target_path) if unit.unit_id == lookup]
    if kind == "tag":
        return [
            unit
            for unit in _region_units(markdown, target_path) + _fenced_block_units(markdown, target_path)
            if lookup in {_slug(tag) for tag in unit.metadata.get("tags", [])}
        ]
    if kind == "line":
        return _line_range_units(markdown, target_path, value)
    raise ReferenceResolutionError(f"Unsupported reference fragment kind `{kind}`")
def _document_unit(document: Document, target_path: Path, markdown: str) -> ContentUnit:
    """Wrap the whole Markdown file as a single ``document`` unit.

    The id comes from the frontmatter ``id`` and the name from the frontmatter
    ``title``; both fall back to the file stem. The span runs from line 1 to
    the number of lines in ``markdown``.
    """
    frontmatter = document.frontmatter
    stem = target_path.stem
    declared_id = frontmatter.get("id") or stem
    title = frontmatter.get("title") or stem
    whole_file = SourceSpan(1, len(markdown.splitlines()))
    return _content_unit(
        kind="document",
        unit_id=_slug(str(declared_id)),
        text=markdown,
        source_path=target_path,
        span=whole_file,
        name=str(title),
        metadata={"frontmatter": frontmatter},
    )
def _unit_from_query_match(match: QueryMatch, target_path: Path) -> ContentUnit:
    """Turn one query match into a content unit.

    The id is a slug of the query path with ``$.`` and the brackets removed.
    The name is the first line of the matched text without leading ``#`` and
    space characters, or the match kind when there is no text.
    """
    flattened_path = match.path.replace("$.", "").replace("[", "-").replace("]", "")
    if match.text:
        first_line = match.text.splitlines()[0]
        name = first_line.lstrip("# ").strip()
    else:
        name = match.kind
    # Matches without text are represented by the string form of their value.
    text = str(match.value) if match.text is None else match.text
    return _content_unit(
        kind=match.kind,
        unit_id=_slug(flattened_path),
        text=text,
        source_path=target_path,
        span=SourceSpan(match.line, None),
        name=name,
        metadata={"query_path": match.path, "value": match.value},
    )
def _section_units(document: Document, target_path: Path) -> list[ContentUnit]:
    """Return one ``section`` unit per document section, in document order.

    One id registry is shared across all sections so repeated ids are
    disambiguated.
    """
    seen_ids: dict[str, int] = {}
    units: list[ContentUnit] = []
    for section in document.sections:
        units.append(_section_unit(section, target_path, seen_ids))
    return units
def _section_unit(
    section: Section,
    target_path: Path,
    used_ids: dict[str, int],
) -> ContentUnit:
    """Build the ``section`` unit for one section.

    The text is the heading line followed by each non-empty block text,
    separated by blank lines. The span ends at the last block's end line, or
    at the heading line when the section has no blocks.
    """
    heading = section.heading
    title, explicit_id = _heading_title_and_id(heading)
    unit_id = _dedupe_id(_slug(explicit_id or title), used_ids)

    if section.blocks:
        last_line = section.blocks[-1].line_end
    else:
        last_line = heading.line

    parts = [f"{'#' * heading.level} {heading.text}"]
    for block in section.blocks:
        if not block.text:
            continue
        parts += ["", block.text]

    return _content_unit(
        kind="section",
        unit_id=unit_id,
        text="\n".join(parts).strip(),
        source_path=target_path,
        span=SourceSpan(heading.line, last_line),
        name=title,
        metadata={"heading_level": heading.level},
    )
def _heading_units(document: Document, target_path: Path) -> list[ContentUnit]:
    """Return one single-line ``heading`` unit per document heading.

    Ids come from the explicit heading id when present, otherwise from the
    title, and are disambiguated across the document.
    """
    seen_ids: dict[str, int] = {}

    def build(heading: Heading) -> ContentUnit:
        title, explicit_id = _heading_title_and_id(heading)
        return _content_unit(
            kind="heading",
            unit_id=_dedupe_id(_slug(explicit_id or title), seen_ids),
            text=f"{'#' * heading.level} {heading.text}",
            source_path=target_path,
            span=SourceSpan(heading.line, heading.line),
            name=title,
            metadata={"heading_level": heading.level},
        )

    return [build(heading) for heading in document.headings]
def _block_fragment_units(
    document: Document,
    target_path: Path,
    value: str,
) -> list[ContentUnit]:
    """Resolve a ``block:<value>`` fragment.

    A purely decimal value is a zero-based index into the document's blocks;
    any other value is matched against the block ids.

    Returns:
        A list with the addressed block, or an empty list when the index is
        out of range or no id matches.
    """
    blocks = _block_units(document.blocks, target_path)
    # ``isdecimal`` rather than ``isdigit``: the latter is also true for
    # characters such as superscript digits, which ``int`` rejects with a
    # ValueError.
    if value.isdecimal():
        index = int(value)
        return [blocks[index]] if 0 <= index < len(blocks) else []
    lookup = _slug(value)
    return [unit for unit in blocks if unit.unit_id == lookup]
def _block_units(blocks: list[ContentBlock], target_path: Path) -> list[ContentUnit]:
    """Return one unit per content block, in order.

    Each id is ``<type>-<line_start>``, using the block's position in the list
    when ``line_start`` is falsy, and is disambiguated on repeats.
    """
    seen_ids: dict[str, int] = {}

    def build(position: int, block: ContentBlock) -> ContentUnit:
        raw_id = f"{block.type}-{block.line_start or position}"
        return _content_unit(
            kind=block.type,
            unit_id=_dedupe_id(_slug(raw_id), seen_ids),
            text=block.text,
            source_path=target_path,
            span=SourceSpan(block.line_start, block.line_end),
            name=block.type,
            metadata={"block_index": position},
        )

    return [build(position, block) for position, block in enumerate(blocks)]
def _region_units(markdown: str, target_path: Path) -> list[ContentUnit]:
    """Collect the ``mkt:region`` comment-delimited regions of a Markdown text.

    The text is scanned line by line. A unit's text is the content strictly
    between the markers; its span covers both marker lines.

    Raises:
        ReferenceResolutionError: for a nested open marker, a close marker
            without an open one, an open marker without an ``id`` attribute,
            or a region still open at the end of the text.
    """
    lines = markdown.splitlines()
    units: list[ContentUnit] = []
    # (1-based line of the open marker, region id, tags) while inside a region.
    pending: tuple[int, str, list[str]] | None = None

    for line_no, line in enumerate(lines, start=1):
        opened = _REGION_OPEN_RE.search(line)
        closed = _REGION_CLOSE_RE.search(line)

        if opened is not None and pending is not None:
            raise ReferenceResolutionError("Nested mkt:region blocks are not supported")

        if closed is not None:
            if pending is None:
                raise ReferenceResolutionError("Region close marker has no matching open marker")
            opened_at, region_id, tags = pending
            # ``opened_at`` is 1-based, so as a 0-based index it is the first
            # line after the open marker; ``line_no - 1`` is the close marker.
            body = "\n".join(lines[opened_at:line_no - 1]).strip()
            units.append(
                _content_unit(
                    kind="region",
                    unit_id=_slug(region_id),
                    text=body,
                    source_path=target_path,
                    span=SourceSpan(opened_at, line_no),
                    name=region_id,
                    metadata={"tags": tags},
                )
            )
            pending = None
            continue

        if opened is not None:
            attrs = _parse_attrs(opened.group("attrs"))
            region_id = attrs.get("id")
            if not region_id:
                raise ReferenceResolutionError("Region marker requires an id attribute")
            pending = (line_no, region_id, _tags_from_attrs(attrs))

    if pending is not None:
        raise ReferenceResolutionError("Region open marker has no matching close marker")
    return units
def _fenced_block_units(markdown: str, target_path: Path) -> list[ContentUnit]:
    """Collect the fenced code blocks that declare an id in their info string.

    Fences without an id are skipped. Repeated ids are disambiguated. The
    ``block_index`` metadata entry is the fence token's position in the parser
    output.
    """
    reserved_keys = {"id", "language", "tag", "tags"}
    parser = MarkdownIt("commonmark", {"tables": True}).enable("table")
    seen_ids: dict[str, int] = {}
    units: list[ContentUnit] = []

    for position, token in enumerate(parser.parse(markdown)):
        if token.type != "fence":
            continue
        attrs = _parse_fence_info(token.info)
        declared_id = attrs.get("id")
        if not declared_id:
            continue

        # NOTE(review): ``token.map`` is presumably a 0-based
        # [first line, line after the block] pair - confirm against markdown-it.
        if token.map:
            first_line, last_line = token.map[0] + 1, token.map[1]
        else:
            first_line, last_line = None, None

        extra_attrs = {
            key: value for key, value in attrs.items() if key not in reserved_keys
        }
        units.append(
            _content_unit(
                kind="fenced_block",
                unit_id=_dedupe_id(_slug(declared_id), seen_ids),
                text=token.content,
                source_path=target_path,
                span=SourceSpan(first_line, last_line),
                name=declared_id,
                metadata={
                    "language": attrs.get("language"),
                    "tags": _tags_from_attrs(attrs),
                    "attrs": extra_attrs,
                    "block_index": position,
                },
            )
        )
    return units
def _line_range_units(markdown: str, target_path: Path, value: str) -> list[ContentUnit]:
    """Resolve a ``line:start`` or ``line:start-end`` fragment (1-based, inclusive).

    Returns:
        A single ``line_range`` unit, or an empty list when the range is
        inverted or falls outside the text.

    Raises:
        ReferenceResolutionError: if ``value`` is not ``start`` or ``start-end``.
    """
    parsed = re.match(r"^(?P<start>\d+)(?:-(?P<end>\d+))?$", value)
    if parsed is None:
        raise ReferenceResolutionError("Line fragments must use `line:start` or `line:start-end`")

    start = int(parsed.group("start"))
    end_text = parsed.group("end")
    # A missing end means a single-line range.
    end = int(end_text) if end_text else start

    lines = markdown.splitlines()
    if not (1 <= start <= end <= len(lines)):
        return []

    selected = lines[start - 1:end]
    unit = _content_unit(
        kind="line_range",
        unit_id=f"line-{start}-{end}",
        text="\n".join(selected),
        source_path=target_path,
        span=SourceSpan(start, end),
        name=f"lines {start}-{end}",
        metadata={},
    )
    return [unit]
def _parse_fence_info(info: str) -> dict[str, str]:
    """Parse a fence info string into an attribute dict.

    When the info string matches the fence pattern, the result holds the
    parsed ``{...}`` attributes plus ``language`` when a language is present.
    Otherwise the whole stripped string is returned as ``language``, or an
    empty dict for a blank string.
    """
    stripped = info.strip()
    parsed = _FENCE_ATTRS_RE.match(stripped)
    if parsed is None:
        return {"language": stripped} if stripped else {}

    attrs = _parse_attrs(parsed.group("attrs") or "")
    language = parsed.group("language")
    if language:
        attrs["language"] = language

    if "id" not in attrs:
        # Fallback: promote the first key that still starts with ``#`` to the id.
        hash_key = next((key for key in attrs if key.startswith("#")), None)
        if hash_key is not None:
            attrs["id"] = hash_key[1:]
            del attrs[hash_key]
    return attrs
def _parse_attrs(raw: str) -> dict[str, str]:
    """Parse a shell-style attribute string into a dict.

    Tokens are split with :func:`shlex.split`, so quoted values may contain
    spaces. Each token is read as one of:

    - ``#name``: stored under ``id``
    - ``key=value``: stored with both sides stripped
    - anything else: a flag stored with the value ``"true"``
    """
    parsed: dict[str, str] = {}
    for token in shlex.split(raw):
        if len(token) > 1 and token[0] == "#":
            parsed["id"] = token[1:]
        elif "=" in token:
            name, _, text = token.partition("=")
            parsed[name.strip()] = text.strip()
        else:
            parsed[token] = "true"
    return parsed
def _tags_from_attrs(attrs: dict[str, str]) -> list[str]:
    """Return the tags listed under ``tags`` (preferred) or ``tag``.

    The value is split on runs of commas and spaces; empty pieces are dropped.
    """
    listed = attrs.get("tags") or attrs.get("tag") or ""
    pieces = (piece.strip() for piece in re.split(r"[, ]+", listed))
    return [piece for piece in pieces if piece]
def _content_unit(
    *,
    kind: str,
    unit_id: str,
    text: str,
    source_path: Path,
    span: SourceSpan | None,
    name: str | None,
    metadata: dict[str, Any] | None = None,
) -> ContentUnit:
    """Build a ``ContentUnit``, filling in the fields derived from the inputs.

    The source path is stored as a string, the content hash is the SHA-256 of
    the UTF-8 encoded text prefixed with ``sha256:``, and missing or empty
    metadata becomes a fresh empty dict.
    """
    digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
    return ContentUnit(
        kind=kind,
        unit_id=unit_id,
        text=text,
        source_path=str(source_path),
        span=span,
        name=name,
        content_hash=f"sha256:{digest}",
        metadata=metadata if metadata else {},
    )
def _heading_title_and_id(heading: Heading) -> tuple[str, str | None]:
    """Split heading text into its title and an optional trailing ``{#id}``.

    Returns:
        ``(title, id)``, where ``id`` is ``None`` when the heading carries no
        id marker or its text does not match the heading pattern.
    """
    text = heading.text.strip()
    parsed = _HEADING_ID_RE.match(text)
    if parsed is None:
        return text, None
    return parsed.group("title").strip(), parsed.group("id")
def _dedupe_id(unit_id: str, used_ids: dict[str, int]) -> str:
    """Return an id that is unique within ``used_ids`` and record it there.

    The first occurrence of ``unit_id`` is returned unchanged; later
    occurrences get a ``-<n>`` suffix (``x``, ``x-2``, ``x-3``, ...). Suffixed
    ids are recorded as well, so a generated id can never equal an id issued
    earlier or later for a different base.

    Args:
        unit_id: The desired id.
        used_ids: Registry shared by all ids of one collection; updated in place.

    Returns:
        ``unit_id`` itself, or a suffixed variant not issued before.
    """
    count = used_ids.get(unit_id, 0) + 1
    candidate = unit_id if count == 1 else f"{unit_id}-{count}"
    # Skip suffixes that are already taken, e.g. by a unit whose own id is
    # literally ``x-2``.
    while candidate in used_ids:
        count += 1
        candidate = f"{unit_id}-{count}"
    used_ids[unit_id] = count
    # Record the issued id so that a later unit with this exact id is
    # disambiguated instead of colliding with it.
    used_ids.setdefault(candidate, 1)
    return candidate
def _slug(value: str) -> str:
    """Normalise text into a lowercase id.

    Runs of characters other than ``a-z``, ``0-9``, ``_``, ``.``, ``:`` and
    ``-`` become a single hyphen, and leading and trailing hyphens are removed.
    If nothing is left the result is ``"unit"``.
    """
    lowered = value.strip().lower()
    hyphenated = re.sub(r"[^a-z0-9_.:-]+", "-", lowered)
    collapsed = re.sub(r"-+", "-", hyphenated).strip("-")
    if collapsed:
        return collapsed
    return "unit"