extension for ref resolve, explode, implode, weave, tangle

2026-05-04 02:25:49 +02:00
parent 8203f50fd5
commit 65bfc1aebf
39 changed files with 3959 additions and 25 deletions
--- a/src/markitect_tool/reference/__init__.py
+++ b/src/markitect_tool/reference/__init__.py
@@ -0,0 +1,25 @@
+"""Namespaced content reference resolution for Markdown artifacts."""
+
+from markitect_tool.reference.engine import (
+    ContentUnit,
+    ReferenceAddress,
+    ReferenceContext,
+    ReferenceResolution,
+    ReferenceResolutionError,
+    SourceSpan,
+    load_namespaces,
+    parse_reference,
+    resolve_reference,
+)
+
+# Public names of this package; each one is imported above from
+# ``markitect_tool.reference.engine``.
+__all__ = [
+    "ContentUnit",
+    "ReferenceAddress",
+    "ReferenceContext",
+    "ReferenceResolution",
+    "ReferenceResolutionError",
+    "SourceSpan",
+    "load_namespaces",
+    "parse_reference",
+    "resolve_reference",
+]
--- a/src/markitect_tool/reference/engine.py
+++ b/src/markitect_tool/reference/engine.py
@@ -0,0 +1,626 @@
+"""Reference parsing and resolution for Markdown content units."""
+
+from __future__ import annotations
+
+import hashlib
+import re
+import shlex
+from dataclasses import asdict, dataclass, field
+from pathlib import Path
+from typing import Any
+
+from markdown_it import MarkdownIt
+
+from markitect_tool.core import ContentBlock, Document, Heading, Section, parse_markdown
+from markitect_tool.query import InvalidQueryError, QueryMatch, query_document
+
+
+class ReferenceResolutionError(ValueError):
+    """Raised when a content reference cannot be parsed or resolved.
+
+    Subclasses ``ValueError``, so callers may catch either type.
+    """
+
+
+@dataclass(frozen=True)
+class ReferenceAddress:
+    """Immutable components of a parsed content reference.
+
+    The reference syntax is compact so it can live inside Markdown:
+
+    - ``path/to/file.md``
+    - ``std:clauses/payment.md``
+    - ``std:clauses/payment.md#section:terms``
+    - ``std:clauses/payment.md::sections[heading=Terms]``
+    - ``#intro`` for a fragment in the current document
+    """
+
+    raw: str
+    namespace: str | None = None
+    address: str = ""
+    fragment: str | None = None
+    selector: str | None = None
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return the populated fields as a plain dictionary.
+
+        Fields holding ``None`` or an empty string are omitted.
+        """
+
+        populated: dict[str, Any] = {}
+        for field_name, field_value in asdict(self).items():
+            if field_value is None or field_value == "":
+                continue
+            populated[field_name] = field_value
+        return populated
+
+
+@dataclass(frozen=True)
+class ReferenceContext:
+    """Root directory, current document path, and namespace map used for resolution."""
+
+    root: Path = Path(".")
+    current_path: Path | None = None
+    namespaces: dict[str, str] = field(default_factory=dict)
+
+    @classmethod
+    def from_document(
+        cls,
+        document: Document,
+        *,
+        root: str | Path = ".",
+        current_path: str | Path | None = None,
+    ) -> "ReferenceContext":
+        """Create a context whose namespaces come from ``document.frontmatter``.
+
+        An explicit ``current_path`` takes precedence over
+        ``document.source_path``; when neither is set the context has no
+        current document.
+        """
+
+        effective_path = current_path if current_path else document.source_path
+        root_path = Path(root)
+        document_path = Path(effective_path) if effective_path else None
+        return cls(
+            root=root_path,
+            current_path=document_path,
+            namespaces=load_namespaces(document.frontmatter),
+        )
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return a dictionary form of the context, omitting unset entries."""
+
+        payload: dict[str, Any] = {"root": str(self.root)}
+        if self.current_path:
+            payload["current_path"] = str(self.current_path)
+        if self.namespaces is not None:
+            payload["namespaces"] = self.namespaces
+        return payload
+
+
+@dataclass(frozen=True)
+class SourceSpan:
+    """First and last line of a resolved unit within its source file."""
+
+    line_start: int | None = None
+    line_end: int | None = None
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return the line bounds that are set, skipping ``None`` values."""
+
+        bounds: dict[str, Any] = {}
+        for label, line_number in asdict(self).items():
+            if line_number is not None:
+                bounds[label] = line_number
+        return bounds
+
+
+@dataclass(frozen=True)
+class ContentUnit:
+    """A single addressable piece of content resolved from a Markdown file."""
+
+    kind: str
+    unit_id: str
+    text: str
+    source_path: str
+    span: SourceSpan | None = None
+    name: str | None = None
+    content_hash: str = ""
+    metadata: dict[str, Any] = field(default_factory=dict)
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return a dictionary form of the unit.
+
+        Entries whose value is ``None`` are dropped; empty metadata is
+        treated as absent.
+        """
+
+        span_payload = self.span.to_dict() if self.span else None
+        metadata_payload = self.metadata or None
+        ordered_entries = (
+            ("kind", self.kind),
+            ("unit_id", self.unit_id),
+            ("name", self.name),
+            ("source_path", self.source_path),
+            ("span", span_payload),
+            ("content_hash", self.content_hash),
+            ("metadata", metadata_payload),
+            ("text", self.text),
+        )
+        return {label: entry for label, entry in ordered_entries if entry is not None}
+
+
+@dataclass(frozen=True)
+class ReferenceResolution:
+    """Outcome of resolving one reference: the referring file, the target file, and the units found."""
+
+    reference: ReferenceAddress
+    source_path: str
+    target_path: str
+    units: list[ContentUnit]
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return a dictionary form including a ``count`` of the resolved units."""
+
+        payload: dict[str, Any] = {"reference": self.reference.to_dict()}
+        payload["source_path"] = self.source_path
+        payload["target_path"] = self.target_path
+        payload["count"] = len(self.units)
+        payload["units"] = [unit.to_dict() for unit in self.units]
+        return payload
+
+
+# ``name:rest`` — a namespace prefix (a letter, then letters, digits, ``_``,
+# ``.`` or ``-``) followed by a colon; everything after it is the address.
+_NAMESPACE_RE = re.compile(r"^(?P<namespace>[A-Za-z][A-Za-z0-9_.-]*):(?P<address>.*)$")
+# Heading text with an optional trailing ``{#explicit-id}`` marker.
+_HEADING_ID_RE = re.compile(r"^(?P<title>.*?)(?:\s+\{#(?P<id>[A-Za-z0-9_.:-]+)\})?$")
+# Opening region marker: ``<!-- mkt:region ATTRS -->``.
+_REGION_OPEN_RE = re.compile(r"<!--\s*mkt:region\s+(?P<attrs>.*?)\s*-->")
+# Closing region marker: ``<!-- /mkt:region -->``.
+_REGION_CLOSE_RE = re.compile(r"<!--\s*/mkt:region\s*-->")
+# Fence info string: optional language token, then an optional ``{attrs}`` group
+# separated from it by whitespace.
+_FENCE_ATTRS_RE = re.compile(r"^(?P<language>[^\s{]+)?(?:\s+\{(?P<attrs>.*)\})?\s*$")
+
+
+def parse_reference(reference: str) -> ReferenceAddress:
+    """Split a compact Markitect reference into its parts.
+
+    The selector (after the first ``::``) is removed first, then the
+    fragment (after the first ``#``), and finally a leading ``name:``
+    namespace prefix. Each remaining piece is whitespace-stripped.
+
+    Raises:
+        ReferenceResolutionError: If the reference is blank, or a ``::`` or
+            ``#`` separator is present but nothing follows it.
+    """
+
+    raw = reference.strip()
+    if raw == "":
+        raise ReferenceResolutionError("Reference cannot be empty")
+
+    base, selector_sep, selector_text = raw.partition("::")
+    selector: str | None = None
+    if selector_sep:
+        selector = selector_text.strip()
+        if not selector:
+            raise ReferenceResolutionError(f"Reference selector is empty in `{reference}`")
+
+    base, fragment_sep, fragment_text = base.partition("#")
+    fragment: str | None = None
+    if fragment_sep:
+        fragment = fragment_text.strip()
+        if not fragment:
+            raise ReferenceResolutionError(f"Reference fragment is empty in `{reference}`")
+
+    address = base.strip()
+    namespace: str | None = None
+    prefix_match = _NAMESPACE_RE.match(address)
+    if prefix_match:
+        prefix = prefix_match.group("namespace")
+        # A prefix containing a path separator is not treated as a namespace.
+        if not ("/" in prefix or "\\" in prefix):
+            namespace = prefix
+            address = prefix_match.group("address").strip()
+
+    return ReferenceAddress(
+        raw=raw,
+        namespace=namespace,
+        address=address,
+        fragment=fragment,
+        selector=selector,
+    )
+
+
+def load_namespaces(frontmatter: dict[str, Any]) -> dict[str, str]:
+    """Load namespace mappings from Markdown frontmatter.
+
+    Reads the ``namespaces`` entry, which must be a mapping of namespace
+    name to path string. Keys are stripped of surrounding whitespace and
+    trailing colons; values are stripped of surrounding whitespace.
+
+    Returns:
+        A new ``{namespace: path}`` dictionary; empty when the entry is
+        missing or ``None``.
+
+    Raises:
+        ReferenceResolutionError: If ``namespaces`` is not a mapping, a key
+            is empty or not a valid namespace name, or a value is not a
+            non-empty string.
+    """
+
+    raw_namespaces = frontmatter.get("namespaces", {})
+    if raw_namespaces is None:
+        return {}
+    if not isinstance(raw_namespaces, dict):
+        raise ReferenceResolutionError("Frontmatter `namespaces` must be a mapping")
+
+    namespaces: dict[str, str] = {}
+    for raw_key, raw_value in raw_namespaces.items():
+        key = str(raw_key).strip().rstrip(":")
+        if not key:
+            raise ReferenceResolutionError("Namespace keys cannot be empty")
+        key_match = _NAMESPACE_RE.match(f"{key}:")
+        # The entire key must be the namespace name. Without the address
+        # check a key such as ``a:b`` would match with namespace ``a`` and be
+        # accepted, even though no reference could ever select it.
+        if not key_match or key_match.group("address"):
+            raise ReferenceResolutionError(f"Invalid namespace key `{raw_key}`")
+        if not isinstance(raw_value, str):
+            raise ReferenceResolutionError(f"Namespace `{key}` must map to a string path")
+        value = raw_value.strip()
+        if not value:
+            raise ReferenceResolutionError(f"Namespace `{key}` cannot map to an empty path")
+        namespaces[key] = value
+    return namespaces
+
+
+def resolve_reference(
+    reference: str | ReferenceAddress,
+    *,
+    context: ReferenceContext,
+) -> ReferenceResolution:
+    """Resolve a reference to the content units it designates.
+
+    A string reference is parsed first. The target file is located relative
+    to ``context``, read as UTF-8, parsed as Markdown, and then narrowed by
+    the selector or fragment; with neither, the whole document is the unit.
+
+    Raises:
+        ReferenceResolutionError: If the target file does not exist, both a
+            fragment and a selector are given, or nothing matches.
+    """
+
+    if isinstance(reference, str):
+        address = parse_reference(reference)
+    else:
+        address = reference
+
+    root = context.root.resolve()
+    if context.current_path:
+        source_path = context.current_path.resolve()
+    else:
+        # Without a current document the root stands in as the source.
+        source_path = root
+
+    target_path = _resolve_target_path(address, context, root, source_path)
+    if not (target_path.exists() and target_path.is_file()):
+        raise ReferenceResolutionError(f"Referenced file not found: {target_path}")
+
+    markdown = target_path.read_text(encoding="utf-8")
+    document = parse_markdown(markdown, source_path=str(target_path))
+
+    wants_selector = bool(address.selector)
+    wants_fragment = bool(address.fragment)
+    if wants_selector and wants_fragment:
+        raise ReferenceResolutionError("Reference cannot use both fragment and selector")
+
+    if wants_selector:
+        units = _units_from_selector(document, address.selector, target_path)
+    elif wants_fragment:
+        units = _units_from_fragment(document, address.fragment, target_path, markdown)
+    else:
+        units = [_document_unit(document, target_path, markdown)]
+
+    if not units:
+        raise ReferenceResolutionError(f"Reference `{address.raw}` did not match any content units")
+
+    return ReferenceResolution(
+        reference=address,
+        source_path=str(source_path),
+        target_path=str(target_path),
+        units=units,
+    )
+
+
+def _resolve_target_path(
+    address: ReferenceAddress,
+    context: ReferenceContext,
+    root: Path,
+    source_path: Path,
+) -> Path:
+    """Return the absolute file path an address points to, confined to ``root``.
+
+    Namespaced addresses are joined onto the namespace's directory (a
+    namespace target that is not a directory is used as-is). Plain addresses
+    are taken relative to the source file's directory, or to ``root`` when
+    the source is not a file. An address with no path refers to the current
+    document.
+
+    Raises:
+        ReferenceResolutionError: For an unknown namespace, a pathless
+            reference without a current document, or a path outside ``root``.
+    """
+
+    if address.namespace:
+        if address.namespace not in context.namespaces:
+            raise ReferenceResolutionError(f"Unknown namespace `{address.namespace}`")
+        namespace_target = _path_from_namespace(context.namespaces[address.namespace], root)
+        if namespace_target.is_dir():
+            candidate = namespace_target / address.address
+        else:
+            candidate = namespace_target
+    elif address.address:
+        if source_path.is_file():
+            base_dir = source_path.parent
+        else:
+            base_dir = root
+        requested = Path(address.address)
+        candidate = requested if requested.is_absolute() else base_dir / requested
+    elif context.current_path:
+        candidate = context.current_path
+    else:
+        raise ReferenceResolutionError("Pathless references require a current document")
+
+    absolute_target = candidate.resolve()
+    try:
+        # relative_to raises ValueError when the target is not under root.
+        absolute_target.relative_to(root)
+    except ValueError as exc:
+        raise ReferenceResolutionError(f"Reference escapes root: {address.raw}") from exc
+    return absolute_target
+
+
+def _path_from_namespace(raw_path: str, root: Path) -> Path:
+    """Return the resolved path of a namespace mapping; relative paths are anchored at ``root``."""
+
+    namespace_path = Path(raw_path)
+    anchored = namespace_path if namespace_path.is_absolute() else root / namespace_path
+    return anchored.resolve()
+
+
+def _units_from_selector(
+    document: Document,
+    selector: str,
+    target_path: Path,
+) -> list[ContentUnit]:
+    """Run a query selector against the document and wrap each match as a unit.
+
+    Raises:
+        ReferenceResolutionError: If the query layer reports an invalid selector.
+    """
+
+    try:
+        matches = query_document(document, selector)
+    except InvalidQueryError as exc:
+        raise ReferenceResolutionError(str(exc)) from exc
+
+    units: list[ContentUnit] = []
+    for match in matches:
+        units.append(_unit_from_query_match(match, target_path))
+    return units
+
+
+def _units_from_fragment(
+    document: Document,
+    fragment: str,
+    target_path: Path,
+    markdown: str,
+) -> list[ContentUnit]:
+    """Return the units selected by a ``kind:value`` fragment.
+
+    A fragment without a value after the colon (or without a colon at all)
+    is treated as an ``id`` lookup. An ``id`` lookup tries sections, regions,
+    fenced blocks, and headings in that order and returns the first group
+    with a match.
+
+    Raises:
+        ReferenceResolutionError: If the fragment kind is not supported.
+    """
+
+    kind, _, value = fragment.partition(":")
+    if not value:
+        kind, value = "id", kind
+    lookup = _slug(value)
+
+    def by_id_or_name(candidates: list[ContentUnit]) -> list[ContentUnit]:
+        return [
+            unit
+            for unit in candidates
+            if unit.unit_id == lookup or _slug(unit.name or "") == lookup
+        ]
+
+    if kind == "document":
+        return [_document_unit(document, target_path, markdown)]
+    if kind == "id":
+        # All four groups are built up front, so a malformed region or fence
+        # surfaces its error even when an earlier group would have matched.
+        candidate_groups = [
+            _section_units(document, target_path),
+            _region_units(markdown, target_path),
+            _fenced_block_units(markdown, target_path),
+            _heading_units(document, target_path),
+        ]
+        for group in candidate_groups:
+            found = by_id_or_name(group)
+            if found:
+                return found
+        return []
+    if kind == "section":
+        return by_id_or_name(_section_units(document, target_path))
+    if kind == "heading":
+        return by_id_or_name(_heading_units(document, target_path))
+    if kind == "block":
+        return _block_fragment_units(document, target_path, value)
+    if kind == "region":
+        regions = _region_units(markdown, target_path)
+        return [unit for unit in regions if unit.unit_id == lookup]
+    if kind == "fence":
+        fences = _fenced_block_units(markdown, target_path)
+        return [unit for unit in fences if unit.unit_id == lookup]
+    if kind == "tag":
+        taggable = _region_units(markdown, target_path) + _fenced_block_units(markdown, target_path)
+        return [
+            unit
+            for unit in taggable
+            if lookup in {_slug(tag) for tag in unit.metadata.get("tags", [])}
+        ]
+    if kind == "line":
+        return _line_range_units(markdown, target_path, value)
+    raise ReferenceResolutionError(f"Unsupported reference fragment kind `{kind}`")
+
+
+def _document_unit(document: Document, target_path: Path, markdown: str) -> ContentUnit:
+    """Wrap the whole file as one ``document`` unit.
+
+    The id comes from frontmatter ``id`` and the name from frontmatter
+    ``title``; both fall back to the file stem.
+    """
+
+    frontmatter = document.frontmatter
+    fallback = target_path.stem
+    unit_id = _slug(str(frontmatter.get("id") or fallback))
+    whole_file = SourceSpan(1, len(markdown.splitlines()))
+    display_name = str(frontmatter.get("title") or fallback)
+    return _content_unit(
+        kind="document",
+        unit_id=unit_id,
+        text=markdown,
+        source_path=target_path,
+        span=whole_file,
+        name=display_name,
+        metadata={"frontmatter": frontmatter},
+    )
+
+
+def _unit_from_query_match(match: QueryMatch, target_path: Path) -> ContentUnit:
+    """Convert one query match into a content unit.
+
+    The unit id is derived from the match's query path. The name is the
+    first line of the matched text with leading ``#`` and space characters
+    removed, or the match kind when there is no text.
+    """
+
+    flattened_path = match.path.replace("$.", "").replace("[", "-").replace("]", "")
+    unit_id = _slug(flattened_path)
+    if match.text:
+        name = match.text.splitlines()[0].lstrip("# ").strip()
+    else:
+        name = match.kind
+    body = str(match.value) if match.text is None else match.text
+    return _content_unit(
+        kind=match.kind,
+        unit_id=unit_id,
+        text=body,
+        source_path=target_path,
+        span=SourceSpan(match.line, None),
+        name=name,
+        metadata={"query_path": match.path, "value": match.value},
+    )
+
+
+def _section_units(document: Document, target_path: Path) -> list[ContentUnit]:
+    """Build one unit per document section, de-duplicating ids in document order."""
+
+    seen_ids: dict[str, int] = {}
+    units: list[ContentUnit] = []
+    for section in document.sections:
+        units.append(_section_unit(section, target_path, seen_ids))
+    return units
+
+
+def _section_unit(
+    section: Section,
+    target_path: Path,
+    used_ids: dict[str, int],
+) -> ContentUnit:
+    """Build the unit for one section.
+
+    The text is the heading line followed by each non-empty block, separated
+    by blank lines. The span runs from the heading to the last block, or
+    covers only the heading when the section has no blocks.
+    """
+
+    title, explicit_id = _heading_title_and_id(section.heading)
+    unit_id = _dedupe_id(_slug(explicit_id or title), used_ids)
+
+    if section.blocks:
+        last_line = section.blocks[-1].line_end
+    else:
+        last_line = section.heading.line
+
+    parts = [f"{'#' * section.heading.level} {section.heading.text}"]
+    parts += [piece for block in section.blocks if block.text for piece in ("", block.text)]
+    body = "\n".join(parts).strip()
+
+    return _content_unit(
+        kind="section",
+        unit_id=unit_id,
+        text=body,
+        source_path=target_path,
+        span=SourceSpan(section.heading.line, last_line),
+        name=title,
+        metadata={"heading_level": section.heading.level},
+    )
+
+
+def _heading_units(document: Document, target_path: Path) -> list[ContentUnit]:
+    """Build one single-line unit per heading, de-duplicating ids in document order."""
+
+    seen_ids: dict[str, int] = {}
+
+    def build(heading: Heading) -> ContentUnit:
+        title, explicit_id = _heading_title_and_id(heading)
+        unit_id = _dedupe_id(_slug(explicit_id or title), seen_ids)
+        return _content_unit(
+            kind="heading",
+            unit_id=unit_id,
+            text=f"{'#' * heading.level} {heading.text}",
+            source_path=target_path,
+            span=SourceSpan(heading.line, heading.line),
+            name=title,
+            metadata={"heading_level": heading.level},
+        )
+
+    return [build(heading) for heading in document.headings]
+
+
+def _block_fragment_units(
+    document: Document,
+    target_path: Path,
+    value: str,
+) -> list[ContentUnit]:
+    """Select block units by zero-based index or by unit id.
+
+    A value made only of decimal digits is treated as an index into the
+    document's blocks; an out-of-range index yields no units. Any other
+    value is slugged and compared against block unit ids.
+    """
+
+    blocks = _block_units(document.blocks, target_path)
+    # ``isdecimal`` rather than ``isdigit``: the latter also accepts
+    # characters such as superscript digits, which ``int`` cannot parse and
+    # which would surface as a bare ``ValueError``.
+    if value.isdecimal():
+        index = int(value)
+        return [blocks[index]] if 0 <= index < len(blocks) else []
+    lookup = _slug(value)
+    return [unit for unit in blocks if unit.unit_id == lookup]
+
+
+def _block_units(blocks: list[ContentBlock], target_path: Path) -> list[ContentUnit]:
+    """Build one unit per content block.
+
+    Ids have the form ``<type>-<line_start>``, using the block's position in
+    the list when ``line_start`` is missing or zero, and are de-duplicated.
+    """
+
+    seen_ids: dict[str, int] = {}
+
+    def build(index: int, block: ContentBlock) -> ContentUnit:
+        base_id = f"{block.type}-{block.line_start or index}"
+        return _content_unit(
+            kind=block.type,
+            unit_id=_dedupe_id(_slug(base_id), seen_ids),
+            text=block.text,
+            source_path=target_path,
+            span=SourceSpan(block.line_start, block.line_end),
+            name=block.type,
+            metadata={"block_index": index},
+        )
+
+    return [build(index, block) for index, block in enumerate(blocks)]
+
+
+def _region_units(markdown: str, target_path: Path) -> list[ContentUnit]:
+    """Collect the regions delimited by ``mkt:region`` comment markers.
+
+    The text of a region is the lines strictly between its open and close
+    marker lines; its span includes both marker lines.
+
+    Raises:
+        ReferenceResolutionError: If regions are nested, a close marker has
+            no open marker, an open marker lacks an ``id``, or a region is
+            still open at the end of the text.
+    """
+
+    source_lines = markdown.splitlines()
+    regions: list[ContentUnit] = []
+    # (line number of the open marker, region id, tags) while inside a region.
+    pending: tuple[int, str, list[str]] | None = None
+
+    for line_number, line_text in enumerate(source_lines, start=1):
+        opener = _REGION_OPEN_RE.search(line_text)
+        closer = _REGION_CLOSE_RE.search(line_text)
+
+        if opener and pending is not None:
+            raise ReferenceResolutionError("Nested mkt:region blocks are not supported")
+
+        if closer:
+            if pending is None:
+                raise ReferenceResolutionError("Region close marker has no matching open marker")
+            opened_at, region_id, tags = pending
+            # ``opened_at`` is 1-based, so as a 0-based slice start it is the
+            # line after the open marker; the slice stops before the close line.
+            inner_lines = source_lines[opened_at:line_number - 1]
+            regions.append(
+                _content_unit(
+                    kind="region",
+                    unit_id=_slug(region_id),
+                    text="\n".join(inner_lines).strip(),
+                    source_path=target_path,
+                    span=SourceSpan(opened_at, line_number),
+                    name=region_id,
+                    metadata={"tags": tags},
+                )
+            )
+            pending = None
+            continue
+
+        if not opener:
+            continue
+
+        attrs = _parse_attrs(opener.group("attrs"))
+        region_id = attrs.get("id")
+        if not region_id:
+            raise ReferenceResolutionError("Region marker requires an id attribute")
+        pending = (line_number, region_id, _tags_from_attrs(attrs))
+
+    if pending is not None:
+        raise ReferenceResolutionError("Region open marker has no matching close marker")
+    return regions
+
+
+def _fenced_block_units(markdown: str, target_path: Path) -> list[ContentUnit]:
+    """Collect fenced code blocks whose info string declares an id.
+
+    Fences without an id are skipped. Metadata carries the language, tags,
+    any remaining attributes, and the fence's position in the token stream.
+    """
+
+    parser = MarkdownIt("commonmark", {"tables": True}).enable("table")
+    seen_ids: dict[str, int] = {}
+    fences: list[ContentUnit] = []
+    reserved_keys = {"id", "language", "tag", "tags"}
+
+    for index, token in enumerate(parser.parse(markdown)):
+        if token.type == "fence":
+            attrs = _parse_fence_info(token.info)
+            declared_id = attrs.get("id")
+            if declared_id:
+                if token.map:
+                    # The span is 1-based: first map entry + 1, second as-is.
+                    first_line, last_line = token.map[0] + 1, token.map[1]
+                else:
+                    first_line, last_line = None, None
+                fences.append(
+                    _content_unit(
+                        kind="fenced_block",
+                        unit_id=_dedupe_id(_slug(declared_id), seen_ids),
+                        text=token.content,
+                        source_path=target_path,
+                        span=SourceSpan(first_line, last_line),
+                        name=declared_id,
+                        metadata={
+                            "language": attrs.get("language"),
+                            "tags": _tags_from_attrs(attrs),
+                            "attrs": {
+                                key: value
+                                for key, value in attrs.items()
+                                if key not in reserved_keys
+                            },
+                            "block_index": index,
+                        },
+                    )
+                )
+    return fences
+
+
+def _line_range_units(markdown: str, target_path: Path, value: str) -> list[ContentUnit]:
+    """Return the unit for a 1-based inclusive line range.
+
+    ``value`` is ``start`` or ``start-end``. A range that starts before line
+    1, ends before it starts, or runs past the last line yields no units.
+
+    Raises:
+        ReferenceResolutionError: If ``value`` is not in one of the two forms.
+    """
+
+    parsed = re.match(r"^(?P<start>\d+)(?:-(?P<end>\d+))?$", value)
+    if parsed is None:
+        raise ReferenceResolutionError("Line fragments must use `line:start` or `line:start-end`")
+
+    start = int(parsed.group("start"))
+    end_text = parsed.group("end")
+    end = int(end_text) if end_text else start
+
+    lines = markdown.splitlines()
+    if not (1 <= start <= end <= len(lines)):
+        return []
+
+    selected = "\n".join(lines[start - 1:end])
+    unit = _content_unit(
+        kind="line_range",
+        unit_id=f"line-{start}-{end}",
+        text=selected,
+        source_path=target_path,
+        span=SourceSpan(start, end),
+        name=f"lines {start}-{end}",
+        metadata={},
+    )
+    return [unit]
+
+
+def _parse_fence_info(info: str) -> dict[str, str]:
+    """Parse a fence info string into an attribute dictionary.
+
+    The expected shape is an optional language token followed by an optional
+    ``{...}`` attribute group. When the string does not have that shape, the
+    whole stripped string is reported as the language.
+    """
+
+    info_text = info.strip()
+    parsed = _FENCE_ATTRS_RE.match(info_text)
+    if parsed is None:
+        return {"language": info_text} if info_text else {}
+
+    attrs = _parse_attrs(parsed.group("attrs") or "")
+    language = parsed.group("language")
+    if language:
+        attrs["language"] = language
+
+    if attrs and "id" not in attrs:
+        # Promote the first ``#``-prefixed key, if any, to the id.
+        hash_key = next((key for key in attrs if key.startswith("#")), None)
+        if hash_key is not None:
+            attrs["id"] = hash_key[1:]
+            del attrs[hash_key]
+    return attrs
+
+
+def _parse_attrs(raw: str) -> dict[str, str]:
+    """Parse a shell-like attribute list into a dictionary.
+
+    Tokens are split with ``shlex`` so quoted values may contain spaces.
+    ``#name`` sets ``id``; ``key=value`` sets ``key``; a bare word becomes a
+    flag with the value ``"true"``.
+
+    Raises:
+        ReferenceResolutionError: If the list cannot be tokenized, for
+            example because a quote is left open.
+    """
+
+    try:
+        parts = shlex.split(raw)
+    except ValueError as exc:
+        # shlex reports unbalanced quotes with a bare ValueError; re-raise it
+        # as this module's error type so callers see one failure type.
+        raise ReferenceResolutionError(f"Malformed attribute list `{raw}`: {exc}") from exc
+
+    attrs: dict[str, str] = {}
+    for part in parts:
+        if part.startswith("#") and len(part) > 1:
+            attrs["id"] = part[1:]
+            continue
+        if "=" not in part:
+            attrs[part] = "true"
+            continue
+        key, value = part.split("=", 1)
+        attrs[key.strip()] = value.strip()
+    return attrs
+
+
+def _tags_from_attrs(attrs: dict[str, str]) -> list[str]:
+    """Return the tags listed under ``tags`` (preferred) or ``tag``, split on commas and spaces."""
+
+    raw = attrs.get("tags") or attrs.get("tag") or ""
+    tags: list[str] = []
+    for piece in re.split(r"[, ]+", raw):
+        cleaned = piece.strip()
+        if cleaned:
+            tags.append(cleaned)
+    return tags
+
+
+def _content_unit(
+    *,
+    kind: str,
+    unit_id: str,
+    text: str,
+    source_path: Path,
+    span: SourceSpan | None,
+    name: str | None,
+    metadata: dict[str, Any] | None = None,
+) -> ContentUnit:
+    """Build a ``ContentUnit``, stamping it with the SHA-256 of its UTF-8 text.
+
+    The source path is stored as a string and missing metadata becomes an
+    empty dictionary.
+    """
+
+    digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
+    unit_metadata = metadata if metadata else {}
+    return ContentUnit(
+        kind=kind,
+        unit_id=unit_id,
+        text=text,
+        source_path=str(source_path),
+        span=span,
+        name=name,
+        content_hash="sha256:" + digest,
+        metadata=unit_metadata,
+    )
+
+
+def _heading_title_and_id(heading: Heading) -> tuple[str, str | None]:
+    """Split heading text into its title and an optional trailing ``{#id}``.
+
+    The id is ``None`` when the heading carries no explicit id marker.
+    """
+
+    stripped = heading.text.strip()
+    parsed = _HEADING_ID_RE.match(stripped)
+    if parsed is None:
+        return heading.text.strip(), None
+    title = parsed.group("title").strip()
+    return title, parsed.group("id")
+
+
+def _dedupe_id(unit_id: str, used_ids: dict[str, int]) -> str:
+    """Return an id that is unique among those already issued via ``used_ids``.
+
+    The first use of ``unit_id`` is returned unchanged; later uses get a
+    numeric suffix (``-2``, ``-3``, ...). Suffixed ids that are already taken,
+    for instance by a literal ``foo-2`` seen earlier, are skipped, and every
+    id handed out is recorded so it cannot be issued twice.
+
+    ``used_ids`` is updated in place.
+    """
+
+    count = used_ids.get(unit_id, 0)
+    candidate = unit_id
+    while candidate in used_ids:
+        count += 1
+        candidate = f"{unit_id}-{count}"
+    used_ids[unit_id] = max(count, 1)
+    # Reserve the generated id as well, so a later literal occurrence of it
+    # is de-duplicated instead of colliding.
+    used_ids.setdefault(candidate, 1)
+    return candidate
+
+
+def _slug(value: str) -> str:
+    """Normalize text into a lowercase id.
+
+    Runs of characters other than ``a-z``, ``0-9``, ``_``, ``.``, ``:`` and
+    ``-`` become a single dash, and leading or trailing dashes are removed.
+    An empty result is replaced with ``"unit"``.
+    """
+
+    lowered = value.strip().lower()
+    dashed = re.sub(r"[^a-z0-9_.:-]+", "-", lowered)
+    collapsed = re.sub(r"-+", "-", dashed)
+    trimmed = collapsed.strip("-")
+    if trimmed:
+        return trimmed
+    return "unit"