generated from coulomb/repo-seed
extension for ref resolve, explode, implode, weave, tangle
This commit is contained in:
25
src/markitect_tool/reference/__init__.py
Normal file
25
src/markitect_tool/reference/__init__.py
Normal file
@@ -0,0 +1,25 @@
|
||||
"""Namespaced content reference resolution for Markdown artifacts."""
|
||||
|
||||
from markitect_tool.reference.engine import (
|
||||
ContentUnit,
|
||||
ReferenceAddress,
|
||||
ReferenceContext,
|
||||
ReferenceResolution,
|
||||
ReferenceResolutionError,
|
||||
SourceSpan,
|
||||
load_namespaces,
|
||||
parse_reference,
|
||||
resolve_reference,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"ContentUnit",
|
||||
"ReferenceAddress",
|
||||
"ReferenceContext",
|
||||
"ReferenceResolution",
|
||||
"ReferenceResolutionError",
|
||||
"SourceSpan",
|
||||
"load_namespaces",
|
||||
"parse_reference",
|
||||
"resolve_reference",
|
||||
]
|
||||
626
src/markitect_tool/reference/engine.py
Normal file
626
src/markitect_tool/reference/engine.py
Normal file
@@ -0,0 +1,626 @@
|
||||
"""Reference parsing and resolution for Markdown content units."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import re
|
||||
import shlex
|
||||
from dataclasses import asdict, dataclass, field
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from markdown_it import MarkdownIt
|
||||
|
||||
from markitect_tool.core import ContentBlock, Document, Heading, Section, parse_markdown
|
||||
from markitect_tool.query import InvalidQueryError, QueryMatch, query_document
|
||||
|
||||
|
||||
class ReferenceResolutionError(ValueError):
    """Signal that a content reference could not be parsed or resolved.

    Subclasses ``ValueError`` so callers that already catch ``ValueError``
    also catch reference failures.
    """
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ReferenceAddress:
    """Structured form of a compact content reference.

    The reference syntax is kept short so it reads well inside Markdown:

    - ``path/to/file.md``
    - ``std:clauses/payment.md``
    - ``std:clauses/payment.md#section:terms``
    - ``std:clauses/payment.md::sections[heading=Terms]``
    - ``#intro`` for a fragment in the current document
    """

    # The reference exactly as written (after outer whitespace is stripped).
    raw: str
    # Namespace prefix before the first ``:``, when one was recognised.
    namespace: str | None = None
    # Path portion of the reference; empty for pathless references.
    address: str = ""
    # Text after ``#``, when present.
    fragment: str | None = None
    # Text after ``::``, when present.
    selector: str | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return the populated fields as a dict, dropping ``None`` and ``""``."""
        populated: dict[str, Any] = {}
        for field_name, current in asdict(self).items():
            if current is None or current == "":
                continue
            populated[field_name] = current
        return populated
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ReferenceContext:
    """Everything needed to resolve namespaced and relative references."""

    # Directory that resolved targets must stay inside.
    root: Path = Path(".")
    # Path of the document that contains the reference, if known.
    current_path: Path | None = None
    # Namespace name -> configured path string.
    namespaces: dict[str, str] = field(default_factory=dict)

    @classmethod
    def from_document(
        cls,
        document: Document,
        *,
        root: str | Path = ".",
        current_path: str | Path | None = None,
    ) -> "ReferenceContext":
        """Create a context whose namespaces come from *document* frontmatter.

        An explicit *current_path* wins; otherwise the document's own
        ``source_path`` is used when it is set.
        """
        effective_path = current_path if current_path else document.source_path
        return cls(
            root=Path(root),
            current_path=None if not effective_path else Path(effective_path),
            namespaces=load_namespaces(document.frontmatter),
        )

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict view, leaving out entries whose value is ``None``."""
        entries = (
            ("root", str(self.root)),
            ("current_path", str(self.current_path) if self.current_path else None),
            ("namespaces", self.namespaces),
        )
        return {label: item for label, item in entries if item is not None}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class SourceSpan:
    """First and last line of a resolved unit within its source file."""

    line_start: int | None = None
    line_end: int | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return only the bounds that are set (``None`` bounds are omitted)."""
        bounds = (("line_start", self.line_start), ("line_end", self.line_end))
        return {label: number for label, number in bounds if number is not None}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ContentUnit:
    """A single addressable piece of content resolved from a Markdown file."""

    kind: str
    unit_id: str
    text: str
    source_path: str
    span: SourceSpan | None = None
    name: str | None = None
    content_hash: str = ""
    metadata: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict view of the unit.

        Entries whose value is ``None`` are dropped; empty ``metadata`` is
        treated as absent. ``text`` is emitted last.
        """
        ordered = (
            ("kind", self.kind),
            ("unit_id", self.unit_id),
            ("name", self.name),
            ("source_path", self.source_path),
            ("span", self.span.to_dict() if self.span else None),
            ("content_hash", self.content_hash),
            ("metadata", self.metadata or None),
            ("text", self.text),
        )
        return {label: item for label, item in ordered if item is not None}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ReferenceResolution:
    """Outcome of resolving one reference: the address, both paths, and units."""

    reference: ReferenceAddress
    source_path: str
    target_path: str
    units: list[ContentUnit]

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict view including a ``count`` of resolved units."""
        serialized_units = [unit.to_dict() for unit in self.units]
        payload: dict[str, Any] = {"reference": self.reference.to_dict()}
        payload["source_path"] = self.source_path
        payload["target_path"] = self.target_path
        payload["count"] = len(self.units)
        payload["units"] = serialized_units
        return payload
|
||||
|
||||
|
||||
# ``namespace:address`` — the namespace starts with a letter and may contain
# letters, digits, ``_``, ``.`` and ``-``.
_NAMESPACE_RE = re.compile(r"^(?P<namespace>[A-Za-z][A-Za-z0-9_.-]*):(?P<address>.*)$")
# Heading text with an optional trailing ``{#explicit-id}`` suffix.
_HEADING_ID_RE = re.compile(r"^(?P<title>.*?)(?:\s+\{#(?P<id>[A-Za-z0-9_.:-]+)\})?$")
# HTML-comment markers that open and close an ``mkt:region`` block.
_REGION_OPEN_RE = re.compile(r"<!--\s*mkt:region\s+(?P<attrs>.*?)\s*-->")
_REGION_CLOSE_RE = re.compile(r"<!--\s*/mkt:region\s*-->")
# Fence info string: optional language, then an optional ``{attrs}`` group.
# The group may start the string (attrs-only fence such as ``{#id}``) or follow
# whitespace after the language. Previously whitespace was mandatory before
# ``{``, so an attrs-only info string could never match once it was stripped.
_FENCE_ATTRS_RE = re.compile(r"^(?P<language>[^\s{]+)?(?:(?:^|\s+)\{(?P<attrs>.*)\})?\s*$")
|
||||
|
||||
|
||||
def parse_reference(reference: str) -> ReferenceAddress:
    """Split a compact Markitect reference into its parts.

    The text after the first ``::`` is the selector, the text after the first
    ``#`` of what remains is the fragment, and a leading ``name:`` prefix on
    the rest is the namespace.

    Raises:
        ReferenceResolutionError: if the reference is blank, or a ``::`` or
            ``#`` marker is present but nothing follows it.
    """
    raw = reference.strip()
    if not raw:
        raise ReferenceResolutionError("Reference cannot be empty")

    # Selector first: it may itself contain ``#`` characters.
    remainder, selector_marker, selector_text = raw.partition("::")
    selector: str | None = None
    if selector_marker:
        selector = selector_text.strip()
        if not selector:
            raise ReferenceResolutionError(f"Reference selector is empty in `{reference}`")

    remainder, fragment_marker, fragment_text = remainder.partition("#")
    fragment: str | None = None
    if fragment_marker:
        fragment = fragment_text.strip()
        if not fragment:
            raise ReferenceResolutionError(f"Reference fragment is empty in `{reference}`")

    address = remainder.strip()
    namespace: str | None = None
    prefixed = _NAMESPACE_RE.match(address)
    if prefixed is not None:
        candidate = prefixed.group("namespace")
        # A prefix containing a path separator is a path, not a namespace.
        if not ("/" in candidate or "\\" in candidate):
            namespace = candidate
            address = prefixed.group("address").strip()

    return ReferenceAddress(
        raw=raw,
        namespace=namespace,
        address=address,
        fragment=fragment,
        selector=selector,
    )
|
||||
|
||||
|
||||
def load_namespaces(frontmatter: dict[str, Any]) -> dict[str, str]:
    """Read and validate the ``namespaces`` mapping from Markdown frontmatter.

    Keys are stringified, stripped, and have trailing ``:`` removed; values
    must be non-blank strings and are stripped.

    Returns:
        A new ``{namespace: path}`` dict; empty when ``namespaces`` is absent
        or ``None``.

    Raises:
        ReferenceResolutionError: if ``namespaces`` is not a mapping, or any
            key or value fails validation.
    """
    declared = frontmatter.get("namespaces", {})
    if declared is None:
        return {}
    if not isinstance(declared, dict):
        raise ReferenceResolutionError("Frontmatter `namespaces` must be a mapping")

    cleaned: dict[str, str] = {}
    for raw_key, raw_value in declared.items():
        key = str(raw_key).strip().rstrip(":")
        if not key:
            raise ReferenceResolutionError("Namespace keys cannot be empty")
        # Validate the key with the same pattern used when parsing references.
        if _NAMESPACE_RE.match(f"{key}:") is None:
            raise ReferenceResolutionError(f"Invalid namespace key `{raw_key}`")
        if not isinstance(raw_value, str):
            raise ReferenceResolutionError(f"Namespace `{key}` must map to a string path")
        target = raw_value.strip()
        if not target:
            raise ReferenceResolutionError(f"Namespace `{key}` cannot map to an empty path")
        cleaned[key] = target
    return cleaned
|
||||
|
||||
|
||||
def resolve_reference(
    reference: str | ReferenceAddress,
    *,
    context: ReferenceContext,
) -> ReferenceResolution:
    """Resolve *reference* against *context* into one or more content units.

    A string reference is parsed first. The target file is located, read as
    UTF-8 and parsed as Markdown. Units are then picked by selector, by
    fragment, or — when neither is given — the whole document.

    Raises:
        ReferenceResolutionError: if the target is not an existing file, the
            reference has both a fragment and a selector, or nothing matches.
    """
    address = reference if not isinstance(reference, str) else parse_reference(reference)
    root = context.root.resolve()
    # Without a current document the root itself stands in as the source.
    source_path = root if not context.current_path else context.current_path.resolve()
    target_path = _resolve_target_path(address, context, root, source_path)
    if not (target_path.exists() and target_path.is_file()):
        raise ReferenceResolutionError(f"Referenced file not found: {target_path}")

    markdown = target_path.read_text(encoding="utf-8")
    document = parse_markdown(markdown, source_path=str(target_path))

    selector, fragment = address.selector, address.fragment
    if selector and fragment:
        raise ReferenceResolutionError("Reference cannot use both fragment and selector")

    if selector:
        units = _units_from_selector(document, selector, target_path)
    elif fragment:
        units = _units_from_fragment(document, fragment, target_path, markdown)
    else:
        units = [_document_unit(document, target_path, markdown)]

    if not units:
        raise ReferenceResolutionError(f"Reference `{address.raw}` did not match any content units")

    return ReferenceResolution(
        reference=address,
        source_path=str(source_path),
        target_path=str(target_path),
        units=units,
    )
|
||||
|
||||
|
||||
def _resolve_target_path(
    address: ReferenceAddress,
    context: ReferenceContext,
    root: Path,
    source_path: Path,
) -> Path:
    """Work out which file *address* points at and confirm it stays under *root*.

    Resolution order: namespaced address, then plain path (relative to the
    source file's directory, or to *root* when the source is not a file), then
    the current document for pathless references.

    Raises:
        ReferenceResolutionError: for an unknown namespace, a pathless
            reference without a current document, or a target outside *root*.
    """
    namespace = address.namespace
    if namespace:
        if namespace not in context.namespaces:
            raise ReferenceResolutionError(f"Unknown namespace `{address.namespace}`")
        namespace_root = _path_from_namespace(context.namespaces[namespace], root)
        # A namespace that maps to a directory is joined with the address;
        # otherwise the mapped path itself is the candidate.
        if namespace_root.is_dir():
            candidate = namespace_root / address.address
        else:
            candidate = namespace_root
    elif address.address:
        anchor = source_path.parent if source_path.is_file() else root
        requested = Path(address.address)
        candidate = requested if requested.is_absolute() else anchor / requested
    elif context.current_path:
        candidate = context.current_path
    else:
        raise ReferenceResolutionError("Pathless references require a current document")

    resolved = candidate.resolve()
    try:
        # ``relative_to`` raises ValueError when *resolved* is not under *root*.
        resolved.relative_to(root)
    except ValueError as exc:
        raise ReferenceResolutionError(f"Reference escapes root: {address.raw}") from exc
    else:
        return resolved
|
||||
|
||||
|
||||
def _path_from_namespace(raw_path: str, root: Path) -> Path:
    """Return the resolved path for a namespace entry.

    Relative entries are anchored at *root*; absolute entries are used as-is.
    """
    configured = Path(raw_path)
    anchored = configured if configured.is_absolute() else root / configured
    return anchored.resolve()
|
||||
|
||||
|
||||
def _units_from_selector(
    document: Document,
    selector: str,
    target_path: Path,
) -> list[ContentUnit]:
    """Run *selector* as a document query and wrap every match as a unit.

    Raises:
        ReferenceResolutionError: if the query layer rejects the selector.
    """
    try:
        hits = query_document(document, selector)
    except InvalidQueryError as exc:
        raise ReferenceResolutionError(str(exc)) from exc

    units: list[ContentUnit] = []
    for hit in hits:
        units.append(_unit_from_query_match(hit, target_path))
    return units
|
||||
|
||||
|
||||
def _units_from_fragment(
    document: Document,
    fragment: str,
    target_path: Path,
    markdown: str,
) -> list[ContentUnit]:
    """Resolve a ``#fragment`` into the content units it addresses.

    A fragment is ``kind:value``; a fragment with no value part is a bare id
    lookup. Supported kinds are ``document``, ``id``, ``section``,
    ``heading``, ``block``, ``region``, ``fence``, ``tag`` and ``line``.

    Args:
        document: Parsed form of *markdown*.
        fragment: Fragment text without the leading ``#``.
        target_path: File the units come from.
        markdown: Raw text of the target file.

    Returns:
        The matching units, possibly empty.

    Raises:
        ReferenceResolutionError: for an unsupported fragment kind, or when a
            helper rejects malformed input.
    """
    kind, _, value = fragment.partition(":")
    if not value:
        # ``#intro`` (or ``#intro:``) is shorthand for ``#id:intro``.
        kind, value = "id", kind
    lookup = _slug(value)

    def _matches_id_or_name(unit: ContentUnit) -> bool:
        return unit.unit_id == lookup or _slug(unit.name or "") == lookup

    if kind == "document":
        return [_document_unit(document, target_path, markdown)]
    if kind == "id":
        # Sources are tried in priority order and built lazily, so a match in
        # an earlier source never pays for (or fails on) parsing a later one.
        sources = (
            lambda: _section_units(document, target_path),
            lambda: _region_units(markdown, target_path),
            lambda: _fenced_block_units(markdown, target_path),
            lambda: _heading_units(document, target_path),
        )
        for build_units in sources:
            matches = [unit for unit in build_units() if _matches_id_or_name(unit)]
            if matches:
                return matches
        return []
    if kind == "section":
        return [unit for unit in _section_units(document, target_path) if _matches_id_or_name(unit)]
    if kind == "heading":
        return [unit for unit in _heading_units(document, target_path) if _matches_id_or_name(unit)]
    if kind == "block":
        return _block_fragment_units(document, target_path, value)
    if kind == "region":
        return [unit for unit in _region_units(markdown, target_path) if unit.unit_id == lookup]
    if kind == "fence":
        return [unit for unit in _fenced_block_units(markdown, target_path) if unit.unit_id == lookup]
    if kind == "tag":
        return [
            unit
            for unit in _region_units(markdown, target_path) + _fenced_block_units(markdown, target_path)
            if lookup in {_slug(tag) for tag in unit.metadata.get("tags", [])}
        ]
    if kind == "line":
        return _line_range_units(markdown, target_path, value)
    raise ReferenceResolutionError(f"Unsupported reference fragment kind `{kind}`")
|
||||
|
||||
|
||||
def _document_unit(document: Document, target_path: Path, markdown: str) -> ContentUnit:
    """Wrap the whole file as a single ``document`` unit.

    The id comes from the frontmatter ``id`` and the name from the frontmatter
    ``title``; each falls back to the file stem.
    """
    frontmatter = document.frontmatter
    stem = target_path.stem
    identifier = _slug(str(frontmatter.get("id") or stem))
    total_lines = len(markdown.splitlines())
    return _content_unit(
        kind="document",
        unit_id=identifier,
        text=markdown,
        source_path=target_path,
        span=SourceSpan(1, total_lines),
        name=str(frontmatter.get("title") or stem),
        metadata={"frontmatter": frontmatter},
    )
|
||||
|
||||
|
||||
def _unit_from_query_match(match: QueryMatch, target_path: Path) -> ContentUnit:
    """Convert one query match into a content unit.

    The id is a slug of the query path. The name is the first line of the
    matched text with leading ``#`` and spaces removed, or the match kind when
    there is no text.
    """
    flattened_path = match.path.replace("$.", "").replace("[", "-").replace("]", "")
    if match.text:
        first_line = match.text.splitlines()[0]
        name = first_line.lstrip("# ").strip()
    else:
        name = match.kind
    body = str(match.value) if match.text is None else match.text
    return _content_unit(
        kind=match.kind,
        unit_id=_slug(flattened_path),
        text=body,
        source_path=target_path,
        span=SourceSpan(match.line, None),
        name=name,
        metadata={"query_path": match.path, "value": match.value},
    )
|
||||
|
||||
|
||||
def _section_units(document: Document, target_path: Path) -> list[ContentUnit]:
    """Build one ``section`` unit per document section, de-duplicating ids."""
    seen_ids: dict[str, int] = {}
    units: list[ContentUnit] = []
    for section in document.sections:
        units.append(_section_unit(section, target_path, seen_ids))
    return units
|
||||
|
||||
|
||||
def _section_unit(
    section: Section,
    target_path: Path,
    used_ids: dict[str, int],
) -> ContentUnit:
    """Build the ``section`` unit for *section*.

    The text is the heading line followed by each non-empty block, separated
    by blank lines. *used_ids* is updated so repeated ids get a numeric
    suffix.
    """
    heading = section.heading
    blocks = section.blocks
    title, explicit_id = _heading_title_and_id(heading)
    unit_id = _dedupe_id(_slug(explicit_id or title), used_ids)
    # A section with no blocks ends on its heading line.
    last_line = blocks[-1].line_end if blocks else heading.line
    heading_line = f"{'#' * heading.level} {heading.text}"
    paragraphs = [heading_line, *(block.text for block in blocks if block.text)]
    return _content_unit(
        kind="section",
        unit_id=unit_id,
        text="\n\n".join(paragraphs).strip(),
        source_path=target_path,
        span=SourceSpan(heading.line, last_line),
        name=title,
        metadata={"heading_level": heading.level},
    )
|
||||
|
||||
|
||||
def _heading_units(document: Document, target_path: Path) -> list[ContentUnit]:
    """Build one single-line ``heading`` unit per heading, de-duplicating ids."""
    seen_ids: dict[str, int] = {}

    def _build(heading: Heading) -> ContentUnit:
        title, explicit_id = _heading_title_and_id(heading)
        return _content_unit(
            kind="heading",
            unit_id=_dedupe_id(_slug(explicit_id or title), seen_ids),
            text=f"{'#' * heading.level} {heading.text}",
            source_path=target_path,
            span=SourceSpan(heading.line, heading.line),
            name=title,
            metadata={"heading_level": heading.level},
        )

    return [_build(heading) for heading in document.headings]
|
||||
|
||||
|
||||
def _block_fragment_units(
    document: Document,
    target_path: Path,
    value: str,
) -> list[ContentUnit]:
    """Resolve a ``block:<value>`` fragment.

    A decimal *value* is a zero-based index into the document's blocks; any
    other value is slugged and compared with the generated block ids.

    Returns:
        A list with the matching unit(s), or an empty list when the index is
        out of range or no id matches.
    """
    blocks = _block_units(document.blocks, target_path)
    # ``isdecimal`` rather than ``isdigit``: the latter also accepts characters
    # such as "²" that ``int()`` rejects, which leaked a bare ValueError.
    if value.isdecimal():
        index = int(value)
        return [blocks[index]] if 0 <= index < len(blocks) else []
    lookup = _slug(value)
    return [unit for unit in blocks if unit.unit_id == lookup]
|
||||
|
||||
|
||||
def _block_units(blocks: list[ContentBlock], target_path: Path) -> list[ContentUnit]:
    """Build one unit per content block.

    Ids have the form ``<type>-<line_start>``, falling back to the block's
    position when ``line_start`` is falsy, and are de-duplicated.
    """
    seen_ids: dict[str, int] = {}

    def _build(position: int, block: ContentBlock) -> ContentUnit:
        raw_id = f"{block.type}-{block.line_start or position}"
        return _content_unit(
            kind=block.type,
            unit_id=_dedupe_id(_slug(raw_id), seen_ids),
            text=block.text,
            source_path=target_path,
            span=SourceSpan(block.line_start, block.line_end),
            name=block.type,
            metadata={"block_index": position},
        )

    return [_build(position, block) for position, block in enumerate(blocks)]
|
||||
|
||||
|
||||
def _region_units(markdown: str, target_path: Path) -> list[ContentUnit]:
    """Collect the ``mkt:region`` comment-delimited regions in *markdown*.

    Each unit's text is the content between the open and close marker lines
    (markers excluded, then stripped); its span covers both marker lines.

    Raises:
        ReferenceResolutionError: for nested regions, a close marker without
            an open one, an open marker without ``id``, or an unclosed region.
    """
    all_lines = markdown.splitlines()
    collected: list[ContentUnit] = []
    # (1-based line of the open marker, region id, tags) while inside a region.
    pending: tuple[int, str, list[str]] | None = None

    for line_no, line in enumerate(all_lines, start=1):
        opener = _REGION_OPEN_RE.search(line)
        closer = _REGION_CLOSE_RE.search(line)

        if opener is not None and pending is not None:
            raise ReferenceResolutionError("Nested mkt:region blocks are not supported")

        if closer is not None:
            if pending is None:
                raise ReferenceResolutionError("Region close marker has no matching open marker")
            opened_at, region_id, tags = pending
            pending = None
            # ``opened_at`` is 1-based, so as a 0-based index it is the first
            # line after the open marker; ``line_no - 1`` is the close marker.
            inner = all_lines[opened_at:line_no - 1]
            collected.append(
                _content_unit(
                    kind="region",
                    unit_id=_slug(region_id),
                    text="\n".join(inner).strip(),
                    source_path=target_path,
                    span=SourceSpan(opened_at, line_no),
                    name=region_id,
                    metadata={"tags": tags},
                )
            )
            continue

        if opener is None:
            continue
        attrs = _parse_attrs(opener.group("attrs"))
        region_id = attrs.get("id")
        if not region_id:
            raise ReferenceResolutionError("Region marker requires an id attribute")
        pending = (line_no, region_id, _tags_from_attrs(attrs))

    if pending is not None:
        raise ReferenceResolutionError("Region open marker has no matching close marker")
    return collected
|
||||
|
||||
|
||||
def _fenced_block_units(markdown: str, target_path: Path) -> list[ContentUnit]:
    """Collect fenced code blocks whose info string declares an id.

    Fences without an id are skipped. ``block_index`` in the metadata is the
    position of the fence token in the parser's token stream.
    """
    parser = MarkdownIt("commonmark", {"tables": True}).enable("table")
    seen_ids: dict[str, int] = {}
    collected: list[ContentUnit] = []
    reserved_keys = {"id", "language", "tag", "tags"}

    for position, token in enumerate(parser.parse(markdown)):
        if token.type != "fence":
            continue
        attrs = _parse_fence_info(token.info)
        declared_id = attrs.get("id")
        if not declared_id:
            continue

        if token.map:
            # ``token.map`` start is zero-based; the span start is one more.
            first_line, last_line = token.map[0] + 1, token.map[1]
        else:
            first_line = last_line = None

        extra_attrs = {key: value for key, value in attrs.items() if key not in reserved_keys}
        collected.append(
            _content_unit(
                kind="fenced_block",
                unit_id=_dedupe_id(_slug(declared_id), seen_ids),
                text=token.content,
                source_path=target_path,
                span=SourceSpan(first_line, last_line),
                name=declared_id,
                metadata={
                    "language": attrs.get("language"),
                    "tags": _tags_from_attrs(attrs),
                    "attrs": extra_attrs,
                    "block_index": position,
                },
            )
        )
    return collected
|
||||
|
||||
|
||||
def _line_range_units(markdown: str, target_path: Path, value: str) -> list[ContentUnit]:
    """Resolve a ``line:start`` or ``line:start-end`` fragment (1-based, inclusive).

    Returns:
        A single ``line_range`` unit, or an empty list when the range falls
        outside the file or ``end`` precedes ``start``.

    Raises:
        ReferenceResolutionError: if *value* is not ``N`` or ``N-M``.
    """
    parsed = re.match(r"^(?P<start>\d+)(?:-(?P<end>\d+))?$", value)
    if parsed is None:
        raise ReferenceResolutionError("Line fragments must use `line:start` or `line:start-end`")

    start = int(parsed.group("start"))
    # A missing end means a single-line range.
    end = int(parsed.group("end") or start)
    all_lines = markdown.splitlines()
    in_bounds = 1 <= start <= end <= len(all_lines)
    if not in_bounds:
        return []

    unit = _content_unit(
        kind="line_range",
        unit_id=f"line-{start}-{end}",
        text="\n".join(all_lines[start - 1:end]),
        source_path=target_path,
        span=SourceSpan(start, end),
        name=f"lines {start}-{end}",
        metadata={},
    )
    return [unit]
|
||||
|
||||
|
||||
def _parse_fence_info(info: str) -> dict[str, str]:
    """Turn a fence info string into an attribute dict.

    The info string is matched against ``_FENCE_ATTRS_RE``. When it does not
    match, the whole stripped string is returned as ``language`` (or ``{}``
    if blank). When it matches, the attrs group is parsed and the language is
    stored under ``language``.
    """
    info_text = info.strip()
    parsed = _FENCE_ATTRS_RE.match(info_text)
    if parsed is None:
        return {"language": info_text} if info_text else {}

    attrs = _parse_attrs(parsed.group("attrs") or "")
    language = parsed.group("language")
    if language:
        attrs["language"] = language

    if attrs and "id" not in attrs:
        # Promote the first key that still starts with ``#`` to the id.
        hash_key = next((key for key in attrs if key.startswith("#")), None)
        if hash_key is not None:
            attrs["id"] = hash_key[1:]
            del attrs[hash_key]
    return attrs
|
||||
|
||||
|
||||
def _parse_attrs(raw: str) -> dict[str, str]:
    """Parse a shell-style attribute string into a dict.

    Recognised token forms after ``shlex`` splitting:

    - ``#name``      -> ``{"id": "name"}``
    - ``key=value``  -> ``{"key": "value"}`` (both sides stripped)
    - ``flag``       -> ``{"flag": "true"}``

    Raises:
        ReferenceResolutionError: if *raw* cannot be tokenised, for example
            because of an unbalanced quote. This is still a ``ValueError``,
            the type ``shlex.split`` itself raises.
    """
    try:
        parts = shlex.split(raw)
    except ValueError as exc:
        # Surface tokeniser failures through the module's own error type so
        # callers handling reference errors also see malformed attributes.
        raise ReferenceResolutionError(f"Malformed attribute list `{raw}`: {exc}") from exc

    attrs: dict[str, str] = {}
    for part in parts:
        if part.startswith("#") and len(part) > 1:
            attrs["id"] = part[1:]
        elif "=" in part:
            key, value = part.split("=", 1)
            attrs[key.strip()] = value.strip()
        else:
            attrs[part] = "true"
    return attrs
|
||||
|
||||
|
||||
def _tags_from_attrs(attrs: dict[str, str]) -> list[str]:
    """Return the tag list from ``tags`` (preferred) or ``tag``.

    The value is split on runs of commas and spaces; empty pieces are dropped.
    """
    joined = attrs.get("tags") or attrs.get("tag") or ""
    pieces = (piece.strip() for piece in re.split(r"[, ]+", joined))
    return [piece for piece in pieces if piece]
|
||||
|
||||
|
||||
def _content_unit(
    *,
    kind: str,
    unit_id: str,
    text: str,
    source_path: Path,
    span: SourceSpan | None,
    name: str | None,
    metadata: dict[str, Any] | None = None,
) -> ContentUnit:
    """Build a ``ContentUnit``, filling in the content hash.

    The hash is ``sha256:`` followed by the hex SHA-256 of the UTF-8 encoded
    text. The source path is stored as a string, and missing or empty
    metadata becomes a fresh empty dict.
    """
    digest = hashlib.sha256(text.encode("utf-8")).hexdigest()
    fields: dict[str, Any] = {
        "kind": kind,
        "unit_id": unit_id,
        "text": text,
        "source_path": str(source_path),
        "span": span,
        "name": name,
        "content_hash": "sha256:" + digest,
        "metadata": metadata or {},
    }
    return ContentUnit(**fields)
|
||||
|
||||
|
||||
def _heading_title_and_id(heading: Heading) -> tuple[str, str | None]:
    """Split heading text into ``(title, explicit_id)``.

    The explicit id comes from a trailing ``{#id}`` suffix and is ``None``
    when the heading has no such suffix.
    """
    stripped = heading.text.strip()
    found = _HEADING_ID_RE.match(stripped)
    if found is None:
        return stripped, None
    title = found.group("title").strip()
    return title, found.group("id")
|
||||
|
||||
|
||||
def _dedupe_id(unit_id: str, used_ids: dict[str, int]) -> str:
    """Return *unit_id*, suffixed with ``-N`` from its second occurrence on.

    *used_ids* is mutated: it records how many times each id has been seen.
    """
    occurrence = used_ids.get(unit_id, 0) + 1
    used_ids[unit_id] = occurrence
    if occurrence == 1:
        return unit_id
    return f"{unit_id}-{occurrence}"
|
||||
|
||||
|
||||
def _slug(value: str) -> str:
    """Normalise *value* into a lowercase identifier.

    Runs of characters other than ``a-z``, ``0-9``, ``_``, ``.``, ``:`` and
    ``-`` become a single ``-``, and leading or trailing dashes are removed.
    An empty result becomes ``"unit"``.
    """
    lowered = value.strip().lower()
    dashed = re.sub(r"[^a-z0-9_.:-]+", "-", lowered)
    collapsed = re.sub(r"-+", "-", dashed).strip("-")
    return collapsed if collapsed else "unit"
|
||||
Reference in New Issue
Block a user