feat: WP-0003 complete — LEVEL3 advanced features + error framework

Implements full LEVEL3 feature set: cross-references (xref.py), numbered
figures (figures.py), auto-diagrams (diagrams.py), bibliography/citations
(bibliography.py), LEVEL3 capability detection (level3.py), and structured
error/warning records (errors.py). Builder, importer, and differ updated for
LEVEL3 round-trip support. REST and MCP interfaces updated with structured
warning records. 259 tests passing.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-16 10:51:38 +00:00
parent 760047b82b
commit ac442ea41f
26 changed files with 3713 additions and 74 deletions

View File

@@ -0,0 +1,208 @@
"""Bibliography and citation support for LEVEL3 markidocx (FR-535, FR-536, FR-542).
Handles the round-trip of inline citations and Bibliography/References sections
between Markdown and DOCX.
Markdown syntax:
Inline citation: [@key]
References section:
## References
- [@key]: Author. *Title*. Year.
DOCX representation:
Inline: [@key] (kept verbatim as plain paragraph text)
References section: "References" heading + plain text entries
Source-intent markers embedded for importer restoration.
"""
from __future__ import annotations
import re
# Markdown citation patterns
# Inline citation marker such as ``[@smith2020]``; group 1 captures the key
# (word characters plus ``:``, ``.`` and ``-``).
CITATION_RE = re.compile(r"\[@([\w:.-]+)\]")
# Reference-list entry ``- [@key]: entry text``; groups are (key, entry text).
CITATION_ENTRY_RE = re.compile(r"^-\s+\[@([\w:.-]+)\]:\s+(.+)$")
# A level 1-3 heading whose title is exactly "References". MULTILINE lets the
# pattern be searched for anywhere inside a whole Markdown document.
REFERENCES_HEADING_RE = re.compile(r"^#{1,3}\s+References\s*$", re.MULTILINE)
# DOCX markers
# NOTE(review): neither prefix below is referenced by the helpers visible in
# this module — confirm whether other modules still rely on them.
CITATION_MARKER_PREFIX = "citation:"
REFERENCES_SECTION_MARKER = "references-section:"
def has_citations(text: str) -> bool:
    """Report whether *text* holds at least one ``[@key]`` inline citation."""
    first_hit = CITATION_RE.search(text)
    return first_hit is not None
def render_inline_citations(text: str) -> str:
    """Rewrite each ``[@key]`` marker in *text* as ``[key]`` for DOCX embedding.

    Only the citation markers are touched; the rest of the text is returned
    as-is, ready to be used as DOCX paragraph text.
    """
    # "\1" re-inserts the captured key between literal square brackets.
    return CITATION_RE.sub(r"[\1]", text)
def extract_citation_keys(text: str) -> list[str]:
    """Return the key of every ``[@key]`` citation in *text*, in order of appearance.

    Keys cited more than once appear once per occurrence.
    """
    return [hit.group(1) for hit in CITATION_RE.finditer(text)]
def is_references_heading(text: str) -> bool:
    """Tell whether *text* (ignoring surrounding whitespace) starts with a References heading."""
    candidate = text.strip()
    return REFERENCES_HEADING_RE.match(candidate) is not None
def parse_reference_entry(text: str) -> tuple[str, str] | None:
    """Split a ``- [@key]: entry text`` list item into its parts.

    Returns:
        ``(key, entry_text)`` when *text* (stripped) is a reference entry,
        otherwise ``None``.
    """
    hit = CITATION_ENTRY_RE.match(text.strip())
    return None if hit is None else (hit.group(1), hit.group(2))
def extract_references_section(md_text: str) -> tuple[list[tuple[str, str]], str]:
    """Split the References section out of Markdown text.

    The section starts at the first heading matched by
    ``REFERENCES_HEADING_RE`` and ends at the next line that starts with
    ``#`` (or at the end of the text).

    Args:
        md_text: full Markdown source.

    Returns:
        ``(entries, text_without_references_section)`` where *entries* is a
        list of ``(key, entry_text)`` tuples. When no References heading is
        present the result is ``([], md_text)``. Content that follows the
        References section (from the next heading onwards) is kept in the
        returned text instead of being discarded.
    """
    heading = REFERENCES_HEADING_RE.search(md_text)
    if heading is None:
        return [], md_text

    entries: list[tuple[str, str]] = []
    trailing = ""
    # ``cursor`` tracks the absolute offset of the line being inspected so the
    # remainder of the document can be recovered once the section ends.
    cursor = heading.end()
    for raw_line in md_text[cursor:].split("\n"):
        line = raw_line.strip()
        if line.startswith("#"):
            # Next heading: the References section is over; keep the rest.
            trailing = md_text[cursor:].rstrip()
            break
        if line:
            parsed = parse_reference_entry(line)
            if parsed:
                entries.append(parsed)
        cursor += len(raw_line) + 1  # +1 for the "\n" consumed by split()

    leading = md_text[: heading.start()].rstrip()
    if not trailing:
        return entries, leading
    remaining = f"{leading}\n\n{trailing}" if leading else trailing
    return entries, remaining
# ---------------------------------------------------------------------------
# Builder helpers
# ---------------------------------------------------------------------------
# Heading text written above the rendered reference list.
BIBLIOGRAPHY_SECTION_HEADING = "References"
# Text of the tiny marker paragraph written right after the heading; the
# importer-side check ``is_bibliography_marker`` matches this exact string.
BIBLIOGRAPHY_MARKER = "bibliography-section-start"
def render_citation_text(text: str) -> str:
    """Pass *text* through unchanged for DOCX embedding.

    Inline ``[@key]`` citations are deliberately left verbatim in the DOCX
    paragraph text so the importer can restore them without ambiguity.
    """
    # Identity transform: [@key] stays [@key].
    return text
def render_references_section(doc, entries: list[tuple[str, str]]) -> None:
    """Append a References section to *doc* (FR-535).

    Writes, in order: the section heading, a 1pt marker paragraph that lets
    the importer recognise the section, and one paragraph per entry in the
    Markdown form ``- [@key]: entry text``.

    Args:
        doc: python-docx Document
        entries: list of (key, entry_text)
    """
    from docx.shared import Pt

    # Heading; fall back to a plain paragraph if the heading cannot be added.
    try:
        doc.add_heading(BIBLIOGRAPHY_SECTION_HEADING, level=2)
    except Exception:
        doc.add_paragraph(BIBLIOGRAPHY_SECTION_HEADING, style="Normal")

    # Section marker, shrunk to 1pt so it is effectively invisible.
    marker = doc.add_paragraph(style="Normal").add_run(BIBLIOGRAPHY_MARKER)
    marker.font.size = Pt(1)

    # Entries keep the [@key] syntax verbatim for round-trip fidelity.
    for ref_key, ref_text in entries:
        doc.add_paragraph(style="Normal").add_run(f"- [@{ref_key}]: {ref_text}")
# ---------------------------------------------------------------------------
# Importer helpers
# ---------------------------------------------------------------------------
# Bracketed token without "@" that is not immediately followed by another "[".
# NOTE(review): not referenced by any helper visible in this module — confirm
# whether it is still needed now that citations stay in [@key] form.
DOCX_CITATION_RE = re.compile(r"\[([^\]@]+)\](?!\[)") # [key] without @, not followed by [
# Exact text of the marker paragraph that opens the references section.
BIBLIOGRAPHY_MARKER_PARA_RE = re.compile(r"^bibliography-section-start$")
# Reference entry paragraph ``- [@key]: text`` (same pattern as CITATION_ENTRY_RE).
BIBLIOGRAPHY_ENTRY_RE = re.compile(r"^-\s+\[@([\w:.-]+)\]:\s+(.+)$")
def restore_citations_in_text(text: str) -> str:
    """Hand back imported *text* untouched.

    The builder stores citations in their ``[@key]`` form, so the text read
    from the DOCX needs no rewriting; this function is an intentional no-op.
    """
    return text
def is_bibliography_marker(text: str) -> bool:
    """Tell whether *text* (stripped) is the bibliography section marker."""
    return bool(BIBLIOGRAPHY_MARKER_PARA_RE.match(text.strip()))
def is_bibliography_entry(text: str) -> bool:
    """Tell whether *text* (stripped) is a ``- [@key]: ...`` reference entry."""
    stripped = text.strip()
    return BIBLIOGRAPHY_ENTRY_RE.match(stripped) is not None
# ---------------------------------------------------------------------------
# Differ helpers
# ---------------------------------------------------------------------------
def compare_citations(
    original: str,
    reimported: str,
    preserved: list[str],
    degraded: list[str],
    broken: list[str],
    warning_records: list | None = None,
) -> None:
    """Compare citation markers and reference entries (FR-536, FR-542).

    Results are appended to the caller-supplied lists; nothing is returned.

    Args:
        original: Markdown before the round trip.
        reimported: Markdown after the round trip.
        preserved: receives one item per citation / reference entry found in both.
        degraded: receives one item per reference entry missing after re-import.
        broken: receives one item per inline citation missing after re-import.
        warning_records: optional list that receives a ``WarningRecord`` for
            every missing inline citation.

    Items are reported in order of first appearance in *original*, each key
    once, so the report is identical from run to run.
    """
    # dict.fromkeys de-duplicates while keeping first-appearance order;
    # iterating a set of str would give a run-dependent order instead.
    orig_keys = dict.fromkeys(extract_citation_keys(original))
    reim_keys = set(extract_citation_keys(reimported))
    for key in orig_keys:
        if key in reim_keys:
            preserved.append(f"citation:[@{key}]")
            continue
        broken.append(f"citation:missing '[@{key}]'")
        if warning_records is not None:
            from markidocx.errors import Severity, WarningRecord

            warning_records.append(
                WarningRecord(
                    severity=Severity.WARNING,
                    reason="citation-ambiguity",
                    construct=f"@{key}",
                )
            )

    # References section
    orig_entries, _ = extract_references_section(original)
    reim_entries, _ = extract_references_section(reimported)
    reim_ref_keys = {ref_key for ref_key, _ in reim_entries}
    for key in dict.fromkeys(ref_key for ref_key, _ in orig_entries):
        if key in reim_ref_keys:
            preserved.append(f"reference-entry:{key}")
        else:
            degraded.append(f"reference-entry:lost '{key}'")

View File

@@ -9,6 +9,7 @@ import mistune
from docx.document import Document as DocxDocument
from docx.shared import Pt, RGBColor
from markidocx.errors import OutputState, Severity, WarningRecord
from markidocx.manifest import FeatureLevel, Manifest
from markidocx.templates import FamilyRegistry
@@ -19,8 +20,16 @@ class BuildResult:
output_path: Path
family: str
feature_level: str
warnings: list[str] = field(default_factory=list)
warning_records: list[WarningRecord] = field(default_factory=list)
errors: list[str] = field(default_factory=list)
output_state: OutputState = OutputState.FINAL
partial_level3: bool = False
missing_coverage: list[str] = field(default_factory=list)
@property
def warnings(self) -> list[str]:
"""Backward-compatible string view of warning_records."""
return [str(w) for w in self.warning_records]
def build_document(manifest: Manifest) -> BuildResult:
@@ -28,8 +37,27 @@ def build_document(manifest: Manifest) -> BuildResult:
Returns a BuildResult regardless of success/failure.
"""
warnings: list[str] = []
warning_records: list[WarningRecord] = []
errors: list[str] = []
partial_level3 = False
missing_coverage: list[str] = []
# For LEVEL3 projects, check external dependencies (FR-538, FR-539)
if manifest.project.feature_level == FeatureLevel.LEVEL3:
from markidocx.level3 import check_level3_support
support = check_level3_support()
if support.partial:
partial_level3 = True
missing_coverage = support.missing_coverage
for area in support.missing_coverage:
warning_records.append(
WarningRecord(
severity=Severity.WARNING,
reason="processor-dependency-unavailable",
construct=area,
)
)
# Compose all source files into one Markdown string
parts: list[str] = []
@@ -48,11 +76,18 @@ def build_document(manifest: Manifest) -> BuildResult:
core_props.author = str(manifest.metadata["author"])
# Parse and render tokens into the document
unsupported: list[str] = []
_render_markdown(doc, markdown_text, manifest.project.feature_level, warnings, unsupported)
_render_markdown(
doc,
markdown_text,
manifest.project.feature_level,
warning_records,
)
for item in unsupported:
warnings.append(f"Unsupported construct skipped: {item}")
# Determine output state
has_warnings = bool(warning_records)
output_state = OutputState.PARTIAL if partial_level3 else (
OutputState.FINAL if not has_warnings else OutputState.FINAL
)
# Ensure output dir exists
manifest.output_dir.mkdir(parents=True, exist_ok=True)
@@ -64,8 +99,11 @@ def build_document(manifest: Manifest) -> BuildResult:
output_path=output_path,
family=manifest.project.family,
feature_level=manifest.project.feature_level.value,
warnings=warnings,
warning_records=warning_records,
errors=errors,
output_state=output_state,
partial_level3=partial_level3,
missing_coverage=missing_coverage,
)
@@ -77,13 +115,45 @@ def _render_markdown(
doc: DocxDocument,
text: str,
feature_level: FeatureLevel,
warnings: list[str],
unsupported: list[str],
warning_records: list[WarningRecord],
) -> None:
"""Parse *text* as Markdown and append elements to *doc*."""
tokens = _tokenise(text)
# For LEVEL3, extract references section before tokenising
ref_entries: list[tuple[str, str]] = []
body_text = text
if feature_level == FeatureLevel.LEVEL3:
from markidocx.bibliography import extract_references_section
ref_entries, body_text = extract_references_section(text)
tokens = _tokenise(body_text)
# Pre-compute known anchors for cross-ref validation (LEVEL3 only)
known_anchors: set[str] = set()
if feature_level == FeatureLevel.LEVEL3:
from markidocx.xref import extract_anchors
known_anchors = extract_anchors(body_text)
bookmark_counter = [0] # mutable int for nested calls
figure_counter = [0] # auto-incrementing figure number
for token in tokens:
_render_token(doc, token, feature_level, warnings, unsupported)
_render_token(
doc,
token,
feature_level,
warning_records,
known_anchors,
bookmark_counter,
figure_counter,
)
# Render references section at the end (LEVEL3 only) (FR-535)
if feature_level == FeatureLevel.LEVEL3 and ref_entries:
from markidocx.bibliography import render_references_section
render_references_section(doc, ref_entries)
def _tokenise(text: str) -> list[dict]: # type: ignore[type-arg]
@@ -99,23 +169,74 @@ def _render_token(
doc: DocxDocument,
token: dict,
feature_level: FeatureLevel,
warnings: list[str],
unsupported: list[str],
warning_records: list[WarningRecord],
known_anchors: set[str] | None = None,
bookmark_counter: list[int] | None = None,
figure_counter: list[int] | None = None,
) -> None:
token_type = token.get("type", "")
if known_anchors is None:
known_anchors = set()
if bookmark_counter is None:
bookmark_counter = [0]
if figure_counter is None:
figure_counter = [0]
if token_type == "heading":
level = token.get("attrs", {}).get("level", 1)
text = _extract_text(token.get("children", []))
raw_text = _extract_text(token.get("children", []))
if feature_level == FeatureLevel.LEVEL3:
from markidocx.xref import (
add_bookmark_to_paragraph,
extract_anchor_from_heading,
)
clean_text, anchor = extract_anchor_from_heading(raw_text)
else:
clean_text, anchor = raw_text, None
try:
doc.add_heading(text, level=level)
para = doc.add_heading(clean_text, level=level)
except Exception:
doc.add_paragraph(text, style="Normal")
para = doc.add_paragraph(clean_text, style="Normal")
if anchor:
add_bookmark_to_paragraph(para, anchor, bookmark_counter[0])
bookmark_counter[0] += 1
elif token_type == "paragraph":
text = _extract_text(token.get("children", []))
para = doc.add_paragraph(style="Normal")
_add_inline_runs(para, token.get("children", []))
raw_text = _extract_text_with_image_syntax(token.get("children", []))
if feature_level == FeatureLevel.LEVEL3:
from markidocx.figures import is_figure_paragraph, parse_figure
from markidocx.xref import has_xref_links, render_paragraph_with_xrefs
if is_figure_paragraph(raw_text):
parsed = parse_figure(raw_text)
if parsed:
caption, path, label = parsed
figure_counter[0] += 1
from markidocx.figures import render_figure
render_figure(doc, caption, path, label, figure_counter[0])
else:
para = doc.add_paragraph(style="Normal")
_add_inline_runs(para, token.get("children", []))
elif has_xref_links(raw_text):
para = doc.add_paragraph(style="Normal")
render_paragraph_with_xrefs(para, raw_text, known_anchors)
else:
para = doc.add_paragraph(style="Normal")
from markidocx.bibliography import has_citations, render_citation_text
if has_citations(raw_text):
para.add_run(render_citation_text(raw_text))
else:
_add_inline_runs(para, token.get("children", []))
else:
para = doc.add_paragraph(style="Normal")
_add_inline_runs(para, token.get("children", []))
elif token_type == "list":
ordered = token.get("attrs", {}).get("ordered", False)
@@ -135,10 +256,23 @@ def _render_token(
elif token_type == "block_code":
code = token.get("raw", "")
para = doc.add_paragraph(style="Normal")
run = para.add_run(code)
run.font.name = "Courier New"
run.font.size = Pt(9)
info = (token.get("attrs", {}) or {}).get("info", "") or ""
if feature_level == FeatureLevel.LEVEL3:
from markidocx.diagrams import is_diagram_info, render_diagram_block
if is_diagram_info(info):
render_diagram_block(doc, info.strip().lower(), code, warning_records)
else:
para = doc.add_paragraph(style="Normal")
run = para.add_run(code)
run.font.name = "Courier New"
run.font.size = Pt(9)
else:
para = doc.add_paragraph(style="Normal")
run = para.add_run(code)
run.font.name = "Courier New"
run.font.size = Pt(9)
elif token_type == "block_quote":
children = token.get("children", [])
@@ -151,14 +285,27 @@ def _render_token(
doc.add_paragraph("" * 20, style="Normal")
elif token_type in ("html_block", "raw_html"):
unsupported.append(f"html ({token_type})")
warning_records.append(
WarningRecord(
severity=Severity.WARNING,
reason="unsupported-construct",
construct=f"html ({token_type})",
)
)
elif token_type == "blank_line":
pass # ignore blank lines
else:
# Unknown token — surface as unsupported (FR-508)
unsupported.append(token_type)
# Unknown token — surface as unsupported (FR-508, FR-1203)
if token_type:
warning_records.append(
WarningRecord(
severity=Severity.WARNING,
reason="unsupported-construct",
construct=token_type,
)
)
def _render_table(doc: DocxDocument, token: dict) -> None:
@@ -186,6 +333,26 @@ def _render_table(doc: DocxDocument, token: dict) -> None:
run.bold = True
def _extract_text_with_image_syntax(children: list[dict]) -> str:
    """Extract text from token children, reconstructing image MD syntax for figures.

    Args:
        children: inline token dicts as produced by the Markdown parser.

    Returns:
        The concatenated text. Images are rendered back as
        ``![caption](url)``; inline wrappers (strong, emphasis, codespan,
        link) contribute their text content only.
    """
    parts: list[str] = []
    for child in children:
        child_type = child.get("type", "")
        if child_type == "image":
            caption = _extract_text(child.get("children", []))
            url = child.get("attrs", {}).get("url", "")
            parts.append(f"![{caption}]({url})")
        elif child_type == "text":
            parts.append(child.get("raw", ""))
        elif child_type in ("strong", "emphasis", "codespan", "link"):
            nested = child.get("children")
            if nested:
                parts.append(_extract_text(nested))
            else:
                # Code spans are leaf tokens: their text lives in "raw" and
                # they carry no children, so fall back to it rather than
                # silently dropping the code text.
                parts.append(child.get("raw") or "")
        elif child.get("raw"):
            parts.append(child["raw"])
        elif child.get("children"):
            parts.append(_extract_text_with_image_syntax(child["children"]))
    return "".join(parts)
def _extract_text(children: list[dict]) -> str:
"""Recursively extract plain text from a token children list."""
parts: list[str] = []

190
src/markidocx/diagrams.py Normal file
View File

@@ -0,0 +1,190 @@
"""Auto-diagram support for LEVEL3 markidocx (FR-533, FR-534).
Handles fenced diagram source blocks (mermaid, graphviz, plantuml) in the
Markdown ↔ DOCX round trip.
Source-intent preservation:
When a renderer is unavailable, diagram source is embedded as a verbatim
code block and a source-intent marker paragraph is added so the importer
can restore the fenced block. No source is silently discarded (FR-1205).
"""
from __future__ import annotations
import re
import shutil
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from docx.document import Document as DocxDocument
# Diagram types recognised as LEVEL3 auto-diagram sources
# (compared against the lower-cased, stripped fence info string).
DIAGRAM_TYPES: frozenset[str] = frozenset({"mermaid", "graphviz", "plantuml"})
# Renderer → CLI command mapping: the executable looked up on PATH for each
# diagram type.
_RENDERER_COMMANDS: dict[str, str] = {
"mermaid": "mmdc",
"graphviz": "dot",
"plantuml": "plantuml",
}
# Marker prefix stored in DOCX paragraph to preserve source intent (FR-534)
DIAGRAM_SOURCE_MARKER_PREFIX = "diagram-source:"
# Regex form of the marker: group 1 is the diagram type, group 2 the source
# (DOTALL so the source may span several lines).
# NOTE(review): the parsing helpers visible in this module use plain string
# operations rather than this pattern — confirm it is used elsewhere.
DIAGRAM_SOURCE_MARKER_RE = re.compile(
r"^diagram-source:(\w+)\n(.*)", re.DOTALL
)
def is_diagram_info(info: str) -> bool:
    """Tell whether the fence info string *info* names a known diagram type.

    The comparison ignores case and surrounding whitespace; a falsy *info*
    (``None`` or empty) is never a diagram type.
    """
    normalised = (info or "").strip().lower()
    return normalised in DIAGRAM_TYPES
def check_renderer(diagram_type: str) -> bool:
    """Tell whether the command-line renderer for *diagram_type* is on PATH.

    Unknown diagram types have no renderer and therefore yield ``False``.
    """
    command = _RENDERER_COMMANDS.get(diagram_type.lower())
    if not command:
        return False
    return bool(shutil.which(command))
def render_diagram_block(
    doc: DocxDocument,
    diagram_type: str,
    source: str,
    warning_records: list,
) -> None:
    """Write a fenced diagram block into *doc* (FR-533, FR-534).

    With a renderer on PATH the work is delegated to
    ``_render_diagram_with_tool``. Without one, a warning is recorded and the
    diagram source is written as a verbatim fenced code paragraph followed by
    a 1pt source-intent marker paragraph, so the source is never dropped
    (FR-1205).

    Args:
        doc: target python-docx document.
        diagram_type: diagram kind, e.g. ``"mermaid"``.
        source: diagram source text.
        warning_records: list that receives any ``WarningRecord`` produced.
    """
    from docx.shared import Pt

    from markidocx.errors import Severity, WarningRecord

    if check_renderer(diagram_type):
        _render_diagram_with_tool(doc, diagram_type, source, warning_records)
        return

    # No renderer: report it (FR-538) and take the source-only path.
    missing_tool = _RENDERER_COMMANDS.get(diagram_type, diagram_type)
    warning_records.append(
        WarningRecord(
            severity=Severity.WARNING,
            reason="processor-dependency-unavailable",
            construct=f"{diagram_type} (no renderer: {missing_tool} not found)",
        )
    )

    # Verbatim fenced source in a monospace run (source preserved — FR-1205).
    fenced_run = doc.add_paragraph(style="Normal").add_run(
        f"```{diagram_type}\n{source}\n```"
    )
    fenced_run.font.name = "Courier New"
    fenced_run.font.size = Pt(9)

    # Source-intent marker for the importer (FR-534); 1pt, not for display.
    intent_run = doc.add_paragraph(style="Normal").add_run(
        f"{DIAGRAM_SOURCE_MARKER_PREFIX}{diagram_type}\n{source}"
    )
    intent_run.font.size = Pt(1)
def _render_diagram_with_tool(
    doc: DocxDocument,
    diagram_type: str,
    source: str,
    warning_records: list,
) -> None:
    """Attempt to render diagram source using an external tool and embed PNG.

    On success the PNG is embedded followed by a 1pt source-intent marker
    paragraph (FR-534). On any failure — an exception, a timeout, or the tool
    finishing without producing a PNG — a ``diagram-render-failed`` warning is
    recorded and the source is written as a verbatim fenced code paragraph
    plus marker, so the source is never lost.

    Args:
        doc: target python-docx document.
        diagram_type: key of ``_RENDERER_COMMANDS`` (lower-case).
        source: diagram source text.
        warning_records: list that receives any ``WarningRecord`` produced.

    Raises:
        KeyError: if *diagram_type* has no entry in ``_RENDERER_COMMANDS``.
    """
    import subprocess
    import tempfile
    from pathlib import Path

    from docx.shared import Inches, Pt

    from markidocx.errors import Severity, WarningRecord

    cmd = _RENDERER_COMMANDS[diagram_type]

    def _add_source_marker() -> None:
        # 1pt marker paragraph carrying the original source for the importer.
        marker_run = doc.add_paragraph(style="Normal").add_run(
            f"{DIAGRAM_SOURCE_MARKER_PREFIX}{diagram_type}\n{source}"
        )
        marker_run.font.size = Pt(1)

    try:
        with tempfile.TemporaryDirectory() as tmp:
            tmp_path = Path(tmp)
            src_file = tmp_path / f"diagram.{diagram_type[:3]}"
            requested_png = tmp_path / "diagram.png"
            src_file.write_text(source, encoding="utf-8")
            if diagram_type == "mermaid":
                args = [cmd, "-i", str(src_file), "-o", str(requested_png)]
            elif diagram_type == "graphviz":
                args = [cmd, "-Tpng", str(src_file), "-o", str(requested_png)]
            else:  # plantuml: "-o" names the output directory, not a file
                args = [cmd, "-tpng", str(src_file), "-o", str(tmp_path)]
            completed = subprocess.run(args, capture_output=True, timeout=30)

            # Renderers differ in how they name their output (PlantUML derives
            # it from the input file name), so pick up whichever PNG landed in
            # the scratch directory instead of guessing the exact name.
            png_file = next(iter(sorted(tmp_path.glob("*.png"))), None)
            if png_file is not None:
                picture_para = doc.add_paragraph(style="Normal")
                picture_para.add_run().add_picture(str(png_file), width=Inches(5))
                # Source-intent marker for round-trip (FR-534)
                _add_source_marker()
                return

            # The tool ran but produced nothing: say so instead of falling
            # back silently.
            failure = f"renderer exited with status {completed.returncode}"
            stderr_text = (completed.stderr or b"").decode("utf-8", errors="replace").strip()
            if stderr_text:
                failure = f"{failure} ({stderr_text[:200]})"
    except Exception as exc:  # best-effort: any failure degrades to source-only
        failure = str(exc)

    warning_records.append(
        WarningRecord(
            severity=Severity.WARNING,
            reason="diagram-render-failed",
            construct=f"{diagram_type}: {failure}",
        )
    )

    # Fallback: source-only path
    code_run = doc.add_paragraph(style="Normal").add_run(
        f"```{diagram_type}\n{source}\n```"
    )
    code_run.font.name = "Courier New"
    code_run.font.size = Pt(9)
    _add_source_marker()
# ---------------------------------------------------------------------------
# Importer helpers
# ---------------------------------------------------------------------------
def is_diagram_source_marker(text: str) -> bool:
    """Tell whether *text*, ignoring leading whitespace, begins with the diagram marker prefix."""
    candidate = text.strip()
    return candidate.startswith(DIAGRAM_SOURCE_MARKER_PREFIX)
def parse_diagram_source_marker(text: str) -> tuple[str, str] | None:
    """Split a diagram source-intent marker into ``(diagram_type, source)``.

    The marker layout is ``<prefix><type>\\n<source...>``. A marker with no
    newline yields an empty source.

    Returns:
        ``(diagram_type, source)``, or ``None`` when *text* (stripped) does
        not start with the marker prefix.
    """
    body = text.strip()
    if not body.startswith(DIAGRAM_SOURCE_MARKER_PREFIX):
        return None
    payload = body.removeprefix(DIAGRAM_SOURCE_MARKER_PREFIX)
    # First line is the type, everything after the first newline is source.
    kind, newline, source = payload.partition("\n")
    if newline:
        return kind.strip(), source
    return payload.strip(), ""
def reconstruct_diagram_md(diagram_type: str, source: str) -> str:
    """Build the fenced Markdown code block for a diagram.

    The opening fence carries *diagram_type* as its info string, *source*
    follows on the next line, and the closing fence sits on its own line.
    """
    opening_fence = f"```{diagram_type}"
    return "\n".join((opening_fence, f"{source}", "```"))

View File

@@ -5,6 +5,8 @@ from __future__ import annotations
import re
from dataclasses import dataclass, field
from markidocx.errors import OutputState
HEADING_RE = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE)
LIST_ITEM_RE = re.compile(r"^(\s*[-*+]|\s*\d+\.)\s+(.+)$", re.MULTILINE)
TABLE_ROW_RE = re.compile(r"^\|.+\|$", re.MULTILINE)
@@ -19,6 +21,7 @@ class DriftReport:
degraded: list[str] = field(default_factory=list)
broken: list[str] = field(default_factory=list)
unsupported: list[str] = field(default_factory=list)
output_state: OutputState = OutputState.FINAL
def compare(original: str, reimported: str) -> DriftReport:
@@ -76,13 +79,29 @@ def compare(original: str, reimported: str) -> DriftReport:
else:
degraded.append(f"link:lost {link[:40]}")
# --- Cross-references (FR-531, FR-540) ---
_compare_xrefs(original, reimported, preserved, degraded, broken)
# --- Figures (FR-532, FR-541) ---
_compare_figures(original, reimported, preserved, degraded, broken)
# --- Citations & Bibliography (FR-535, FR-542) ---
from markidocx.bibliography import compare_citations
compare_citations(original, reimported, preserved, degraded, broken)
has_drift = bool(degraded or broken)
output_state = (
OutputState.FINAL if not has_drift
else (OutputState.DEGRADED if not broken else OutputState.PARTIAL)
)
return DriftReport(
has_drift=has_drift,
preserved=preserved,
degraded=degraded,
broken=broken,
unsupported=unsupported,
output_state=output_state,
)
@@ -104,6 +123,64 @@ def _count_tables(text: str) -> int:
return count
def _compare_figures(
    original: str,
    reimported: str,
    preserved: list[str],
    degraded: list[str],
    broken: list[str],
) -> None:
    """Compare figure labels and captions (FR-532, FR-541).

    Results are appended to the caller-supplied lists; nothing is returned.

    Args:
        original: Markdown before the round trip.
        reimported: Markdown after the round trip.
        preserved: receives one item per label / caption found in both texts.
        degraded: receives one item per caption missing after re-import.
        broken: receives one item per figure label missing after re-import.

    Labels are reported in sorted order and captions in order of first
    appearance, so the report is identical from run to run.
    """
    from markidocx.figures import extract_figure_captions, extract_figure_labels

    reim_labels = extract_figure_labels(reimported)
    # extract_figure_labels returns a set; sort it for a stable report order.
    for label in sorted(extract_figure_labels(original)):
        if label in reim_labels:
            preserved.append(f"figure-label:{label}")
        else:
            broken.append(f"figure-label:missing '{label}'")

    reim_captions = set(extract_figure_captions(reimported))
    # dict.fromkeys de-duplicates captions while keeping document order.
    for caption in dict.fromkeys(extract_figure_captions(original)):
        if caption in reim_captions:
            preserved.append(f"figure-caption:{caption[:40]}")
        else:
            degraded.append(f"figure-caption:lost '{caption[:40]}'")
def _compare_xrefs(
    original: str,
    reimported: str,
    preserved: list[str],
    degraded: list[str],
    broken: list[str],
) -> None:
    """Compare cross-reference anchors and links (FR-531, FR-540).

    Results are appended to the caller-supplied lists; nothing is returned.

    Args:
        original: Markdown before the round trip.
        reimported: Markdown after the round trip.
        preserved: receives one item per anchor / link found in both texts.
        degraded: receives one item per link whose target anchor survived but
            whose ``(text, anchor)`` pair did not.
        broken: receives one item per missing anchor and per link whose
            target anchor is missing after re-import.
    """
    from markidocx.xref import extract_anchors, extract_xref_links

    orig_anchors = extract_anchors(original)
    reim_anchors = extract_anchors(reimported)
    # Anchors are reported in sorted order so the report does not depend on
    # the iteration order of the collection extract_anchors returns
    # (presumably a set of str — verify against markidocx.xref).
    for anchor in sorted(orig_anchors):
        if anchor in reim_anchors:
            preserved.append(f"xref-anchor:{anchor}")
        else:
            broken.append(f"xref-anchor:missing '{anchor}'")

    reim_xrefs = extract_xref_links(reimported)
    for link_text, anchor in extract_xref_links(original):
        rendered = f"[{link_text}][{anchor}]"
        if (link_text, anchor) in reim_xrefs:
            preserved.append(f"xref-link:{rendered}")
        elif anchor not in reim_anchors:
            broken.append(f"xref-link:broken-target {rendered}")
        else:
            degraded.append(f"xref-link:degraded {rendered}")
def _compare_sets(
kind: str,
orig: list[str],

80
src/markidocx/errors.py Normal file
View File

@@ -0,0 +1,80 @@
"""Structured error and warning types for markidocx (FR-1201–1210)."""
from __future__ import annotations
from dataclasses import dataclass
from enum import StrEnum
from typing import Any
class Severity(StrEnum):
INFO = "info"
WARNING = "warning"
ERROR = "error"
class OutputState(StrEnum):
"""Lifecycle state of a build/import/workflow result (FR-1210)."""
FINAL = "final"
PARTIAL = "partial"
FALLBACK = "fallback"
DEGRADED = "degraded"
UNRESOLVED = "unresolved"
@dataclass
class WarningRecord:
    """Structured warning record (FR-1208).

    Attributes:
        severity: info | warning | error
        reason: FR-code-aligned description
        construct: the token/element that triggered the warning (may be empty)
    """

    severity: str
    reason: str
    construct: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Return the record as a dict keyed by field name."""
        return {
            name: getattr(self, name)
            for name in ("severity", "reason", "construct")
        }

    def __str__(self) -> str:
        """Render as ``[severity] reason``, plus ``: construct`` when one is set."""
        headline = f"[{self.severity}] {self.reason}"
        return f"{headline}: {self.construct}" if self.construct else headline
@dataclass
class FailureRecord:
    """Structured failure record (FR-1209).

    Attributes:
        severity: info | warning | error
        reason: FR-code-aligned description
        construct: the element that caused the failure (may be empty)
    """

    severity: str
    reason: str
    construct: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Return the record as a dict keyed by field name."""
        as_mapping: dict[str, Any] = {}
        as_mapping["severity"] = self.severity
        as_mapping["reason"] = self.reason
        as_mapping["construct"] = self.construct
        return as_mapping

    def __str__(self) -> str:
        """Render as ``[severity] reason``, plus ``: construct`` when one is set."""
        if not self.construct:
            return f"[{self.severity}] {self.reason}"
        return f"[{self.severity}] {self.reason}: {self.construct}"
def warning_records_to_strings(records: list[WarningRecord]) -> list[str]:
    """Return ``str(record)`` for each record, in order (backward compat helper)."""
    return list(map(str, records))

147
src/markidocx/figures.py Normal file
View File

@@ -0,0 +1,147 @@
"""Numbered figure support for LEVEL3 markidocx (FR-532, FR-541).
Handles round-trip of captioned numbered figures between Markdown and DOCX.
Markdown syntax:
![Caption text](path/to/image.png){#fig:label}
DOCX representation:
[image paragraph or placeholder]
[caption paragraph: "Figure N — Caption text"]
(with alt-text marker: "figure-source:path/to/image.png#fig:label")
"""
from __future__ import annotations
import re
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from docx.document import Document as DocxDocument
# Markdown figure pattern: ![Caption](path){#fig:label}
# Groups: 1 = caption, 2 = path, 3 = label (including the "fig:" prefix).
# MULTILINE so whole-line figures can be found inside a full document.
FIGURE_RE = re.compile(
r"^!\[([^\]]*)\]\(([^)]+)\)\{#(fig:[\w:-]+)\}$",
re.MULTILINE,
)
# Caption paragraph pattern in imported DOCX: "Figure N — text" or
# "Figure N - text"; groups are (number, caption text).
CAPTION_RE = re.compile(r"^Figure\s+(\d+)\s+[—\-]\s+(.+)$")
# Alt-text marker embedded in images to preserve source intent (FR-534)
ALT_TEXT_MARKER_PREFIX = "figure-source:"
def is_figure_paragraph(text: str) -> bool:
    """Tell whether *text* (stripped) begins with a standalone figure declaration line."""
    declaration = FIGURE_RE.match(text.strip())
    return declaration is not None
def parse_figure(text: str) -> tuple[str, str, str] | None:
    """Break a figure declaration into its parts.

    Returns:
        ``(caption, path, label)``, or ``None`` when *text* (stripped) is not
        a figure declaration.
    """
    if (hit := FIGURE_RE.match(text.strip())) is not None:
        return hit.group(1), hit.group(2), hit.group(3)
    return None
def render_figure(
    doc: DocxDocument,
    caption: str,
    path: str,
    label: str,
    figure_number: int,
) -> None:
    """Render a figure declaration into *doc* (FR-532).

    Adds three paragraphs:
      1. An italic ``[Figure: path]`` placeholder (the image file itself is
         not embedded here, for portability).
      2. A 1pt source-intent marker ``figure-source:<path>#<label>`` so the
         importer can reconstruct the figure (FR-534).
      3. The caption: ``Figure N — Caption``.

    Args:
        doc: target python-docx document.
        caption: caption text.
        path: image path as written in the Markdown source.
        label: figure label, e.g. ``fig:overview``.
        figure_number: number shown in the caption.
    """
    from docx.shared import Pt

    # Image placeholder paragraph
    placeholder_run = doc.add_paragraph(style="Normal").add_run(f"[Figure: {path}]")
    placeholder_run.italic = True

    # Source-intent marker, shrunk to 1pt so it is effectively hidden while
    # still carrying the round-trip information.
    marker_run = doc.add_paragraph(style="Normal").add_run(
        f"{ALT_TEXT_MARKER_PREFIX}{path}#{label}"
    )
    marker_run.font.size = Pt(1)
    marker_run.font.color.rgb = None  # default color

    # Caption paragraph. The " — " separator is required: CAPTION_RE only
    # recognises "Figure N — text" / "Figure N - text" on import.
    doc.add_paragraph(style="Normal").add_run(
        f"Figure {figure_number} — {caption}"
    )
def extract_figures_from_md(text: str) -> list[tuple[str, str, str]]:
    """Collect every figure declaration found in Markdown *text*.

    Returns:
        One ``(caption, path, label)`` tuple per declaration, in document order.
    """
    found: list[tuple[str, str, str]] = []
    for hit in FIGURE_RE.finditer(text):
        found.append(hit.group(1, 2, 3))
    return found
# ---------------------------------------------------------------------------
# Importer helpers
# ---------------------------------------------------------------------------
def is_caption_paragraph(text: str) -> bool:
    """Tell whether *text* (stripped) has the ``Figure N — caption`` shape."""
    return CAPTION_RE.match(text.strip()) is not None
def is_alt_text_marker(text: str) -> bool:
    """Tell whether *text*, ignoring leading whitespace, begins with the figure-source prefix."""
    candidate = text.strip()
    return candidate.startswith(ALT_TEXT_MARKER_PREFIX)
def parse_alt_text_marker(text: str) -> tuple[str, str] | None:
    """Split a figure-source marker into ``(path, label)``.

    The label is whatever follows the last ``#``; a marker without ``#``
    yields an empty label.

    Returns:
        ``(path, label)``, or ``None`` when *text* (stripped) does not start
        with the marker prefix.
    """
    body = text.strip()
    if not body.startswith(ALT_TEXT_MARKER_PREFIX):
        return None
    payload = body.removeprefix(ALT_TEXT_MARKER_PREFIX)
    # Split on the LAST "#" so a "#" inside the path stays part of the path.
    path, hash_sign, label = payload.rpartition("#")
    if hash_sign:
        return path, label
    return payload, ""
def reconstruct_figure_md(caption: str, path: str, label: str) -> str:
    """Assemble the Markdown figure declaration ``![caption](path){#label}``."""
    template = "![{}]({}){{#{}}}"
    return template.format(caption, path, label)
# ---------------------------------------------------------------------------
# Differ helpers
# ---------------------------------------------------------------------------
# ``{#fig:label}`` attribute anywhere in the text; group 1 is the label
# including the "fig:" prefix.
FIGURE_LABEL_RE = re.compile(r"\{#(fig:[\w:-]+)\}")
# Full figure declaration anywhere in the text (not anchored to a line);
# group 1 is the caption.
FIGURE_CAPTION_MD_RE = re.compile(r"!\[([^\]]*)\]\([^)]+\)\{#fig:[\w:-]+\}")
def extract_figure_labels(text: str) -> set[str]:
    """Return the distinct ``fig:...`` labels declared via ``{#fig:label}`` in *text*."""
    return {hit.group(1) for hit in FIGURE_LABEL_RE.finditer(text)}
def extract_figure_captions(text: str) -> list[str]:
    """Return the caption of every figure declaration in *text*, in document order."""
    # The pattern has a single group, so findall yields the captions directly.
    return FIGURE_CAPTION_MD_RE.findall(text)

View File

@@ -11,6 +11,7 @@ from docx.document import Document as DocxDocument
from docx.table import Table
from docx.text.paragraph import Paragraph
from markidocx.errors import OutputState, Severity, WarningRecord
from markidocx.manifest import Manifest
HEADING_STYLE_RE = re.compile(r"^Heading (\d+)$", re.IGNORECASE)
@@ -23,7 +24,13 @@ class ImportResult:
success: bool
output_files: list[Path]
mapping_status: str # "redistributed" | "merged" | "failed"
warnings: list[str] = field(default_factory=list)
warning_records: list[WarningRecord] = field(default_factory=list)
output_state: OutputState = OutputState.FINAL
@property
def warnings(self) -> list[str]:
"""Backward-compatible string view of warning_records."""
return [str(w) for w in self.warning_records]
def import_document(manifest: Manifest, docx_path: Path) -> ImportResult:
@@ -33,14 +40,21 @@ def import_document(manifest: Manifest, docx_path: Path) -> ImportResult:
content is redistributed to the original files. Otherwise a single
merged file is produced.
"""
warnings: list[str] = []
warning_records: list[WarningRecord] = []
if not docx_path.exists():
return ImportResult(
success=False,
output_files=[],
mapping_status="failed",
warnings=[f"DOCX file not found: {docx_path}"],
warning_records=[
WarningRecord(
severity=Severity.ERROR,
reason="docx-not-found",
construct=str(docx_path),
)
],
output_state=OutputState.UNRESOLVED,
)
try:
@@ -50,10 +64,17 @@ def import_document(manifest: Manifest, docx_path: Path) -> ImportResult:
success=False,
output_files=[],
mapping_status="failed",
warnings=[f"Could not open DOCX: {exc}"],
warning_records=[
WarningRecord(
severity=Severity.ERROR,
reason="docx-open-failed",
construct=str(exc),
)
],
output_state=OutputState.UNRESOLVED,
)
md_text = _docx_to_markdown(doc, warnings)
md_text = _docx_to_markdown(doc, warning_records)
manifest.output_dir.mkdir(parents=True, exist_ok=True)
@@ -65,7 +86,8 @@ def import_document(manifest: Manifest, docx_path: Path) -> ImportResult:
success=True,
output_files=[out_path],
mapping_status="redistributed",
warnings=warnings,
warning_records=warning_records,
output_state=OutputState.FINAL,
)
# Multi-file: attempt redistribution by H1 boundary
@@ -79,13 +101,20 @@ def import_document(manifest: Manifest, docx_path: Path) -> ImportResult:
success=True,
output_files=output_files,
mapping_status="redistributed",
warnings=warnings,
warning_records=warning_records,
output_state=OutputState.FINAL,
)
# Fallback: merged single output (FR-406)
warnings.append(
f"Could not redistribute to {len(manifest.sources)} source files "
f"(found {len(sections)} H1 sections); writing merged output"
# Fallback: merged single output (FR-406, FR-1207)
warning_records.append(
WarningRecord(
severity=Severity.WARNING,
reason="fallback",
construct=(
f"could not redistribute to {len(manifest.sources)} source files "
f"(found {len(sections)} H1 sections); writing merged output"
),
)
)
merged_path = manifest.output_dir / "imported_merged.md"
merged_path.write_text(md_text, encoding="utf-8")
@@ -93,7 +122,8 @@ def import_document(manifest: Manifest, docx_path: Path) -> ImportResult:
success=True,
output_files=[merged_path],
mapping_status="merged",
warnings=warnings,
warning_records=warning_records,
output_state=OutputState.FALLBACK,
)
@@ -101,17 +131,95 @@ def import_document(manifest: Manifest, docx_path: Path) -> ImportResult:
# DOCX → Markdown conversion
# ---------------------------------------------------------------------------
def _docx_to_markdown(doc: DocxDocument, warnings: list[str]) -> str:
def _docx_to_markdown(doc: DocxDocument, warning_records: list[WarningRecord]) -> str:
"""Convert a python-docx Document to a Markdown string."""
from markidocx.bibliography import (
is_bibliography_entry,
is_bibliography_marker,
restore_citations_in_text,
)
from markidocx.diagrams import (
is_diagram_source_marker,
parse_diagram_source_marker,
reconstruct_diagram_md,
)
from markidocx.figures import (
CAPTION_RE,
is_alt_text_marker,
parse_alt_text_marker,
reconstruct_figure_md,
)
lines: list[str] = []
# Walk python-docx's block-level items
for block in _iter_blocks(doc):
blocks = list(_iter_blocks(doc))
idx = 0
while idx < len(blocks):
block = blocks[idx]
if isinstance(block, Paragraph):
md = _paragraph_to_md(block, warnings)
text = block.text.strip()
# Detect diagram source-intent marker (tiny font) → restore fenced block (FR-534)
if is_diagram_source_marker(text):
parsed = parse_diagram_source_marker(text)
if parsed:
diagram_type, source = parsed
from markidocx.diagrams import reconstruct_diagram_md
lines.append(reconstruct_diagram_md(diagram_type, source))
idx += 1
continue
# Detect alt-text marker (figure source intent) — skip it; consumed by caption
if is_alt_text_marker(text):
caption_text = ""
path = ""
label = ""
marker_parsed = parse_alt_text_marker(text)
if marker_parsed:
path, label = marker_parsed
if idx + 1 < len(blocks) and isinstance(blocks[idx + 1], Paragraph):
next_text = blocks[idx + 1].text.strip()
cm = CAPTION_RE.match(next_text)
if cm:
caption_text = cm.group(2)
idx += 1 # consume caption paragraph
if caption_text:
lines.append(reconstruct_figure_md(caption_text, path, label))
idx += 1
continue
# Detect placeholder + alt-text marker pattern: "[Figure: path]"
if text.startswith("[Figure:") and text.endswith("]"):
idx += 1
continue # skip placeholder; handled via alt-text marker
# Detect bibliography section marker (tiny invisible paragraph)
if is_bibliography_marker(text):
idx += 1
continue # skip; section already started by heading
# Detect bibliography reference entry ([@key]: ...) — already in correct format
if is_bibliography_entry(text):
lines.append(text)
idx += 1
continue
md = _paragraph_to_md(block, warning_records)
if md is not None:
lines.append(md)
elif isinstance(block, Table):
lines.append(_table_to_md(block))
idx += 1
# Bibliography entries are already inline after heading; no extra work needed
result_text = "\n\n".join(line for line in lines if line is not None)
# Restore citations in the text ([@key] markers)
result_text = restore_citations_in_text(result_text)
return result_text
return "\n\n".join(line for line in lines if line is not None)
@@ -128,7 +236,7 @@ def _iter_blocks(doc: DocxDocument):
yield Table(child, doc)
def _paragraph_to_md(para: Paragraph, warnings: list[str]) -> str | None:
def _paragraph_to_md(para: Paragraph, warning_records: list[WarningRecord]) -> str | None:
"""Convert a paragraph to a Markdown line."""
style_name = para.style.name if para.style else "Normal"
text = para.text.strip()
@@ -137,7 +245,14 @@ def _paragraph_to_md(para: Paragraph, warnings: list[str]) -> str | None:
m = HEADING_STYLE_RE.match(style_name)
if m:
level = int(m.group(1))
return f"{'#' * level} {text}"
# Check for bookmarks → restore {#anchor} labels (FR-531)
from markidocx.xref import extract_bookmarks_from_paragraph
bookmarks = extract_bookmarks_from_paragraph(para)
anchor_suffix = ""
if bookmarks:
anchor_suffix = " " + " ".join(f"{{#{b}}}" for b in bookmarks)
return f"{'#' * level} {text}{anchor_suffix}"
# Lists
if LIST_BULLET_RE.match(style_name):
@@ -145,13 +260,33 @@ def _paragraph_to_md(para: Paragraph, warnings: list[str]) -> str | None:
if LIST_NUMBER_RE.match(style_name):
return f"1. {text}"
# Normal text — preserve inline markup
# Normal text — check for internal hyperlinks (cross-refs) → [text][anchor]
from markidocx.xref import extract_internal_hyperlinks_from_paragraph
internal_links = extract_internal_hyperlinks_from_paragraph(para)
if internal_links:
return _runs_to_md_with_xrefs(para, internal_links)
if not text:
return None
return _runs_to_md(para)
def _runs_to_md_with_xrefs(
para: Paragraph, internal_links: list[tuple[str, str]]
) -> str:
"""Convert paragraph with internal hyperlinks to Markdown with [text][anchor].
para.text includes text from nested hyperlink elements, so we use it as
the base and replace each hyperlink text with [text][anchor] syntax.
"""
result = para.text
for link_text, anchor in internal_links:
result = result.replace(link_text, f"[{link_text}][{anchor}]", 1)
return result
def _runs_to_md(para: Paragraph) -> str:
"""Convert paragraph runs to Markdown with inline formatting."""
parts: list[str] = []

83
src/markidocx/level3.py Normal file
View File

@@ -0,0 +1,83 @@
"""LEVEL3 feature gating, processor-dependency disclosure, and support detection (FR-537539)."""
from __future__ import annotations
import shutil
from dataclasses import dataclass, field
# Diagram renderers recognised by LEVEL3 auto-diagram support
# Maps each renderer's executable name (the command looked up on PATH by
# check_level3_support) to a human-readable description of the tool.
_DIAGRAM_TOOLS: dict[str, str] = {
"mmdc": "Mermaid CLI (mermaid diagrams)",
"dot": "Graphviz dot (graphviz diagrams)",
"plantuml": "PlantUML (plantuml diagrams)",
}
@dataclass
class ProcessorDependency:
    """Availability record for one external tool used by LEVEL3 (FR-538)."""

    # Executable name of the tool (e.g. "dot").
    name: str
    # Human-readable description of what the tool provides.
    description: str
    # Whether the tool was found on this host.
    available: bool
@dataclass
class Level3Support:
    """Snapshot of what LEVEL3 processing the current host can do (FR-537, FR-538).

    Attributes:
        available: True if *any* LEVEL3 processing is possible. This is
            always True, because the core features (cross-refs, figures,
            bibliography) need no external tools.
        dependencies: One availability record per diagram-rendering tool.
        partial: True when some LEVEL3 features are unavailable because
            tools are missing.
        missing_coverage: Human-readable descriptions of the unavailable
            feature areas.
    """

    available: bool = True
    dependencies: list[ProcessorDependency] = field(default_factory=list)
    partial: bool = False
    missing_coverage: list[str] = field(default_factory=list)
def check_level3_support() -> Level3Support:
    """Probe the host for diagram renderers and summarise LEVEL3 support (FR-537, FR-538).

    Each command in ``_DIAGRAM_TOOLS`` is looked up on PATH with
    ``shutil.which``. Core LEVEL3 features (cross-refs, figures,
    bibliography) need no external tool, so ``available`` is always True.
    The result is flagged ``partial`` only when *none* of the diagram
    renderers (mmdc / dot / plantuml) is found.
    """
    tool_status = [
        ProcessorDependency(
            name=tool,
            description=_DIAGRAM_TOOLS[tool],
            available=shutil.which(tool) is not None,
        )
        for tool in _DIAGRAM_TOOLS
    ]

    gaps: list[str] = []
    # Diagram support counts as missing only if every renderer is absent.
    if all(not dep.available for dep in tool_status):
        gaps.append("auto-diagrams (no renderer: mmdc/dot/plantuml not found)")

    return Level3Support(
        available=True,
        dependencies=tool_status,
        partial=len(gaps) > 0,
        missing_coverage=gaps,
    )
def capabilities_entry() -> dict:
    """Build the LEVEL3 fragment of a capabilities response (FR-537).

    Runs ``check_level3_support()`` and flattens the result into plain
    dicts, lists, strings and booleans.
    """
    support = check_level3_support()

    dependency_rows = []
    for dep in support.dependencies:
        dependency_rows.append(
            {
                "name": dep.name,
                "description": dep.description,
                "available": dep.available,
            }
        )

    return {
        "level": "level3",
        "available": support.available,
        "partial": support.partial,
        "missing_coverage": support.missing_coverage,
        "dependencies": dependency_rows,
    }

View File

@@ -68,6 +68,8 @@ def validate_project(manifest_yaml: str) -> dict[str, Any]:
except Exception:
(tmp_path / "dist").mkdir(exist_ok=True)
try:
from markidocx.level3 import capabilities_entry as level3_capabilities
m = load_manifest(mp)
return {
"status": "ok",
@@ -79,6 +81,7 @@ def validate_project(manifest_yaml: str) -> dict[str, Any]:
"context": {
"supported_families": sorted(SUPPORTED_FAMILIES),
"supported_feature_levels": [e.value for e in FeatureLevel],
"level3": level3_capabilities(),
},
}
except ManifestError as exc:
@@ -123,15 +126,24 @@ def build(manifest_yaml: str, sources: list[dict[str, str]]) -> dict[str, Any]:
result = build_document(m)
if result.success:
docx_b64 = base64.b64encode(Path(result.output_path).read_bytes()).decode()
return {
out: dict[str, Any] = {
"status": "ok",
"docx_base64": docx_b64,
"family": result.family,
"feature_level": result.feature_level,
"warnings": result.warnings,
"output_state": result.output_state,
"warnings": [w.to_dict() for w in result.warning_records],
"errors": [],
}
return {"status": "error", "errors": result.errors, "warnings": result.warnings}
if result.partial_level3:
out["partial_level3"] = True
out["missing_coverage"] = result.missing_coverage
return out
return {
"status": "error",
"errors": result.errors,
"warnings": [w.to_dict() for w in result.warning_records],
}
@mcp.tool()
@@ -182,10 +194,15 @@ def import_docx(manifest_yaml: str, docx_base64: str) -> dict[str, Any]:
"status": "ok",
"files": files_md,
"mapping_status": result.mapping_status,
"warnings": result.warnings,
"output_state": result.output_state,
"warnings": [w.to_dict() for w in result.warning_records],
"errors": [],
}
return {"status": "error", "errors": ["Import failed"], "warnings": result.warnings}
return {
"status": "error",
"errors": ["Import failed"],
"warnings": [w.to_dict() for w in result.warning_records],
}
@mcp.tool()
@@ -329,14 +346,17 @@ def get_evidence(run_id: str) -> dict[str, Any]:
@mcp.resource("markidocx://capabilities")
def resource_capabilities() -> str:
"""Capabilities: supported feature levels and families."""
"""Capabilities: supported feature levels and families (FR-537)."""
import json
from markidocx.level3 import capabilities_entry as level3_capabilities
return json.dumps(
{
"version": __version__,
"feature_levels": [e.value for e in FeatureLevel],
"families": sorted(SUPPORTED_FAMILIES),
"level3": level3_capabilities(),
}
)

View File

@@ -23,14 +23,14 @@ from markidocx.templates import FamilyRegistry
class ResponseEnvelope(BaseModel):
status: str
outputs: Any = None
warnings: list[str] = []
warnings: list[Any] = [] # list[WarningRecord.to_dict()] or list[str] (FR-1208)
errors: list[str] = []
context: dict[str, Any] = {}
def _ok(
outputs: Any = None,
warnings: list[str] | None = None,
warnings: list[Any] | None = None,
context: dict[str, Any] | None = None,
) -> ResponseEnvelope:
return ResponseEnvelope(
@@ -44,7 +44,7 @@ def _ok(
def _error(
errors: list[str],
warnings: list[str] | None = None,
warnings: list[Any] | None = None,
context: dict[str, Any] | None = None,
) -> ResponseEnvelope:
return ResponseEnvelope(
@@ -158,11 +158,14 @@ def create_app() -> FastAPI:
@app.get("/capabilities", response_model=ResponseEnvelope)
def capabilities() -> ResponseEnvelope:
"""Capability inspection — feature levels and families (FR-909)."""
"""Capability inspection — feature levels and families (FR-909, FR-537)."""
from markidocx.level3 import capabilities_entry as level3_capabilities
return _ok(
outputs={
"feature_levels": [e.value for e in FeatureLevel],
"families": sorted(SUPPORTED_FAMILIES),
"level3": level3_capabilities(),
},
context={"version": __version__},
)
@@ -227,17 +230,29 @@ def create_app() -> FastAPI:
**req.context,
"family": result.family,
"feature_level": result.feature_level,
"output_state": result.output_state,
}
if result.success:
docx_b64 = base64.b64encode(Path(result.output_path).read_bytes()).decode()
outputs: dict[str, Any] = {
"docx_base64": docx_b64,
"output_path": str(result.output_path),
}
if result.partial_level3:
outputs["partial_level3"] = True
outputs["missing_coverage"] = result.missing_coverage
return ResponseEnvelope(
status="ok",
outputs={"docx_base64": docx_b64, "output_path": str(result.output_path)},
warnings=result.warnings,
outputs=outputs,
warnings=[w.to_dict() for w in result.warning_records],
errors=[],
context=ctx,
)
return _error(errors=result.errors, warnings=result.warnings, context=ctx)
return _error(
errors=result.errors,
warnings=[w.to_dict() for w in result.warning_records],
context=ctx,
)
@app.post("/import", response_model=ResponseEnvelope)
def import_docx(req: ImportRequest) -> ResponseEnvelope:
@@ -255,7 +270,7 @@ def create_app() -> FastAPI:
except ManifestError as exc:
return _error(errors=[str(exc)], context=req.context)
result = import_document(m, docx_path)
ctx = {**req.context}
ctx = {**req.context, "output_state": result.output_state}
if result.success:
import contextlib
@@ -266,14 +281,14 @@ def create_app() -> FastAPI:
return ResponseEnvelope(
status="ok",
outputs={"files": files_md, "mapping_status": result.mapping_status},
warnings=result.warnings,
warnings=[w.to_dict() for w in result.warning_records],
errors=[],
context=ctx,
)
return ResponseEnvelope(
status="error",
outputs=None,
warnings=result.warnings,
warnings=[w.to_dict() for w in result.warning_records],
errors=["Import failed"],
context=ctx,
)

159
src/markidocx/xref.py Normal file
View File

@@ -0,0 +1,159 @@
"""Cross-reference support for LEVEL3 markidocx (FR-531, FR-540).
Handles the round-trip of heading anchors ({#anchor}) and cross-reference
links ([text][anchor]) between Markdown and DOCX bookmarks/hyperlinks.
"""
from __future__ import annotations
import re
from typing import TYPE_CHECKING
from lxml import etree
if TYPE_CHECKING:
from docx.text.paragraph import Paragraph as DocxParagraph
# Markdown patterns
# Trailing ``{#anchor}`` label (with surrounding whitespace) at the end of a
# string; group 1 is the anchor name.
ANCHOR_LABEL_RE = re.compile(r"\s*\{#([\w-]+)\}\s*$")
# Reference-style link ``[text][anchor]``; group 1 is the link text and
# group 2 the anchor name.
XREF_LINK_RE = re.compile(r"\[([^\]]+)\]\[([\w-]+)\]")
# DOCX XML namespaces
# WordprocessingML main namespace, used to build w:* element/attribute names.
_W = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
# Relationships namespace, used to look up the r:id attribute on hyperlinks.
_R = "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
def extract_anchor_from_heading(text: str) -> tuple[str, str | None]:
    """Split a trailing ``{#anchor}`` label off heading text.

    Returns:
        ``(clean_text, anchor_name)``. When no trailing anchor label is
        present, *text* is returned unchanged and ``anchor_name`` is None.
    """
    match = ANCHOR_LABEL_RE.search(text)
    if match is None:
        return text, None
    # The pattern also consumes whitespace in front of the label, so cutting
    # at the match start leaves no trailing space on the heading text.
    return text[: match.start()], match.group(1)
def add_bookmark_to_paragraph(para: DocxParagraph, bookmark_name: str, bookmark_id: int) -> None:
    """Append a ``w:bookmarkStart`` / ``w:bookmarkEnd`` pair to *para* (FR-531).

    Both elements are added as the last children of the paragraph's XML
    element and share the same ``w:id``; the start element also carries the
    bookmark name.
    """
    paragraph_xml = para._p  # underlying lxml element
    id_attr = f"{{{_W}}}id"
    id_value = str(bookmark_id)

    # <w:bookmarkStart w:id="N" w:name="anchor"/>
    start = etree.SubElement(paragraph_xml, f"{{{_W}}}bookmarkStart")
    start.set(id_attr, id_value)
    start.set(f"{{{_W}}}name", bookmark_name)

    # <w:bookmarkEnd w:id="N"/>
    end = etree.SubElement(paragraph_xml, f"{{{_W}}}bookmarkEnd")
    end.set(id_attr, id_value)
def add_internal_hyperlink(para: DocxParagraph, text: str, anchor: str) -> None:
    """Append an internal hyperlink targeting bookmark *anchor* to *para* (FR-531).

    Builds ``<w:hyperlink w:anchor="anchor">`` containing a single run that
    is styled with the ``Hyperlink`` character style and carries *text*.
    """

    def w(tag: str) -> str:
        # Qualified name in the WordprocessingML namespace.
        return f"{{{_W}}}{tag}"

    link = etree.SubElement(para._p, w("hyperlink"))
    link.set(w("anchor"), anchor)

    run = etree.SubElement(link, w("r"))
    run_props = etree.SubElement(run, w("rPr"))
    etree.SubElement(run_props, w("rStyle")).set(w("val"), "Hyperlink")

    text_node = etree.SubElement(run, w("t"))
    text_node.text = text
    # Mark text with a leading or trailing space as xml:space="preserve".
    if text and (text.startswith(" ") or text.endswith(" ")):
        text_node.set("{http://www.w3.org/XML/1998/namespace}space", "preserve")
def render_paragraph_with_xrefs(
    para: DocxParagraph,
    text: str,
    known_anchors: set[str],
) -> None:
    """Write *text* into *para*, turning ``[text][anchor]`` into internal links (FR-531).

    Text between cross-references is added as plain runs. A reference whose
    anchor is in *known_anchors* becomes an internal hyperlink; any other
    reference is written as plain text followed by a ``[→anchor]`` note.
    """
    cursor = 0
    for match in XREF_LINK_RE.finditer(text):
        start, end = match.span()
        label, target = match.groups()

        # Emit the literal text that precedes this reference.
        if start > cursor:
            para.add_run(text[cursor:start])

        if target not in known_anchors:
            # Unknown anchor: keep the link text readable and note the target.
            para.add_run(f"{label} [→{target}]")
        else:
            add_internal_hyperlink(para, label, target)

        cursor = end

    # Emit whatever follows the last reference.
    if cursor < len(text):
        para.add_run(text[cursor:])
def has_xref_links(text: str) -> bool:
    """Report whether *text* contains at least one ``[text][anchor]`` reference."""
    return XREF_LINK_RE.search(text) is not None
# ---------------------------------------------------------------------------
# Importer helpers
# ---------------------------------------------------------------------------
def extract_bookmarks_from_paragraph(para: DocxParagraph) -> list[str]:
    """List the bookmark names declared inside *para*, in iteration order.

    Unnamed bookmarks and names starting with ``_`` (Word-internal
    bookmarks such as ``_GoBack``) are left out.
    """
    declared = (
        start.get(f"{{{_W}}}name", "")
        for start in para._p.iter(f"{{{_W}}}bookmarkStart")
    )
    return [name for name in declared if name and not name.startswith("_")]
def extract_internal_hyperlinks_from_paragraph(
    para: DocxParagraph,
) -> list[tuple[str, str]]:
    """Collect ``(text, anchor)`` pairs for the internal hyperlinks in *para*.

    A hyperlink counts as internal when it has a ``w:anchor`` attribute and
    no ``r:id`` relationship attribute. Its text is the concatenation of all
    ``w:t`` descendants; hyperlinks with no text are skipped.
    """
    hyperlink_tag = f"{{{_W}}}hyperlink"
    anchor_attr = f"{{{_W}}}anchor"
    rel_id_attr = f"{{{_R}}}id"
    text_tag = f"{{{_W}}}t"

    found: list[tuple[str, str]] = []
    for link in para._p.iter(hyperlink_tag):
        target = link.get(anchor_attr)
        # Skip links without an anchor and links that carry an r:id.
        if not target or link.get(rel_id_attr):
            continue
        label = "".join(node.text for node in link.iter(text_tag) if node.text)
        if label:
            found.append((label, target))
    return found
# ---------------------------------------------------------------------------
# Differ helpers
# ---------------------------------------------------------------------------
# Matches a ``{#anchor}`` declaration anywhere in the text; group 1 is the
# anchor name. The name class excludes ":", so ``{#fig:...}`` is not matched.
XREF_ANCHOR_RE = re.compile(r"\{#([\w-]+)\}")
# Matches a ``[text][anchor]`` cross-reference; group 1 is the link text and
# group 2 the anchor name.
XREF_LINK_PATTERN = re.compile(r"\[([^\]]+)\]\[([\w-]+)\]")
def extract_anchors(text: str) -> set[str]:
    """Collect the distinct anchor names declared as ``{#anchor}`` in Markdown *text*."""
    return {match.group(1) for match in XREF_ANCHOR_RE.finditer(text)}
def extract_xref_links(text: str) -> set[tuple[str, str]]:
    """Collect the distinct ``(text, anchor)`` cross-reference pairs in Markdown *text*."""
    # The pattern has two capture groups, so findall() yields
    # (text, anchor) tuples directly.
    return set(XREF_LINK_PATTERN.findall(text))