diff --git a/CLAUDE.md b/CLAUDE.md index f9f6848..b228227 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -70,16 +70,22 @@ evidence artefacts |--------|--------|-------------| | `cli.py` | implemented — all commands wired (`build`, `import`, `compare`, `validate`, `serve`, `workflow`, `mcp`, `template`) | all | | `manifest.py` | implemented | FR-100 | -| `builder.py` | implemented | FR-200 | -| `importer.py` | implemented | FR-300/400 | -| `differ.py` | implemented | FR-700 | +| `builder.py` | implemented — LEVEL1 + LEVEL3 (xrefs, figures, diagrams, citations) | FR-200, FR-531–539 | +| `importer.py` | implemented — LEVEL1 + LEVEL3 round-trip | FR-300/400, FR-531–536 | +| `differ.py` | implemented — LEVEL1 + LEVEL3 drift detection | FR-700, FR-540–542 | | `templates.py` | implemented | FR-600 | | `evidence.py` | implemented | FR-1400 | | `workflows.py` | implemented (`single-file-roundtrip`, `multi-file-roundtrip`, `release-regression`, `family-switch-build`) | FR-1300 | -| `rest.py` | implemented — FastAPI app, all endpoints | FR-900 | -| `mcp_server.py` | implemented — FastMCP server, all tools and resources | FR-1000 | +| `rest.py` | implemented — FastAPI app, all endpoints; structured warning records | FR-900, FR-1208 | +| `mcp_server.py` | implemented — FastMCP server, all tools and resources; structured warnings | FR-1000, FR-1208 | +| `errors.py` | implemented — `WarningRecord`, `FailureRecord`, `OutputState` | FR-1201–1210 | +| `level3.py` | implemented — LEVEL3 support detection, capability disclosure | FR-537–539 | +| `xref.py` | implemented — cross-reference round-trip helpers | FR-531, FR-540 | +| `figures.py` | implemented — numbered figure round-trip helpers | FR-532, FR-541 | +| `diagrams.py` | implemented — auto-diagram source-only + renderer path | FR-533, FR-534 | +| `bibliography.py` | implemented — citation and references section round-trip | FR-535, FR-536, FR-542 | -`tests/conftest.py` provides shared fixtures (`tmp_project`, `SIMPLE_MANIFEST_YAML`, 
`SIMPLE_MARKDOWN`). WP-0001 and WP-0002 complete — 135 tests passing. All interfaces (CLI, REST, MCP) implemented and parity-tested. +`tests/conftest.py` provides shared fixtures. WP-0001, WP-0002, and WP-0003 complete — 259 tests passing. Full LEVEL1 + LEVEL3 feature coverage. All interfaces (CLI, REST, MCP) implemented and parity-tested. --- diff --git a/src/markidocx/bibliography.py b/src/markidocx/bibliography.py new file mode 100644 index 0000000..b3f159a --- /dev/null +++ b/src/markidocx/bibliography.py @@ -0,0 +1,208 @@ +"""Bibliography and citation support for LEVEL3 markidocx (FR-535, FR-536, FR-542). + +Handles the round-trip of inline citations and Bibliography/References sections +between Markdown and DOCX. + +Markdown syntax: + Inline citation: [@key] + References section: + ## References + - [@key]: Author. *Title*. Year. + +DOCX representation: + Inline: [key] (plain text marker) + References section: "References" heading + plain text entries + Source-intent markers embedded for importer restoration. +""" + +from __future__ import annotations + +import re + +# Markdown citation patterns +CITATION_RE = re.compile(r"\[@([\w:.-]+)\]") +CITATION_ENTRY_RE = re.compile(r"^-\s+\[@([\w:.-]+)\]:\s+(.+)$") +REFERENCES_HEADING_RE = re.compile(r"^#{1,3}\s+References\s*$", re.MULTILINE) + +# DOCX markers +CITATION_MARKER_PREFIX = "citation:" +REFERENCES_SECTION_MARKER = "references-section:" + + +def has_citations(text: str) -> bool: + """Return True if *text* contains inline citations.""" + return bool(CITATION_RE.search(text)) + + +def render_inline_citations(text: str) -> str: + """Replace [@key] markers with [key] for DOCX embedding. + + Returns the transformed text suitable for DOCX paragraph text. 
+ """ + return CITATION_RE.sub(lambda m: f"[{m.group(1)}]", text) + + +def extract_citation_keys(text: str) -> list[str]: + """Extract all citation keys from *text*.""" + return CITATION_RE.findall(text) + + +def is_references_heading(text: str) -> bool: + """Return True if *text* is a References section heading.""" + return bool(REFERENCES_HEADING_RE.match(text.strip())) + + +def parse_reference_entry(text: str) -> tuple[str, str] | None: + """Parse a reference list entry. + + Returns (key, entry_text) or None. + """ + m = CITATION_ENTRY_RE.match(text.strip()) + if m: + return m.group(1), m.group(2) + return None + + +def extract_references_section(md_text: str) -> tuple[list[tuple[str, str]], str]: + """Extract the references section from Markdown text. + + Returns (entries, text_without_references_section). + entries: list of (key, entry_text) + """ + # Find the References heading + m = REFERENCES_HEADING_RE.search(md_text) + if not m: + return [], md_text + + refs_start = m.start() + entries: list[tuple[str, str]] = [] + + # Collect entries after the heading + rest = md_text[m.end():].strip() + for line in rest.split("\n"): + line = line.strip() + if not line: + continue + parsed = parse_reference_entry(line) + if parsed: + entries.append(parsed) + elif line.startswith("#"): + # New heading — stop collecting + break + + text_without = md_text[:refs_start].rstrip() + return entries, text_without + + +# --------------------------------------------------------------------------- +# Builder helpers +# --------------------------------------------------------------------------- + +BIBLIOGRAPHY_SECTION_HEADING = "References" +BIBLIOGRAPHY_MARKER = "bibliography-section-start" + + +def render_citation_text(text: str) -> str: + """Return citation text for DOCX embedding. + + [@key] is kept as-is in the DOCX paragraph text so the importer + can restore it without ambiguity. 
+ """ + return text # [@key] → [@key] (no transformation needed) + + +def render_references_section(doc, entries: list[tuple[str, str]]) -> None: + """Add a References section to *doc* (FR-535). + + Args: + doc: python-docx Document + entries: list of (key, entry_text) + """ + # Section heading + try: + doc.add_heading(BIBLIOGRAPHY_SECTION_HEADING, level=2) + except Exception: + doc.add_paragraph(BIBLIOGRAPHY_SECTION_HEADING, style="Normal") + + # Bibliography marker so importer can identify the section + marker_para = doc.add_paragraph(style="Normal") + from docx.shared import Pt + + marker_run = marker_para.add_run(BIBLIOGRAPHY_MARKER) + marker_run.font.size = Pt(1) + + # Entries — keep [@key] format directly in DOCX text for round-trip fidelity + for key, entry_text in entries: + para = doc.add_paragraph(style="Normal") + para.add_run(f"- [@{key}]: {entry_text}") + + +# --------------------------------------------------------------------------- +# Importer helpers +# --------------------------------------------------------------------------- + +DOCX_CITATION_RE = re.compile(r"\[([^\]@]+)\](?!\[)") # [key] without @, not followed by [ +BIBLIOGRAPHY_MARKER_PARA_RE = re.compile(r"^bibliography-section-start$") +BIBLIOGRAPHY_ENTRY_RE = re.compile(r"^-\s+\[@([\w:.-]+)\]:\s+(.+)$") + + +def restore_citations_in_text(text: str) -> str: + """Return imported text with citations already in [@key] form (no-op). + + Since builder now embeds [@key] directly in DOCX, no restoration needed. 
+ """ + return text + + +def is_bibliography_marker(text: str) -> bool: + return BIBLIOGRAPHY_MARKER_PARA_RE.match(text.strip()) is not None + + +def is_bibliography_entry(text: str) -> bool: + return bool(BIBLIOGRAPHY_ENTRY_RE.match(text.strip())) + + +# --------------------------------------------------------------------------- +# Differ helpers +# --------------------------------------------------------------------------- + + +def compare_citations( + original: str, + reimported: str, + preserved: list[str], + degraded: list[str], + broken: list[str], + warning_records: list | None = None, +) -> None: + """Compare citation markers and reference entries (FR-536, FR-542).""" + orig_keys = set(extract_citation_keys(original)) + reim_keys = set(extract_citation_keys(reimported)) + + for key in orig_keys: + if key in reim_keys: + preserved.append(f"citation:[@{key}]") + else: + broken.append(f"citation:missing '[@{key}]'") + if warning_records is not None: + from markidocx.errors import Severity, WarningRecord + + warning_records.append( + WarningRecord( + severity=Severity.WARNING, + reason="citation-ambiguity", + construct=f"@{key}", + ) + ) + + # References section + orig_entries, _ = extract_references_section(original) + reim_entries, _ = extract_references_section(reimported) + orig_ref_keys = {k for k, _ in orig_entries} + reim_ref_keys = {k for k, _ in reim_entries} + + for key in orig_ref_keys: + if key in reim_ref_keys: + preserved.append(f"reference-entry:{key}") + else: + degraded.append(f"reference-entry:lost '{key}'") diff --git a/src/markidocx/builder.py b/src/markidocx/builder.py index 922a82a..d8c614f 100644 --- a/src/markidocx/builder.py +++ b/src/markidocx/builder.py @@ -9,6 +9,7 @@ import mistune from docx.document import Document as DocxDocument from docx.shared import Pt, RGBColor +from markidocx.errors import OutputState, Severity, WarningRecord from markidocx.manifest import FeatureLevel, Manifest from markidocx.templates import FamilyRegistry 
@@ -19,8 +20,16 @@ class BuildResult: output_path: Path family: str feature_level: str - warnings: list[str] = field(default_factory=list) + warning_records: list[WarningRecord] = field(default_factory=list) errors: list[str] = field(default_factory=list) + output_state: OutputState = OutputState.FINAL + partial_level3: bool = False + missing_coverage: list[str] = field(default_factory=list) + + @property + def warnings(self) -> list[str]: + """Backward-compatible string view of warning_records.""" + return [str(w) for w in self.warning_records] def build_document(manifest: Manifest) -> BuildResult: @@ -28,8 +37,27 @@ def build_document(manifest: Manifest) -> BuildResult: Returns a BuildResult regardless of success/failure. """ - warnings: list[str] = [] + warning_records: list[WarningRecord] = [] errors: list[str] = [] + partial_level3 = False + missing_coverage: list[str] = [] + + # For LEVEL3 projects, check external dependencies (FR-538, FR-539) + if manifest.project.feature_level == FeatureLevel.LEVEL3: + from markidocx.level3 import check_level3_support + + support = check_level3_support() + if support.partial: + partial_level3 = True + missing_coverage = support.missing_coverage + for area in support.missing_coverage: + warning_records.append( + WarningRecord( + severity=Severity.WARNING, + reason="processor-dependency-unavailable", + construct=area, + ) + ) # Compose all source files into one Markdown string parts: list[str] = [] @@ -48,11 +76,18 @@ def build_document(manifest: Manifest) -> BuildResult: core_props.author = str(manifest.metadata["author"]) # Parse and render tokens into the document - unsupported: list[str] = [] - _render_markdown(doc, markdown_text, manifest.project.feature_level, warnings, unsupported) + _render_markdown( + doc, + markdown_text, + manifest.project.feature_level, + warning_records, + ) - for item in unsupported: - warnings.append(f"Unsupported construct skipped: {item}") + # Determine output state + has_warnings = 
bool(warning_records) + output_state = OutputState.PARTIAL if partial_level3 else ( + OutputState.FINAL if not has_warnings else OutputState.FINAL + ) # Ensure output dir exists manifest.output_dir.mkdir(parents=True, exist_ok=True) @@ -64,8 +99,11 @@ def build_document(manifest: Manifest) -> BuildResult: output_path=output_path, family=manifest.project.family, feature_level=manifest.project.feature_level.value, - warnings=warnings, + warning_records=warning_records, errors=errors, + output_state=output_state, + partial_level3=partial_level3, + missing_coverage=missing_coverage, ) @@ -77,13 +115,45 @@ def _render_markdown( doc: DocxDocument, text: str, feature_level: FeatureLevel, - warnings: list[str], - unsupported: list[str], + warning_records: list[WarningRecord], ) -> None: """Parse *text* as Markdown and append elements to *doc*.""" - tokens = _tokenise(text) + # For LEVEL3, extract references section before tokenising + ref_entries: list[tuple[str, str]] = [] + body_text = text + if feature_level == FeatureLevel.LEVEL3: + from markidocx.bibliography import extract_references_section + + ref_entries, body_text = extract_references_section(text) + + tokens = _tokenise(body_text) + + # Pre-compute known anchors for cross-ref validation (LEVEL3 only) + known_anchors: set[str] = set() + if feature_level == FeatureLevel.LEVEL3: + from markidocx.xref import extract_anchors + + known_anchors = extract_anchors(body_text) + + bookmark_counter = [0] # mutable int for nested calls + figure_counter = [0] # auto-incrementing figure number + for token in tokens: - _render_token(doc, token, feature_level, warnings, unsupported) + _render_token( + doc, + token, + feature_level, + warning_records, + known_anchors, + bookmark_counter, + figure_counter, + ) + + # Render references section at the end (LEVEL3 only) (FR-535) + if feature_level == FeatureLevel.LEVEL3 and ref_entries: + from markidocx.bibliography import render_references_section + + render_references_section(doc, 
ref_entries) def _tokenise(text: str) -> list[dict]: # type: ignore[type-arg] @@ -99,23 +169,74 @@ def _render_token( doc: DocxDocument, token: dict, feature_level: FeatureLevel, - warnings: list[str], - unsupported: list[str], + warning_records: list[WarningRecord], + known_anchors: set[str] | None = None, + bookmark_counter: list[int] | None = None, + figure_counter: list[int] | None = None, ) -> None: token_type = token.get("type", "") + if known_anchors is None: + known_anchors = set() + if bookmark_counter is None: + bookmark_counter = [0] + if figure_counter is None: + figure_counter = [0] if token_type == "heading": level = token.get("attrs", {}).get("level", 1) - text = _extract_text(token.get("children", [])) + raw_text = _extract_text(token.get("children", [])) + + if feature_level == FeatureLevel.LEVEL3: + from markidocx.xref import ( + add_bookmark_to_paragraph, + extract_anchor_from_heading, + ) + + clean_text, anchor = extract_anchor_from_heading(raw_text) + else: + clean_text, anchor = raw_text, None + try: - doc.add_heading(text, level=level) + para = doc.add_heading(clean_text, level=level) except Exception: - doc.add_paragraph(text, style="Normal") + para = doc.add_paragraph(clean_text, style="Normal") + + if anchor: + add_bookmark_to_paragraph(para, anchor, bookmark_counter[0]) + bookmark_counter[0] += 1 elif token_type == "paragraph": - text = _extract_text(token.get("children", [])) - para = doc.add_paragraph(style="Normal") - _add_inline_runs(para, token.get("children", [])) + raw_text = _extract_text_with_image_syntax(token.get("children", [])) + + if feature_level == FeatureLevel.LEVEL3: + from markidocx.figures import is_figure_paragraph, parse_figure + from markidocx.xref import has_xref_links, render_paragraph_with_xrefs + + if is_figure_paragraph(raw_text): + parsed = parse_figure(raw_text) + if parsed: + caption, path, label = parsed + figure_counter[0] += 1 + from markidocx.figures import render_figure + + render_figure(doc, caption, 
path, label, figure_counter[0]) + else: + para = doc.add_paragraph(style="Normal") + _add_inline_runs(para, token.get("children", [])) + elif has_xref_links(raw_text): + para = doc.add_paragraph(style="Normal") + render_paragraph_with_xrefs(para, raw_text, known_anchors) + else: + para = doc.add_paragraph(style="Normal") + from markidocx.bibliography import has_citations, render_citation_text + + if has_citations(raw_text): + para.add_run(render_citation_text(raw_text)) + else: + _add_inline_runs(para, token.get("children", [])) + else: + para = doc.add_paragraph(style="Normal") + _add_inline_runs(para, token.get("children", [])) elif token_type == "list": ordered = token.get("attrs", {}).get("ordered", False) @@ -135,10 +256,23 @@ def _render_token( elif token_type == "block_code": code = token.get("raw", "") - para = doc.add_paragraph(style="Normal") - run = para.add_run(code) - run.font.name = "Courier New" - run.font.size = Pt(9) + info = (token.get("attrs", {}) or {}).get("info", "") or "" + + if feature_level == FeatureLevel.LEVEL3: + from markidocx.diagrams import is_diagram_info, render_diagram_block + + if is_diagram_info(info): + render_diagram_block(doc, info.strip().lower(), code, warning_records) + else: + para = doc.add_paragraph(style="Normal") + run = para.add_run(code) + run.font.name = "Courier New" + run.font.size = Pt(9) + else: + para = doc.add_paragraph(style="Normal") + run = para.add_run(code) + run.font.name = "Courier New" + run.font.size = Pt(9) elif token_type == "block_quote": children = token.get("children", []) @@ -151,14 +285,27 @@ def _render_token( doc.add_paragraph("—" * 20, style="Normal") elif token_type in ("html_block", "raw_html"): - unsupported.append(f"html ({token_type})") + warning_records.append( + WarningRecord( + severity=Severity.WARNING, + reason="unsupported-construct", + construct=f"html ({token_type})", + ) + ) elif token_type == "blank_line": pass # ignore blank lines else: - # Unknown token — surface as 
unsupported (FR-508) - unsupported.append(token_type) + # Unknown token — surface as unsupported (FR-508, FR-1203) + if token_type: + warning_records.append( + WarningRecord( + severity=Severity.WARNING, + reason="unsupported-construct", + construct=token_type, + ) + ) def _render_table(doc: DocxDocument, token: dict) -> None: @@ -186,6 +333,26 @@ def _render_table(doc: DocxDocument, token: dict) -> None: run.bold = True +def _extract_text_with_image_syntax(children: list[dict]) -> str: + """Extract text from token children, reconstructing image MD syntax for figures.""" + parts: list[str] = [] + for child in children: + child_type = child.get("type", "") + if child_type == "image": + caption = _extract_text(child.get("children", [])) + url = child.get("attrs", {}).get("url", "") + parts.append(f"![{caption}]({url})") + elif child_type == "text": + parts.append(child.get("raw", "")) + elif child_type in ("strong", "emphasis", "codespan", "link"): + parts.append(_extract_text(child.get("children", []))) + elif child.get("raw"): + parts.append(child["raw"]) + elif child.get("children"): + parts.append(_extract_text_with_image_syntax(child["children"])) + return "".join(parts) + + def _extract_text(children: list[dict]) -> str: """Recursively extract plain text from a token children list.""" parts: list[str] = [] diff --git a/src/markidocx/diagrams.py b/src/markidocx/diagrams.py new file mode 100644 index 0000000..d03a846 --- /dev/null +++ b/src/markidocx/diagrams.py @@ -0,0 +1,190 @@ +"""Auto-diagram support for LEVEL3 markidocx (FR-533, FR-534). + +Handles fenced diagram source blocks (mermaid, graphviz, plantuml) in the +Markdown ↔ DOCX round trip. + +Source-intent preservation: + When a renderer is unavailable, diagram source is embedded as a verbatim + code block and a source-intent marker paragraph is added so the importer + can restore the fenced block. No source is silently discarded (FR-1205). 
+""" + +from __future__ import annotations + +import re +import shutil +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from docx.document import Document as DocxDocument + +# Diagram types recognised as LEVEL3 auto-diagram sources +DIAGRAM_TYPES: frozenset[str] = frozenset({"mermaid", "graphviz", "plantuml"}) + +# Renderer → CLI command mapping +_RENDERER_COMMANDS: dict[str, str] = { + "mermaid": "mmdc", + "graphviz": "dot", + "plantuml": "plantuml", +} + +# Marker prefix stored in DOCX paragraph to preserve source intent (FR-534) +DIAGRAM_SOURCE_MARKER_PREFIX = "diagram-source:" +DIAGRAM_SOURCE_MARKER_RE = re.compile( + r"^diagram-source:(\w+)\n(.*)", re.DOTALL +) + + +def is_diagram_info(info: str) -> bool: + """Return True if *info* is a recognised diagram type.""" + return (info or "").strip().lower() in DIAGRAM_TYPES + + +def check_renderer(diagram_type: str) -> bool: + """Return True if the required renderer for *diagram_type* is available.""" + cmd = _RENDERER_COMMANDS.get(diagram_type.lower()) + return bool(cmd and shutil.which(cmd)) + + +def render_diagram_block( + doc: DocxDocument, + diagram_type: str, + source: str, + warning_records: list, +) -> None: + """Render a diagram fenced block into *doc* (FR-533, FR-534). + + If a renderer is available → renders to PNG and embeds the image. + If unavailable → embeds source as verbatim code block + source-intent marker. + Never silently discards source (FR-1205). 
+ """ + from docx.shared import Pt + + from markidocx.errors import Severity, WarningRecord + + renderer_available = check_renderer(diagram_type) + + if renderer_available: + _render_diagram_with_tool(doc, diagram_type, source, warning_records) + return + + # Renderer not available — emit warning (FR-538) and use source-only path + warning_records.append( + WarningRecord( + severity=Severity.WARNING, + reason="processor-dependency-unavailable", + construct=f"{diagram_type} (no renderer: {_RENDERER_COMMANDS.get(diagram_type, diagram_type)} not found)", + ) + ) + + # Verbatim code block (source preserved — FR-1205) + code_para = doc.add_paragraph(style="Normal") + run = code_para.add_run(f"```{diagram_type}\n{source}\n```") + run.font.name = "Courier New" + run.font.size = Pt(9) + + # Source-intent marker paragraph so importer can restore (FR-534) + marker_para = doc.add_paragraph(style="Normal") + marker_run = marker_para.add_run(f"{DIAGRAM_SOURCE_MARKER_PREFIX}{diagram_type}\n{source}") + marker_run.font.size = Pt(1) # make tiny — not for display + + +def _render_diagram_with_tool( + doc: DocxDocument, + diagram_type: str, + source: str, + warning_records: list, +) -> None: + """Attempt to render diagram source using an external tool and embed PNG.""" + import subprocess + import tempfile + from pathlib import Path + + from docx.shared import Inches, Pt + + from markidocx.errors import Severity, WarningRecord + + cmd = _RENDERER_COMMANDS[diagram_type] + try: + with tempfile.TemporaryDirectory() as tmp: + tmp_path = Path(tmp) + src_file = tmp_path / f"diagram.{diagram_type[:3]}" + png_file = tmp_path / "diagram.png" + src_file.write_text(source, encoding="utf-8") + + if diagram_type == "mermaid": + args = [cmd, "-i", str(src_file), "-o", str(png_file)] + elif diagram_type == "graphviz": + args = [cmd, "-Tpng", str(src_file), "-o", str(png_file)] + else: # plantuml + args = [cmd, "-tpng", str(src_file), "-o", str(tmp_path)] + png_file = tmp_path / 
f"diagram.{diagram_type[:3]}.png" + + subprocess.run(args, capture_output=True, timeout=30) + + if png_file.exists(): + para = doc.add_paragraph(style="Normal") + run = para.add_run() + run.add_picture(str(png_file), width=Inches(5)) + # Source-intent marker for round-trip (FR-534) + marker_para = doc.add_paragraph(style="Normal") + marker_run = marker_para.add_run( + f"{DIAGRAM_SOURCE_MARKER_PREFIX}{diagram_type}\n{source}" + ) + marker_run.font.size = Pt(1) + return + except Exception as exc: + warning_records.append( + WarningRecord( + severity=Severity.WARNING, + reason="diagram-render-failed", + construct=f"{diagram_type}: {exc}", + ) + ) + + # Fallback: source-only path + from docx.shared import Pt + + code_para = doc.add_paragraph(style="Normal") + run = code_para.add_run(f"```{diagram_type}\n{source}\n```") + run.font.name = "Courier New" + run.font.size = Pt(9) + + marker_para = doc.add_paragraph(style="Normal") + marker_run = marker_para.add_run( + f"{DIAGRAM_SOURCE_MARKER_PREFIX}{diagram_type}\n{source}" + ) + from docx.shared import Pt + + marker_run.font.size = Pt(1) + + +# --------------------------------------------------------------------------- +# Importer helpers +# --------------------------------------------------------------------------- + + +def is_diagram_source_marker(text: str) -> bool: + """Return True if *text* is a diagram source-intent marker.""" + return text.strip().startswith(DIAGRAM_SOURCE_MARKER_PREFIX) + + +def parse_diagram_source_marker(text: str) -> tuple[str, str] | None: + """Parse a diagram source-intent marker into (diagram_type, source). + + Returns None if the text is not a valid marker. + """ + stripped = text.strip() + if not stripped.startswith(DIAGRAM_SOURCE_MARKER_PREFIX): + return None + rest = stripped[len(DIAGRAM_SOURCE_MARKER_PREFIX):] + # Format: "type\nsource..." 
+ if "\n" in rest: + diagram_type, source = rest.split("\n", 1) + return diagram_type.strip(), source + return rest.strip(), "" + + +def reconstruct_diagram_md(diagram_type: str, source: str) -> str: + """Reconstruct a fenced code block from diagram type and source.""" + return f"```{diagram_type}\n{source}\n```" diff --git a/src/markidocx/differ.py b/src/markidocx/differ.py index 0bf577f..2b459dd 100644 --- a/src/markidocx/differ.py +++ b/src/markidocx/differ.py @@ -5,6 +5,8 @@ from __future__ import annotations import re from dataclasses import dataclass, field +from markidocx.errors import OutputState + HEADING_RE = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE) LIST_ITEM_RE = re.compile(r"^(\s*[-*+]|\s*\d+\.)\s+(.+)$", re.MULTILINE) TABLE_ROW_RE = re.compile(r"^\|.+\|$", re.MULTILINE) @@ -19,6 +21,7 @@ class DriftReport: degraded: list[str] = field(default_factory=list) broken: list[str] = field(default_factory=list) unsupported: list[str] = field(default_factory=list) + output_state: OutputState = OutputState.FINAL def compare(original: str, reimported: str) -> DriftReport: @@ -76,13 +79,29 @@ def compare(original: str, reimported: str) -> DriftReport: else: degraded.append(f"link:lost {link[:40]}") + # --- Cross-references (FR-531, FR-540) --- + _compare_xrefs(original, reimported, preserved, degraded, broken) + + # --- Figures (FR-532, FR-541) --- + _compare_figures(original, reimported, preserved, degraded, broken) + + # --- Citations & Bibliography (FR-535, FR-542) --- + from markidocx.bibliography import compare_citations + + compare_citations(original, reimported, preserved, degraded, broken) + has_drift = bool(degraded or broken) + output_state = ( + OutputState.FINAL if not has_drift + else (OutputState.DEGRADED if not broken else OutputState.PARTIAL) + ) return DriftReport( has_drift=has_drift, preserved=preserved, degraded=degraded, broken=broken, unsupported=unsupported, + output_state=output_state, ) @@ -104,6 +123,64 @@ def _count_tables(text: 
str) -> int: return count +def _compare_figures( + original: str, + reimported: str, + preserved: list[str], + degraded: list[str], + broken: list[str], +) -> None: + """Compare figure labels and captions (FR-532, FR-541).""" + from markidocx.figures import extract_figure_captions, extract_figure_labels + + orig_labels = extract_figure_labels(original) + reim_labels = extract_figure_labels(reimported) + for label in orig_labels: + if label in reim_labels: + preserved.append(f"figure-label:{label}") + else: + broken.append(f"figure-label:missing '{label}'") + + orig_captions = extract_figure_captions(original) + reim_captions = extract_figure_captions(reimported) + orig_set = set(orig_captions) + reim_set = set(reim_captions) + for caption in orig_set: + if caption in reim_set: + preserved.append(f"figure-caption:{caption[:40]}") + else: + degraded.append(f"figure-caption:lost '{caption[:40]}'") + + +def _compare_xrefs( + original: str, + reimported: str, + preserved: list[str], + degraded: list[str], + broken: list[str], +) -> None: + """Compare cross-reference anchors and links (FR-531, FR-540).""" + from markidocx.xref import extract_anchors, extract_xref_links + + orig_anchors = extract_anchors(original) + reim_anchors = extract_anchors(reimported) + for anchor in orig_anchors: + if anchor in reim_anchors: + preserved.append(f"xref-anchor:{anchor}") + else: + broken.append(f"xref-anchor:missing '{anchor}'") + + orig_xrefs = extract_xref_links(original) + reim_xrefs = extract_xref_links(reimported) + for link_text, anchor in orig_xrefs: + if (link_text, anchor) in reim_xrefs: + preserved.append(f"xref-link:[{link_text}][{anchor}]") + elif anchor not in reim_anchors: + broken.append(f"xref-link:broken-target [{link_text}][{anchor}]") + else: + degraded.append(f"xref-link:degraded [{link_text}][{anchor}]") + + def _compare_sets( kind: str, orig: list[str], diff --git a/src/markidocx/errors.py b/src/markidocx/errors.py new file mode 100644 index 0000000..bf1df33 --- 
/dev/null +++ b/src/markidocx/errors.py @@ -0,0 +1,80 @@ +"""Structured error and warning types for markidocx (FR-1201–1210).""" + +from __future__ import annotations + +from dataclasses import dataclass +from enum import StrEnum +from typing import Any + + +class Severity(StrEnum): + INFO = "info" + WARNING = "warning" + ERROR = "error" + + +class OutputState(StrEnum): + """Lifecycle state of a build/import/workflow result (FR-1210).""" + + FINAL = "final" + PARTIAL = "partial" + FALLBACK = "fallback" + DEGRADED = "degraded" + UNRESOLVED = "unresolved" + + +@dataclass +class WarningRecord: + """Structured warning record (FR-1208). + + severity: info | warning | error + reason: FR-code-aligned description + construct: the token/element that triggered the warning + """ + + severity: str + reason: str + construct: str = "" + + def to_dict(self) -> dict[str, Any]: + return { + "severity": self.severity, + "reason": self.reason, + "construct": self.construct, + } + + def __str__(self) -> str: + if self.construct: + return f"[{self.severity}] {self.reason}: {self.construct}" + return f"[{self.severity}] {self.reason}" + + +@dataclass +class FailureRecord: + """Structured failure record (FR-1209). 
"""Numbered figure support for LEVEL3 markidocx (FR-532, FR-541).

Handles round-trip of captioned numbered figures between Markdown and DOCX.

Markdown syntax:
    ![Caption text](path/to/image.png){#fig:label}

DOCX representation (three consecutive paragraphs, see ``render_figure``):
    1. placeholder paragraph:  "[Figure: path/to/image.png]"
    2. marker paragraph:       "figure-source:path/to/image.png#fig:label"
    3. caption paragraph:      "Figure N — Caption text"
"""

from __future__ import annotations

import re
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from docx.document import Document as DocxDocument

# Markdown figure pattern: ![Caption](path){#fig:label}
# MULTILINE lets finditer() find declarations that sit on their own line
# inside a larger Markdown text (see extract_figures_from_md).
FIGURE_RE = re.compile(
    r"^!\[([^\]]*)\]\(([^)]+)\)\{#(fig:[\w:-]+)\}$",
    re.MULTILINE,
)

# Caption paragraph pattern in imported DOCX ("Figure N — Caption");
# an em dash, hyphen or en dash is accepted as the separator.
CAPTION_RE = re.compile(r"^Figure\s+(\d+)\s+[—\-–]\s+(.+)$")

# Alt-text marker embedded in the document to preserve source intent (FR-534)
ALT_TEXT_MARKER_PREFIX = "figure-source:"


def is_figure_paragraph(text: str) -> bool:
    """Return True if *text* is a standalone figure declaration.

    Surrounding whitespace is ignored.  The whole remaining text must be the
    declaration: because ``FIGURE_RE`` is compiled with ``re.MULTILINE``, a
    plain ``match`` would also accept a multi-line paragraph whose *first*
    line is a figure, so ``fullmatch`` is used instead.
    """
    return FIGURE_RE.fullmatch(text.strip()) is not None


def parse_figure(text: str) -> tuple[str, str, str] | None:
    """Parse a standalone figure declaration.

    Returns ``(caption, path, label)``, or ``None`` when *text* (ignoring
    surrounding whitespace) is not exactly one figure declaration.
    """
    found = FIGURE_RE.fullmatch(text.strip())
    if found is None:
        return None
    return found.group(1), found.group(2), found.group(3)


def render_figure(
    doc: DocxDocument,
    caption: str,
    path: str,
    label: str,
    figure_number: int,
) -> None:
    """Render a figure declaration into *doc* (FR-532).

    Appends three "Normal"-style paragraphs:

    1. An italic placeholder ``[Figure: <path>]`` (the image itself is not
       embedded here).
    2. A 1pt marker paragraph ``figure-source:<path>#<label>`` so the importer
       can reconstruct the Markdown declaration (FR-534).
    3. The caption ``Figure <figure_number> — <caption>``.
    """
    # Local import: only this builder helper needs python-docx at call time.
    from docx.shared import Pt

    # 1. Visible placeholder standing in for the image.
    placeholder = doc.add_paragraph(style="Normal")
    placeholder.add_run(f"[Figure: {path}]").italic = True

    # 2. Source-intent marker, shrunk to 1pt so it is visually negligible
    #    while the text still survives for the round-trip.
    marker_para = doc.add_paragraph(style="Normal")
    marker_run = marker_para.add_run(f"{ALT_TEXT_MARKER_PREFIX}{path}#{label}")
    marker_run.font.size = Pt(1)
    marker_run.font.color.rgb = None  # default color

    # 3. Numbered caption.
    caption_para = doc.add_paragraph(style="Normal")
    caption_para.add_run(f"Figure {figure_number} — {caption}")


def extract_figures_from_md(text: str) -> list[tuple[str, str, str]]:
    """Extract all figure declarations from Markdown text.

    Only declarations occupying a whole line are found.
    Returns a list of ``(caption, path, label)`` in document order.
    """
    return [(m.group(1), m.group(2), m.group(3)) for m in FIGURE_RE.finditer(text)]


# ---------------------------------------------------------------------------
# Importer helpers
# ---------------------------------------------------------------------------


def is_caption_paragraph(text: str) -> bool:
    """Return True if *text* looks like a figure caption ("Figure N — ...")."""
    return CAPTION_RE.match(text.strip()) is not None


def is_alt_text_marker(text: str) -> bool:
    """Return True if *text* is a figure-source alt-text marker."""
    return text.strip().startswith(ALT_TEXT_MARKER_PREFIX)


def parse_alt_text_marker(text: str) -> tuple[str, str] | None:
    """Parse a figure-source marker into ``(path, label)``.

    The label is everything after the *last* ``#`` so paths that themselves
    contain ``#`` survive.  A marker without ``#`` yields an empty label.
    Returns ``None`` if the text is not a marker at all.
    """
    stripped = text.strip()
    if not stripped.startswith(ALT_TEXT_MARKER_PREFIX):
        return None
    rest = stripped[len(ALT_TEXT_MARKER_PREFIX):]
    path, sep, label = rest.rpartition("#")
    if not sep:
        return rest, ""
    return path, label


def reconstruct_figure_md(caption: str, path: str, label: str) -> str:
    """Reconstruct a Markdown figure declaration from its parts.

    When *label* is empty (e.g. a marker that carried no ``#label``) a plain
    image is emitted rather than an invalid empty ``{#}`` attribute block.
    """
    image = f"![{caption}]({path})"
    if not label:
        return image
    return f"{image}{{#{label}}}"


# ---------------------------------------------------------------------------
# Differ helpers
# ---------------------------------------------------------------------------

FIGURE_LABEL_RE = re.compile(r"\{#(fig:[\w:-]+)\}")
FIGURE_CAPTION_MD_RE = re.compile(r"!\[([^\]]*)\]\([^)]+\)\{#fig:[\w:-]+\}")


def extract_figure_labels(text: str) -> set[str]:
    """Extract the labels of all ``{#fig:label}`` declarations in Markdown text."""
    return set(FIGURE_LABEL_RE.findall(text))


def extract_figure_captions(text: str) -> list[str]:
    """Extract captions from figure declarations in Markdown text, in order."""
    return [m.group(1) for m in FIGURE_CAPTION_MD_RE.finditer(text)]
b/src/markidocx/importer.py @@ -11,6 +11,7 @@ from docx.document import Document as DocxDocument from docx.table import Table from docx.text.paragraph import Paragraph +from markidocx.errors import OutputState, Severity, WarningRecord from markidocx.manifest import Manifest HEADING_STYLE_RE = re.compile(r"^Heading (\d+)$", re.IGNORECASE) @@ -23,7 +24,13 @@ class ImportResult: success: bool output_files: list[Path] mapping_status: str # "redistributed" | "merged" | "failed" - warnings: list[str] = field(default_factory=list) + warning_records: list[WarningRecord] = field(default_factory=list) + output_state: OutputState = OutputState.FINAL + + @property + def warnings(self) -> list[str]: + """Backward-compatible string view of warning_records.""" + return [str(w) for w in self.warning_records] def import_document(manifest: Manifest, docx_path: Path) -> ImportResult: @@ -33,14 +40,21 @@ def import_document(manifest: Manifest, docx_path: Path) -> ImportResult: content is redistributed to the original files. Otherwise a single merged file is produced. 
""" - warnings: list[str] = [] + warning_records: list[WarningRecord] = [] if not docx_path.exists(): return ImportResult( success=False, output_files=[], mapping_status="failed", - warnings=[f"DOCX file not found: {docx_path}"], + warning_records=[ + WarningRecord( + severity=Severity.ERROR, + reason="docx-not-found", + construct=str(docx_path), + ) + ], + output_state=OutputState.UNRESOLVED, ) try: @@ -50,10 +64,17 @@ def import_document(manifest: Manifest, docx_path: Path) -> ImportResult: success=False, output_files=[], mapping_status="failed", - warnings=[f"Could not open DOCX: {exc}"], + warning_records=[ + WarningRecord( + severity=Severity.ERROR, + reason="docx-open-failed", + construct=str(exc), + ) + ], + output_state=OutputState.UNRESOLVED, ) - md_text = _docx_to_markdown(doc, warnings) + md_text = _docx_to_markdown(doc, warning_records) manifest.output_dir.mkdir(parents=True, exist_ok=True) @@ -65,7 +86,8 @@ def import_document(manifest: Manifest, docx_path: Path) -> ImportResult: success=True, output_files=[out_path], mapping_status="redistributed", - warnings=warnings, + warning_records=warning_records, + output_state=OutputState.FINAL, ) # Multi-file: attempt redistribution by H1 boundary @@ -79,13 +101,20 @@ def import_document(manifest: Manifest, docx_path: Path) -> ImportResult: success=True, output_files=output_files, mapping_status="redistributed", - warnings=warnings, + warning_records=warning_records, + output_state=OutputState.FINAL, ) - # Fallback: merged single output (FR-406) - warnings.append( - f"Could not redistribute to {len(manifest.sources)} source files " - f"(found {len(sections)} H1 sections); writing merged output" + # Fallback: merged single output (FR-406, FR-1207) + warning_records.append( + WarningRecord( + severity=Severity.WARNING, + reason="fallback", + construct=( + f"could not redistribute to {len(manifest.sources)} source files " + f"(found {len(sections)} H1 sections); writing merged output" + ), + ) ) merged_path = 
manifest.output_dir / "imported_merged.md" merged_path.write_text(md_text, encoding="utf-8") @@ -93,7 +122,8 @@ def import_document(manifest: Manifest, docx_path: Path) -> ImportResult: success=True, output_files=[merged_path], mapping_status="merged", - warnings=warnings, + warning_records=warning_records, + output_state=OutputState.FALLBACK, ) @@ -101,17 +131,95 @@ def import_document(manifest: Manifest, docx_path: Path) -> ImportResult: # DOCX → Markdown conversion # --------------------------------------------------------------------------- -def _docx_to_markdown(doc: DocxDocument, warnings: list[str]) -> str: +def _docx_to_markdown(doc: DocxDocument, warning_records: list[WarningRecord]) -> str: """Convert a python-docx Document to a Markdown string.""" + from markidocx.bibliography import ( + is_bibliography_entry, + is_bibliography_marker, + restore_citations_in_text, + ) + from markidocx.diagrams import ( + is_diagram_source_marker, + parse_diagram_source_marker, + reconstruct_diagram_md, + ) + from markidocx.figures import ( + CAPTION_RE, + is_alt_text_marker, + parse_alt_text_marker, + reconstruct_figure_md, + ) + lines: list[str] = [] + # Walk python-docx's block-level items - for block in _iter_blocks(doc): + blocks = list(_iter_blocks(doc)) + idx = 0 + while idx < len(blocks): + block = blocks[idx] if isinstance(block, Paragraph): - md = _paragraph_to_md(block, warnings) + text = block.text.strip() + + # Detect diagram source-intent marker (tiny font) → restore fenced block (FR-534) + if is_diagram_source_marker(text): + parsed = parse_diagram_source_marker(text) + if parsed: + diagram_type, source = parsed + from markidocx.diagrams import reconstruct_diagram_md + lines.append(reconstruct_diagram_md(diagram_type, source)) + idx += 1 + continue + + # Detect alt-text marker (figure source intent) — skip it; consumed by caption + if is_alt_text_marker(text): + caption_text = "" + path = "" + label = "" + marker_parsed = parse_alt_text_marker(text) + if 
marker_parsed: + path, label = marker_parsed + + if idx + 1 < len(blocks) and isinstance(blocks[idx + 1], Paragraph): + next_text = blocks[idx + 1].text.strip() + cm = CAPTION_RE.match(next_text) + if cm: + caption_text = cm.group(2) + idx += 1 # consume caption paragraph + + if caption_text: + lines.append(reconstruct_figure_md(caption_text, path, label)) + idx += 1 + continue + + # Detect placeholder + alt-text marker pattern: "[Figure: path]" + if text.startswith("[Figure:") and text.endswith("]"): + idx += 1 + continue # skip placeholder; handled via alt-text marker + + # Detect bibliography section marker (tiny invisible paragraph) + if is_bibliography_marker(text): + idx += 1 + continue # skip; section already started by heading + + # Detect bibliography reference entry ([@key]: ...) — already in correct format + if is_bibliography_entry(text): + lines.append(text) + idx += 1 + continue + + md = _paragraph_to_md(block, warning_records) if md is not None: lines.append(md) elif isinstance(block, Table): lines.append(_table_to_md(block)) + idx += 1 + + # Bibliography entries are already inline after heading; no extra work needed + result_text = "\n\n".join(line for line in lines if line is not None) + + # Restore citations in the text ([@key] markers) + result_text = restore_citations_in_text(result_text) + return result_text return "\n\n".join(line for line in lines if line is not None) @@ -128,7 +236,7 @@ def _iter_blocks(doc: DocxDocument): yield Table(child, doc) -def _paragraph_to_md(para: Paragraph, warnings: list[str]) -> str | None: +def _paragraph_to_md(para: Paragraph, warning_records: list[WarningRecord]) -> str | None: """Convert a paragraph to a Markdown line.""" style_name = para.style.name if para.style else "Normal" text = para.text.strip() @@ -137,7 +245,14 @@ def _paragraph_to_md(para: Paragraph, warnings: list[str]) -> str | None: m = HEADING_STYLE_RE.match(style_name) if m: level = int(m.group(1)) - return f"{'#' * level} {text}" + # Check 
def _runs_to_md_with_xrefs(
    para: Paragraph, internal_links: list[tuple[str, str]]
) -> str:
    """Convert a paragraph with internal hyperlinks to Markdown ``[text][anchor]``.

    ``para.text`` includes the text of nested hyperlink elements, so it is
    used as the base string and each hyperlink text is rewritten in place.

    Args:
        para: paragraph whose ``text`` attribute is the plain paragraph text.
        internal_links: ``(link_text, anchor)`` pairs, expected in the order
            the links appear in the paragraph.

    Returns:
        The paragraph text with each link rewritten as ``[link_text][anchor]``.

    The text is scanned left to right with a cursor, so every link is matched
    *after* the previous one.  Replacing on the already-rewritten string (as a
    chained ``str.replace(..., 1)`` would) re-matches the text inside an
    earlier ``[text][anchor]`` when two links share the same text and produces
    nested brackets.
    """
    source = para.text
    pieces: list[str] = []
    cursor = 0
    for link_text, anchor in internal_links:
        start = source.find(link_text, cursor)
        if start == -1:
            # Link text does not occur after the previous link; leave the
            # remaining text untouched rather than rewriting an earlier span.
            continue
        pieces.append(source[cursor:start])
        pieces.append(f"[{link_text}][{anchor}]")
        cursor = start + len(link_text)
    pieces.append(source[cursor:])
    # NOTE(review): plain text identical to a link's text that appears *before*
    # that link is still indistinguishable here, because only the flattened
    # paragraph text is available — run-level offsets would be needed.
    return "".join(pieces)
+ Diagram rendering requires external tools (mmdc / dot / plantuml). + """ + deps: list[ProcessorDependency] = [] + for cmd, description in _DIAGRAM_TOOLS.items(): + available = shutil.which(cmd) is not None + deps.append(ProcessorDependency(name=cmd, description=description, available=available)) + + diagram_available = any(d.available for d in deps) + missing: list[str] = [] + if not diagram_available: + missing.append("auto-diagrams (no renderer: mmdc/dot/plantuml not found)") + + return Level3Support( + available=True, + dependencies=deps, + partial=bool(missing), + missing_coverage=missing, + ) + + +def capabilities_entry() -> dict: + """Return a capabilities dict fragment for LEVEL3 (FR-537).""" + support = check_level3_support() + return { + "level": "level3", + "available": support.available, + "partial": support.partial, + "missing_coverage": support.missing_coverage, + "dependencies": [ + { + "name": d.name, + "description": d.description, + "available": d.available, + } + for d in support.dependencies + ], + } diff --git a/src/markidocx/mcp_server.py b/src/markidocx/mcp_server.py index 34cdbd0..3f2361a 100644 --- a/src/markidocx/mcp_server.py +++ b/src/markidocx/mcp_server.py @@ -68,6 +68,8 @@ def validate_project(manifest_yaml: str) -> dict[str, Any]: except Exception: (tmp_path / "dist").mkdir(exist_ok=True) try: + from markidocx.level3 import capabilities_entry as level3_capabilities + m = load_manifest(mp) return { "status": "ok", @@ -79,6 +81,7 @@ def validate_project(manifest_yaml: str) -> dict[str, Any]: "context": { "supported_families": sorted(SUPPORTED_FAMILIES), "supported_feature_levels": [e.value for e in FeatureLevel], + "level3": level3_capabilities(), }, } except ManifestError as exc: @@ -123,15 +126,24 @@ def build(manifest_yaml: str, sources: list[dict[str, str]]) -> dict[str, Any]: result = build_document(m) if result.success: docx_b64 = base64.b64encode(Path(result.output_path).read_bytes()).decode() - return { + out: dict[str, Any] = 
{ "status": "ok", "docx_base64": docx_b64, "family": result.family, "feature_level": result.feature_level, - "warnings": result.warnings, + "output_state": result.output_state, + "warnings": [w.to_dict() for w in result.warning_records], "errors": [], } - return {"status": "error", "errors": result.errors, "warnings": result.warnings} + if result.partial_level3: + out["partial_level3"] = True + out["missing_coverage"] = result.missing_coverage + return out + return { + "status": "error", + "errors": result.errors, + "warnings": [w.to_dict() for w in result.warning_records], + } @mcp.tool() @@ -182,10 +194,15 @@ def import_docx(manifest_yaml: str, docx_base64: str) -> dict[str, Any]: "status": "ok", "files": files_md, "mapping_status": result.mapping_status, - "warnings": result.warnings, + "output_state": result.output_state, + "warnings": [w.to_dict() for w in result.warning_records], "errors": [], } - return {"status": "error", "errors": ["Import failed"], "warnings": result.warnings} + return { + "status": "error", + "errors": ["Import failed"], + "warnings": [w.to_dict() for w in result.warning_records], + } @mcp.tool() @@ -329,14 +346,17 @@ def get_evidence(run_id: str) -> dict[str, Any]: @mcp.resource("markidocx://capabilities") def resource_capabilities() -> str: - """Capabilities: supported feature levels and families.""" + """Capabilities: supported feature levels and families (FR-537).""" import json + from markidocx.level3 import capabilities_entry as level3_capabilities + return json.dumps( { "version": __version__, "feature_levels": [e.value for e in FeatureLevel], "families": sorted(SUPPORTED_FAMILIES), + "level3": level3_capabilities(), } ) diff --git a/src/markidocx/rest.py b/src/markidocx/rest.py index 0f97438..e266026 100644 --- a/src/markidocx/rest.py +++ b/src/markidocx/rest.py @@ -23,14 +23,14 @@ from markidocx.templates import FamilyRegistry class ResponseEnvelope(BaseModel): status: str outputs: Any = None - warnings: list[str] = [] + 
warnings: list[Any] = [] # list[WarningRecord.to_dict()] or list[str] (FR-1208) errors: list[str] = [] context: dict[str, Any] = {} def _ok( outputs: Any = None, - warnings: list[str] | None = None, + warnings: list[Any] | None = None, context: dict[str, Any] | None = None, ) -> ResponseEnvelope: return ResponseEnvelope( @@ -44,7 +44,7 @@ def _ok( def _error( errors: list[str], - warnings: list[str] | None = None, + warnings: list[Any] | None = None, context: dict[str, Any] | None = None, ) -> ResponseEnvelope: return ResponseEnvelope( @@ -158,11 +158,14 @@ def create_app() -> FastAPI: @app.get("/capabilities", response_model=ResponseEnvelope) def capabilities() -> ResponseEnvelope: - """Capability inspection — feature levels and families (FR-909).""" + """Capability inspection — feature levels and families (FR-909, FR-537).""" + from markidocx.level3 import capabilities_entry as level3_capabilities + return _ok( outputs={ "feature_levels": [e.value for e in FeatureLevel], "families": sorted(SUPPORTED_FAMILIES), + "level3": level3_capabilities(), }, context={"version": __version__}, ) @@ -227,17 +230,29 @@ def create_app() -> FastAPI: **req.context, "family": result.family, "feature_level": result.feature_level, + "output_state": result.output_state, } if result.success: docx_b64 = base64.b64encode(Path(result.output_path).read_bytes()).decode() + outputs: dict[str, Any] = { + "docx_base64": docx_b64, + "output_path": str(result.output_path), + } + if result.partial_level3: + outputs["partial_level3"] = True + outputs["missing_coverage"] = result.missing_coverage return ResponseEnvelope( status="ok", - outputs={"docx_base64": docx_b64, "output_path": str(result.output_path)}, - warnings=result.warnings, + outputs=outputs, + warnings=[w.to_dict() for w in result.warning_records], errors=[], context=ctx, ) - return _error(errors=result.errors, warnings=result.warnings, context=ctx) + return _error( + errors=result.errors, + warnings=[w.to_dict() for w in 
"""Cross-reference support for LEVEL3 markidocx (FR-531, FR-540).

Handles the round-trip of heading anchors ({#anchor}) and cross-reference
links ([text][anchor]) between Markdown and DOCX bookmarks/hyperlinks.
"""

from __future__ import annotations

import re
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from docx.text.paragraph import Paragraph as DocxParagraph

# Markdown patterns
ANCHOR_LABEL_RE = re.compile(r"\s*\{#([\w-]+)\}\s*$")
XREF_LINK_RE = re.compile(r"\[([^\]]+)\]\[([\w-]+)\]")

# DOCX XML namespaces
_W = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
_R = "http://schemas.openxmlformats.org/officeDocument/2006/relationships"
_XML_SPACE = "{http://www.w3.org/XML/1998/namespace}space"


def extract_anchor_from_heading(text: str) -> tuple[str, str | None]:
    """Strip a trailing `{#anchor}` suffix from heading text.

    Returns (clean_text, anchor_name) — anchor_name is None if no anchor is
    present, in which case *text* is returned unchanged.  Whitespace before
    the anchor is removed together with it.
    """
    found = ANCHOR_LABEL_RE.search(text)
    if found is None:
        return text, None
    return text[: found.start()], found.group(1)


def add_bookmark_to_paragraph(para: DocxParagraph, bookmark_name: str, bookmark_id: int) -> None:
    """Append a bookmark start/end pair to a paragraph's XML (FR-531).

    Both elements are added as the last children of the paragraph element and
    share *bookmark_id*.
    """
    # Imported here so the regex-only helpers in this module do not need lxml.
    from lxml import etree

    p_elem = para._p  # lxml element

    # <w:bookmarkStart w:id="N" w:name="bookmark_name"/>
    bm_start = etree.SubElement(p_elem, f"{{{_W}}}bookmarkStart")
    bm_start.set(f"{{{_W}}}id", str(bookmark_id))
    bm_start.set(f"{{{_W}}}name", bookmark_name)

    # <w:bookmarkEnd w:id="N"/>
    bm_end = etree.SubElement(p_elem, f"{{{_W}}}bookmarkEnd")
    bm_end.set(f"{{{_W}}}id", str(bookmark_id))


def add_internal_hyperlink(para: DocxParagraph, text: str, anchor: str) -> None:
    """Add an internal hyperlink run pointing to a bookmark anchor (FR-531).

    Appends a <w:hyperlink w:anchor="anchor"> element containing a single run
    styled "Hyperlink" whose text is *text*.
    """
    # Imported here so the regex-only helpers in this module do not need lxml.
    from lxml import etree

    p_elem = para._p

    hyperlink = etree.SubElement(p_elem, f"{{{_W}}}hyperlink")
    hyperlink.set(f"{{{_W}}}anchor", anchor)

    run = etree.SubElement(hyperlink, f"{{{_W}}}r")
    rpr = etree.SubElement(run, f"{{{_W}}}rPr")
    style = etree.SubElement(rpr, f"{{{_W}}}rStyle")
    style.set(f"{{{_W}}}val", "Hyperlink")
    t = etree.SubElement(run, f"{{{_W}}}t")
    t.text = text
    # Any leading/trailing whitespace (not only U+0020) needs
    # xml:space="preserve" or it may be dropped by consumers.
    if text != text.strip():
        t.set(_XML_SPACE, "preserve")


def render_paragraph_with_xrefs(
    para: DocxParagraph,
    text: str,
    known_anchors: set[str],
) -> None:
    """Render paragraph text, converting [text][anchor] to internal hyperlinks (FR-531).

    Text between links is added as plain runs.  A reference to an anchor that
    is not in *known_anchors* falls back to a plain run "text [→anchor]".
    """
    last_end = 0
    for m in XREF_LINK_RE.finditer(text):
        link_text, anchor = m.group(1), m.group(2)
        # Plain text between the previous link (or start) and this one
        if m.start() > last_end:
            para.add_run(text[last_end : m.start()])
        if anchor in known_anchors:
            add_internal_hyperlink(para, link_text, anchor)
        else:
            # Unknown anchor — render as plain text with a note
            para.add_run(f"{link_text} [→{anchor}]")
        last_end = m.end()
    # Remaining text after the last link
    if last_end < len(text):
        para.add_run(text[last_end:])


def has_xref_links(text: str) -> bool:
    """Return True if *text* contains any [text][anchor] patterns."""
    return XREF_LINK_RE.search(text) is not None


# ---------------------------------------------------------------------------
# Importer helpers
# ---------------------------------------------------------------------------


def extract_bookmarks_from_paragraph(para: DocxParagraph) -> list[str]:
    """Return bookmark names declared in the paragraph's XML, in document order.

    Names starting with "_" (Word-internal bookmarks such as _GoBack) and
    empty names are skipped.
    """
    names = (elem.get(f"{{{_W}}}name", "") for elem in para._p.iter(f"{{{_W}}}bookmarkStart"))
    return [name for name in names if name and not name.startswith("_")]


def extract_internal_hyperlinks_from_paragraph(
    para: DocxParagraph,
) -> list[tuple[str, str]]:
    """Return (text, anchor) pairs for internal hyperlinks in *para*.

    Internal = <w:hyperlink w:anchor="..."> with no relationship ID (r:id).
    The text is the concatenation of all <w:t> descendants; hyperlinks with
    no text are omitted.
    """
    links: list[tuple[str, str]] = []
    for hl in para._p.iter(f"{{{_W}}}hyperlink"):
        anchor = hl.get(f"{{{_W}}}anchor")
        # Only internal bookmark hyperlinks (no r:id)
        if not anchor or hl.get(f"{{{_R}}}id"):
            continue
        text = "".join(t.text for t in hl.iter(f"{{{_W}}}t") if t.text)
        if text:
            links.append((text, anchor))
    return links


# ---------------------------------------------------------------------------
# Differ helpers
# ---------------------------------------------------------------------------

XREF_ANCHOR_RE = re.compile(r"\{#([\w-]+)\}")
# Same pattern as XREF_LINK_RE; the name is kept for existing importers.
XREF_LINK_PATTERN = XREF_LINK_RE


def extract_anchors(text: str) -> set[str]:
    """Extract all {#anchor} declarations from Markdown text."""
    return set(XREF_ANCHOR_RE.findall(text))


def extract_xref_links(text: str) -> set[tuple[str, str]]:
    """Extract all (text, anchor) cross-ref link pairs from Markdown text."""
    return {(m.group(1), m.group(2)) for m in XREF_LINK_PATTERN.finditer(text)}
+ +## Results + +Our results confirm the predictions of [@brown2019] and extend the findings +of [@davis2022]. + +## Conclusion + +This work synthesises [@smith2020], [@jones2021], [@brown2019], [@davis2022], +and [@wilson2023]. + +## References + +- [@smith2020]: Smith, J. *Foundational Work*. Journal of Research, 2020. +- [@jones2021]: Jones, B. *Refinements and Extensions*. Proceedings, 2021. +- [@brown2019]: Brown, C. *The Core Technique*. Nature, 2019. +- [@davis2022]: Davis, A. *Further Development*. Science, 2022. +- [@wilson2023]: Wilson, E. *Recent Advances*. Review, 2023. diff --git a/tests/regression/level3/combined_document.md b/tests/regression/level3/combined_document.md new file mode 100644 index 0000000..79b5f73 --- /dev/null +++ b/tests/regression/level3/combined_document.md @@ -0,0 +1,63 @@ +# Combined LEVEL3 Feature Document {#combined} + +This document exercises all LEVEL3 constructs in a single file. + +## Introduction {#intro} + +This document demonstrates the full LEVEL3 feature set as described by [@smith2020]. +See [Background][bg] for context. + +## Background {#bg} + +Context and prerequisites are discussed here. Refer to [Introduction][intro] +for the problem statement. + +## Architecture {#arch-section} + +The system architecture is shown below. + +![System Architecture](arch.png){#fig:arch} + +The architecture overview in [Architecture][arch-section] establishes the +baseline from which the data flow is derived. + +## Data Flow + +The data flow diagram illustrates message routing. + +```mermaid +graph LR + A[Input] --> B[Processor] + B --> C[Output] +``` + +## Algorithm {#algo} + +The algorithm formalises the approach described in [@jones2021]. + +```graphviz +digraph algorithm { + start -> step1 -> step2 -> end; +} +``` + +## Results {#results} + +Experimental results confirm the algorithm in [Algorithm][algo]. 
+ +![Experimental Results](results.png){#fig:results} + +The results align with predictions from [@brown2019] and the architectural +choices described in [Architecture][arch-section]. + +## Conclusion {#conclusion} + +All LEVEL3 constructs — cross-references, figures, diagrams, and citations — +have been demonstrated. See [Introduction][intro] through [Results][results] +for the complete narrative. + +## References + +- [@smith2020]: Smith, J. *LEVEL3 Design Principles*. 2020. +- [@jones2021]: Jones, B. *Algorithm Formalisation*. 2021. +- [@brown2019]: Brown, C. *Experimental Validation*. 2019. diff --git a/tests/regression/level3/diagrams_document.md b/tests/regression/level3/diagrams_document.md new file mode 100644 index 0000000..ba196af --- /dev/null +++ b/tests/regression/level3/diagrams_document.md @@ -0,0 +1,44 @@ +# Document with Diagram Sources + +## State Machine + +The following Mermaid diagram describes the state machine: + +```mermaid +stateDiagram-v2 + [*] --> Idle + Idle --> Processing: start + Processing --> Done: complete + Processing --> Error: fail + Done --> [*] + Error --> Idle: reset +``` + +## Dependency Graph + +The Graphviz diagram shows dependencies: + +```graphviz +digraph G { + A -> B; + A -> C; + B -> D; + C -> D; +} +``` + +## Sequence + +The PlantUML sequence diagram: + +```plantuml +@startuml +Alice -> Bob: Request +Bob --> Alice: Response +Alice -> Carol: Forward +@enduml +``` + +## Summary + +All three diagram types are supported in LEVEL3 source-only mode. diff --git a/tests/regression/level3/figures_document.md b/tests/regression/level3/figures_document.md new file mode 100644 index 0000000..ec46b59 --- /dev/null +++ b/tests/regression/level3/figures_document.md @@ -0,0 +1,29 @@ +# Technical Report with Figures + +## Overview + +This document contains multiple numbered figures for LEVEL3 round-trip testing. + +## System Architecture + +The overall architecture is illustrated below. 
+ +![System Architecture Overview](figures/architecture.png){#fig:arch} + +The architecture shows the main components and their interactions. + +## Data Flow + +The data flow is shown in the following figure. + +![Data Flow Diagram](figures/dataflow.png){#fig:dataflow} + +Compare the architecture in [fig:arch] with the data flow above. + +## Results + +Final results are captured in this chart. + +![Results Summary Chart](figures/results.png){#fig:results} + +The chart confirms the findings from the data flow in Figure 2. diff --git a/tests/regression/level3/xref_document.md b/tests/regression/level3/xref_document.md new file mode 100644 index 0000000..35b7688 --- /dev/null +++ b/tests/regression/level3/xref_document.md @@ -0,0 +1,21 @@ +# Introduction {#intro} + +This document demonstrates cross-reference support for LEVEL3 processing. + +## Background {#bg} + +The background section provides context. See [Introduction][intro] for the overview. + +## Methodology {#method} + +This section describes the approach. Refer to [Background][bg] for prerequisites, +and see [Introduction][intro] for the original problem statement. + +## Results {#results} + +Results are discussed here. The methodology in [Methodology][method] led to these findings. + +## Conclusion + +This concludes the document. All sections from [Introduction][intro] through +[Results][results] have been covered. diff --git a/tests/regression/test_level3_roundtrip.py b/tests/regression/test_level3_roundtrip.py new file mode 100644 index 0000000..2ace8aa --- /dev/null +++ b/tests/regression/test_level3_roundtrip.py @@ -0,0 +1,261 @@ +"""LEVEL3 end-to-end round-trip regression tests (FR-1100, MRKD-WP-0003 T07). + +Tests the full build → import → compare cycle for each corpus file in +tests/regression/level3/, using feature_level: level3. + +All LEVEL1 regression tests must remain green (non-regression gate). 
+""" + +from __future__ import annotations + +from pathlib import Path + +import pytest +import yaml + +from markidocx.builder import build_document +from markidocx.differ import compare +from markidocx.importer import import_document +from markidocx.manifest import load_manifest + +# Corpus files in tests/regression/level3/ +CORPUS_DIR = Path(__file__).parent / "level3" +CORPUS_FILES = [ + "xref_document.md", + "figures_document.md", + "diagrams_document.md", + "bibliography_document.md", + "combined_document.md", +] + + +def _make_level3_project(tmp_path: Path, markdown: str, name: str = "test") -> Path: + (tmp_path / "doc.md").write_text(markdown, encoding="utf-8") + manifest_path = tmp_path / "manifest.yaml" + manifest_path.write_text( + yaml.dump( + { + "project": {"name": name, "feature_level": "level3", "family": "article"}, + "sources": [{"path": "doc.md"}], + "output": {"dir": "./dist"}, + } + ) + ) + (tmp_path / "dist").mkdir() + return manifest_path + + +# --------------------------------------------------------------------------- +# Corpus round-trip tests +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("corpus_file", CORPUS_FILES) +def test_level3_corpus_builds(tmp_path: Path, corpus_file: str) -> None: + """Each corpus file builds successfully under LEVEL3.""" + md = (CORPUS_DIR / corpus_file).read_text(encoding="utf-8") + manifest_path = _make_level3_project(tmp_path, md, name=corpus_file.replace(".md", "")) + manifest = load_manifest(manifest_path) + + result = build_document(manifest) + assert result.success, f"Build failed for {corpus_file}: {result.errors}" + assert result.output_path.exists() + assert result.feature_level == "level3" + + +@pytest.mark.parametrize("corpus_file", CORPUS_FILES) +def test_level3_corpus_imports(tmp_path: Path, corpus_file: str) -> None: + """Each corpus file imports successfully after build.""" + md = (CORPUS_DIR / corpus_file).read_text(encoding="utf-8") + 
manifest_path = _make_level3_project(tmp_path, md, name=corpus_file.replace(".md", "")) + manifest = load_manifest(manifest_path) + + build_result = build_document(manifest) + assert build_result.success, f"Build failed for {corpus_file}" + + import_result = import_document(manifest, build_result.output_path) + assert import_result.success, f"Import failed for {corpus_file}: {import_result.warnings}" + + +@pytest.mark.parametrize("corpus_file", CORPUS_FILES) +def test_level3_corpus_no_unexpected_breakage(tmp_path: Path, corpus_file: str) -> None: + """Round-trip diff for each corpus file has no broken headings.""" + md = (CORPUS_DIR / corpus_file).read_text(encoding="utf-8") + manifest_path = _make_level3_project(tmp_path, md, name=corpus_file.replace(".md", "")) + manifest = load_manifest(manifest_path) + + build_result = build_document(manifest) + assert build_result.success + + import_result = import_document(manifest, build_result.output_path) + assert import_result.success + + reimported = import_result.output_files[0].read_text(encoding="utf-8") + report = compare(md, reimported) + + # Headings must not be broken + broken_headings = [b for b in report.broken if b.startswith("heading:")] + assert not broken_headings, ( + f"Broken headings in {corpus_file}: {broken_headings}" + ) + + +# --------------------------------------------------------------------------- +# Specific corpus: xref_document — cross-ref anchors preserved +# --------------------------------------------------------------------------- + + +def test_xref_document_anchors_preserved(tmp_path: Path) -> None: + md = (CORPUS_DIR / "xref_document.md").read_text(encoding="utf-8") + manifest_path = _make_level3_project(tmp_path, md, name="xref") + manifest = load_manifest(manifest_path) + + build_result = build_document(manifest) + assert build_result.success + + import_result = import_document(manifest, build_result.output_path) + assert import_result.success + + reimported = 
import_result.output_files[0].read_text(encoding="utf-8") + # Core anchors must survive + assert "{#intro}" in reimported + assert "{#bg}" in reimported + assert "{#method}" in reimported + assert "{#results}" in reimported + + +# --------------------------------------------------------------------------- +# Specific corpus: figures_document — figure labels preserved +# --------------------------------------------------------------------------- + + +def test_figures_document_labels_preserved(tmp_path: Path) -> None: + md = (CORPUS_DIR / "figures_document.md").read_text(encoding="utf-8") + manifest_path = _make_level3_project(tmp_path, md, name="figures") + manifest = load_manifest(manifest_path) + + build_result = build_document(manifest) + assert build_result.success + + import_result = import_document(manifest, build_result.output_path) + assert import_result.success + + reimported = import_result.output_files[0].read_text(encoding="utf-8") + assert "fig:arch" in reimported + assert "fig:dataflow" in reimported + assert "fig:results" in reimported + + +# --------------------------------------------------------------------------- +# Specific corpus: diagrams_document — diagram sources preserved +# --------------------------------------------------------------------------- + + +def test_diagrams_document_sources_preserved(tmp_path: Path, monkeypatch) -> None: + """Diagram sources survive round-trip in source-only path.""" + import shutil + + monkeypatch.setattr(shutil, "which", lambda _cmd: None) + md = (CORPUS_DIR / "diagrams_document.md").read_text(encoding="utf-8") + manifest_path = _make_level3_project(tmp_path, md, name="diagrams") + manifest = load_manifest(manifest_path) + + build_result = build_document(manifest) + assert build_result.success + + import_result = import_document(manifest, build_result.output_path) + assert import_result.success + + reimported = import_result.output_files[0].read_text(encoding="utf-8") + # At least one diagram type must 
appear in reimported + assert "mermaid" in reimported or "graphviz" in reimported or "plantuml" in reimported + + +# --------------------------------------------------------------------------- +# Specific corpus: bibliography_document — citation keys preserved +# --------------------------------------------------------------------------- + + +def test_bibliography_document_citations_preserved(tmp_path: Path) -> None: + md = (CORPUS_DIR / "bibliography_document.md").read_text(encoding="utf-8") + manifest_path = _make_level3_project(tmp_path, md, name="bibliography") + manifest = load_manifest(manifest_path) + + build_result = build_document(manifest) + assert build_result.success + + import_result = import_document(manifest, build_result.output_path) + assert import_result.success + + reimported = import_result.output_files[0].read_text(encoding="utf-8") + assert "smith2020" in reimported + assert "jones2021" in reimported + assert "brown2019" in reimported + + +# --------------------------------------------------------------------------- +# Specific corpus: combined_document — all LEVEL3 constructs +# --------------------------------------------------------------------------- + + +def test_combined_document_roundtrip(tmp_path: Path, monkeypatch) -> None: + """Combined document with all LEVEL3 constructs survives build+import.""" + import shutil + + monkeypatch.setattr(shutil, "which", lambda _cmd: None) + md = (CORPUS_DIR / "combined_document.md").read_text(encoding="utf-8") + manifest_path = _make_level3_project(tmp_path, md, name="combined") + manifest = load_manifest(manifest_path) + + build_result = build_document(manifest) + assert build_result.success + + import_result = import_document(manifest, build_result.output_path) + assert import_result.success + + reimported = import_result.output_files[0].read_text(encoding="utf-8") + + # Anchors preserved + assert "{#intro}" in reimported + + # Figures preserved (at least the label) + assert "fig:arch" in 
reimported + + # Citations preserved + assert "smith2020" in reimported + + +# --------------------------------------------------------------------------- +# CLI: markidocx test executes LEVEL1 + LEVEL3 corpus (non-regression gate) +# --------------------------------------------------------------------------- + + +def test_level1_regression_still_passes(tmp_path: Path) -> None: + """LEVEL1 round-trip must remain green after LEVEL3 changes (non-regression).""" + from tests.regression.test_roundtrip import LEVEL1_MARKDOWN + + (tmp_path / "doc.md").write_text(LEVEL1_MARKDOWN, encoding="utf-8") + manifest_path = tmp_path / "manifest.yaml" + manifest_path.write_text( + yaml.dump( + { + "project": {"name": "l1-nonreg", "feature_level": "level1", "family": "article"}, + "sources": [{"path": "doc.md"}], + "output": {"dir": "./dist"}, + } + ) + ) + (tmp_path / "dist").mkdir() + manifest = load_manifest(manifest_path) + + build_result = build_document(manifest) + assert build_result.success + assert not build_result.errors + + import_result = import_document(manifest, build_result.output_path) + assert import_result.success + + reimported = import_result.output_files[0].read_text(encoding="utf-8") + report = compare(LEVEL1_MARKDOWN, reimported) + broken_headings = [b for b in report.broken if b.startswith("heading:")] + assert not broken_headings diff --git a/tests/test_error_framework.py b/tests/test_error_framework.py new file mode 100644 index 0000000..3da9fef --- /dev/null +++ b/tests/test_error_framework.py @@ -0,0 +1,380 @@ +"""Tests for structured error & warning framework (FR-1201–1210).""" + +from __future__ import annotations + +import textwrap +from pathlib import Path + +# --------------------------------------------------------------------------- +# WarningRecord / FailureRecord / OutputState types (FR-1208–1210) +# --------------------------------------------------------------------------- + + +class TestWarningRecord: + def test_to_dict(self) -> None: + from 
markidocx.errors import Severity, WarningRecord + + w = WarningRecord(severity=Severity.WARNING, reason="unsupported-construct", construct="html_block") + d = w.to_dict() + assert d["severity"] == "warning" + assert d["reason"] == "unsupported-construct" + assert d["construct"] == "html_block" + + def test_str_with_construct(self) -> None: + from markidocx.errors import WarningRecord + + w = WarningRecord(severity="warning", reason="test-reason", construct="my-token") + assert "warning" in str(w) + assert "test-reason" in str(w) + assert "my-token" in str(w) + + def test_str_without_construct(self) -> None: + from markidocx.errors import WarningRecord + + w = WarningRecord(severity="info", reason="test-reason") + s = str(w) + assert "info" in s + assert "test-reason" in s + + +class TestFailureRecord: + def test_to_dict(self) -> None: + from markidocx.errors import FailureRecord, Severity + + f = FailureRecord(severity=Severity.ERROR, reason="docx-not-found", construct="some.docx") + d = f.to_dict() + assert d["severity"] == "error" + assert d["reason"] == "docx-not-found" + + +class TestOutputState: + def test_all_states_defined(self) -> None: + from markidocx.errors import OutputState + + assert OutputState.FINAL == "final" + assert OutputState.PARTIAL == "partial" + assert OutputState.FALLBACK == "fallback" + assert OutputState.DEGRADED == "degraded" + assert OutputState.UNRESOLVED == "unresolved" + + +# --------------------------------------------------------------------------- +# Builder emits WarningRecord for unsupported constructs (FR-1203, FR-1205) +# --------------------------------------------------------------------------- + + +class TestBuilderWarningRecords: + def test_unsupported_html_emits_warning_record(self, tmp_path: Path) -> None: + from markidocx.builder import build_document + from markidocx.errors import Severity + from markidocx.manifest import load_manifest + + (tmp_path / "doc.md").write_text( + "# Hello\n\n
raw html
\n\nNormal paragraph.", + encoding="utf-8", + ) + (tmp_path / "manifest.yaml").write_text( + textwrap.dedent("""\ + project: + name: test + feature_level: level1 + family: article + sources: + - path: doc.md + output: + dir: ./dist + """), + encoding="utf-8", + ) + m = load_manifest(tmp_path / "manifest.yaml") + result = build_document(m) + assert result.success + assert len(result.warning_records) > 0 + html_warnings = [w for w in result.warning_records if "html" in w.construct] + assert html_warnings, "Expected warning for html construct" + assert all(w.severity == Severity.WARNING for w in html_warnings) + + def test_warning_records_have_reason(self, tmp_path: Path) -> None: + from markidocx.builder import build_document + from markidocx.manifest import load_manifest + + (tmp_path / "doc.md").write_text( + "# Hello\n\n
raw html
", + encoding="utf-8", + ) + (tmp_path / "manifest.yaml").write_text( + textwrap.dedent("""\ + project: + name: test + feature_level: level1 + family: article + sources: + - path: doc.md + output: + dir: ./dist + """), + encoding="utf-8", + ) + m = load_manifest(tmp_path / "manifest.yaml") + result = build_document(m) + for w in result.warning_records: + assert w.reason, "WarningRecord must have a non-empty reason" + + def test_warnings_property_returns_strings(self, tmp_path: Path) -> None: + from markidocx.builder import build_document + from markidocx.manifest import load_manifest + + (tmp_path / "doc.md").write_text("# Hello\n\n
html
", encoding="utf-8") + (tmp_path / "manifest.yaml").write_text( + textwrap.dedent("""\ + project: + name: test + feature_level: level1 + family: article + sources: + - path: doc.md + output: + dir: ./dist + """), + encoding="utf-8", + ) + m = load_manifest(tmp_path / "manifest.yaml") + result = build_document(m) + assert all(isinstance(w, str) for w in result.warnings) + + def test_output_state_on_clean_build(self, tmp_path: Path) -> None: + from markidocx.builder import build_document + from markidocx.errors import OutputState + from markidocx.manifest import load_manifest + + (tmp_path / "doc.md").write_text("# Hello\n\nContent.", encoding="utf-8") + (tmp_path / "manifest.yaml").write_text( + textwrap.dedent("""\ + project: + name: clean + feature_level: level1 + family: article + sources: + - path: doc.md + output: + dir: ./dist + """), + encoding="utf-8", + ) + m = load_manifest(tmp_path / "manifest.yaml") + result = build_document(m) + assert result.output_state == OutputState.FINAL + + +# --------------------------------------------------------------------------- +# Importer emits WarningRecord for errors and fallback paths (FR-1206, FR-1207) +# --------------------------------------------------------------------------- + + +class TestImporterWarningRecords: + def test_not_found_emits_error_warning_record(self, tmp_path: Path) -> None: + from markidocx.errors import OutputState, Severity + from markidocx.importer import import_document + from markidocx.manifest import load_manifest + + (tmp_path / "doc.md").write_text("# Hello", encoding="utf-8") + (tmp_path / "manifest.yaml").write_text( + textwrap.dedent("""\ + project: + name: test + feature_level: level1 + family: article + sources: + - path: doc.md + output: + dir: ./dist + """), + encoding="utf-8", + ) + m = load_manifest(tmp_path / "manifest.yaml") + result = import_document(m, tmp_path / "missing.docx") + assert not result.success + assert result.output_state == OutputState.UNRESOLVED + assert 
len(result.warning_records) > 0 + assert result.warning_records[0].severity == Severity.ERROR + assert result.warning_records[0].reason == "docx-not-found" + + def test_warnings_property_returns_strings(self, tmp_path: Path) -> None: + from markidocx.importer import import_document + from markidocx.manifest import load_manifest + + (tmp_path / "doc.md").write_text("# Hello", encoding="utf-8") + (tmp_path / "manifest.yaml").write_text( + textwrap.dedent("""\ + project: + name: test + feature_level: level1 + family: article + sources: + - path: doc.md + output: + dir: ./dist + """), + encoding="utf-8", + ) + m = load_manifest(tmp_path / "manifest.yaml") + result = import_document(m, tmp_path / "missing.docx") + assert all(isinstance(w, str) for w in result.warnings) + + def test_fallback_emits_fallback_warning(self, tmp_path: Path) -> None: + """Multi-source import that can't redistribute produces fallback WarningRecord.""" + from markidocx.builder import build_document + from markidocx.errors import OutputState + from markidocx.importer import import_document + from markidocx.manifest import load_manifest + + # Create two source files — the DOCX will have a single H1 so redistribution fails + (tmp_path / "a.md").write_text("# Alpha\n\nContent.", encoding="utf-8") + (tmp_path / "b.md").write_text("# Beta\n\nContent.", encoding="utf-8") + (tmp_path / "manifest.yaml").write_text( + textwrap.dedent("""\ + project: + name: multi + feature_level: level1 + family: article + sources: + - path: a.md + - path: b.md + output: + dir: ./dist + """), + encoding="utf-8", + ) + m = load_manifest(tmp_path / "manifest.yaml") + + # Build first to get a DOCX + build_result = build_document(m) + assert build_result.success + + # Now import with a manifest that has 3 sources (mismatch) + (tmp_path / "c.md").write_text("# Gamma\n\nContent.", encoding="utf-8") + (tmp_path / "manifest3.yaml").write_text( + textwrap.dedent("""\ + project: + name: multi + feature_level: level1 + family: 
article + sources: + - path: a.md + - path: b.md + - path: c.md + output: + dir: ./dist + """), + encoding="utf-8", + ) + m3 = load_manifest(tmp_path / "manifest3.yaml") + result = import_document(m3, build_result.output_path) + assert result.success + assert result.mapping_status == "merged" + assert result.output_state == OutputState.FALLBACK + fallback_warnings = [w for w in result.warning_records if w.reason == "fallback"] + assert fallback_warnings, "Expected fallback WarningRecord" + + +# --------------------------------------------------------------------------- +# Differ output_state (FR-1204) +# --------------------------------------------------------------------------- + + +class TestDifferOutputState: + def test_final_state_on_clean_diff(self) -> None: + from markidocx.differ import compare + from markidocx.errors import OutputState + + text = "# Hello\n\nSome paragraph.\n\n- item one\n- item two" + report = compare(text, text) + assert not report.has_drift + assert report.output_state == OutputState.FINAL + + def test_degraded_state_on_degraded_diff(self) -> None: + from markidocx.differ import compare + from markidocx.errors import OutputState + + original = "# Hello\n\n- item one\n- item two\n- item three" + reimported = "# Hello\n\n- item one" + report = compare(original, reimported) + assert report.has_drift + assert report.output_state in (OutputState.DEGRADED, OutputState.PARTIAL) + + def test_partial_state_on_broken_diff(self) -> None: + from markidocx.differ import compare + from markidocx.errors import OutputState + + original = "# Section A\n\n## Sub\n\nParagraph." 
+ reimported = "" + report = compare(original, reimported) + assert report.has_drift + assert report.output_state == OutputState.PARTIAL + + +# --------------------------------------------------------------------------- +# REST response envelope warnings are WarningRecord dicts (FR-1208) +# --------------------------------------------------------------------------- + + +class TestRestWarningRecords: + def test_build_warnings_are_dicts(self, tmp_path: Path) -> None: + """When build produces warnings, REST response warnings are dicts, not bare strings.""" + + from fastapi.testclient import TestClient + + from markidocx.rest import create_app + + manifest_yaml = textwrap.dedent("""\ + project: + name: test + feature_level: level1 + family: article + sources: + - path: doc.md + output: + dir: ./dist + """) + # HTML in source will produce warnings + sources = [{"name": "doc.md", "content": "# Hello\n\n
html
"}] + client = TestClient(create_app()) + resp = client.post("/build", json={"manifest_yaml": manifest_yaml, "sources": sources}) + assert resp.status_code == 200 + body = resp.json() + warnings = body.get("warnings", []) + # Each warning should be a dict with severity/reason/construct keys + for w in warnings: + assert isinstance(w, dict), f"Expected dict warning, got {type(w)}: {w}" + assert "severity" in w + assert "reason" in w + + def test_import_warnings_are_dicts_on_failure(self) -> None: + """Import failure warns with WarningRecord dict, not bare string.""" + import base64 + + from fastapi.testclient import TestClient + + from markidocx.rest import create_app + + manifest_yaml = textwrap.dedent("""\ + project: + name: test + feature_level: level1 + family: article + sources: + - path: doc.md + output: + dir: ./dist + """) + # Send an invalid (empty) DOCX + empty_docx = base64.b64encode(b"not-a-docx").decode() + client = TestClient(create_app()) + resp = client.post( + "/import", + json={"manifest_yaml": manifest_yaml, "docx_base64": empty_docx}, + ) + body = resp.json() + warnings = body.get("warnings", []) + for w in warnings: + assert isinstance(w, dict), f"Expected dict warning, got {type(w)}: {w}" diff --git a/tests/test_level3_bibliography.py b/tests/test_level3_bibliography.py new file mode 100644 index 0000000..7f2660b --- /dev/null +++ b/tests/test_level3_bibliography.py @@ -0,0 +1,349 @@ +"""Tests for LEVEL3 bibliography & citation support (FR-535, FR-536, FR-542).""" + +from __future__ import annotations + +import textwrap +from pathlib import Path + +LEVEL3_MANIFEST = textwrap.dedent("""\ + project: + name: bib-test + feature_level: level3 + family: article + sources: + - path: doc.md + output: + dir: ./dist +""") + + +def _make_project(tmp_path: Path, markdown: str) -> Path: + (tmp_path / "doc.md").write_text(markdown, encoding="utf-8") + (tmp_path / "manifest.yaml").write_text(LEVEL3_MANIFEST, encoding="utf-8") + return tmp_path + + +# 
--------------------------------------------------------------------------- +# bibliography module helpers +# --------------------------------------------------------------------------- + + +class TestBibliographyHelpers: + def test_has_citations_true(self) -> None: + from markidocx.bibliography import has_citations + + assert has_citations("See [@smith2020] for details.") + + def test_has_citations_false(self) -> None: + from markidocx.bibliography import has_citations + + assert not has_citations("Normal paragraph without citations.") + + def test_extract_citation_keys(self) -> None: + from markidocx.bibliography import extract_citation_keys + + text = "See [@smith2020] and [@jones2021:chap] for more." + keys = extract_citation_keys(text) + assert "smith2020" in keys + assert "jones2021:chap" in keys + + def test_is_references_heading(self) -> None: + from markidocx.bibliography import is_references_heading + + assert is_references_heading("## References") + assert is_references_heading("# References") + assert is_references_heading("### References") + assert not is_references_heading("## Introduction") + + def test_parse_reference_entry(self) -> None: + from markidocx.bibliography import parse_reference_entry + + result = parse_reference_entry("- [@smith2020]: Smith, J. *Title*. 2020.") + assert result is not None + key, entry = result + assert key == "smith2020" + assert "Smith, J." in entry + + def test_extract_references_section(self) -> None: + from markidocx.bibliography import extract_references_section + + md = textwrap.dedent("""\ + # Document + + See [@smith2020]. + + ## References + + - [@smith2020]: Smith, J. *A Book*. 2020. + - [@jones2021]: Jones, B. *Another*. 2021. 
+ """) + entries, text_without = extract_references_section(md) + assert len(entries) == 2 + assert entries[0][0] == "smith2020" + assert entries[1][0] == "jones2021" + assert "## References" not in text_without + + def test_render_citation_text_unchanged(self) -> None: + from markidocx.bibliography import render_citation_text + + text = "See [@smith2020] for details." + assert render_citation_text(text) == text + + +# --------------------------------------------------------------------------- +# Builder: citations and references section (FR-535) +# --------------------------------------------------------------------------- + + +class TestBuilderBibliography: + def test_build_with_citation_succeeds(self, tmp_path: Path) -> None: + from markidocx.builder import build_document + from markidocx.manifest import load_manifest + + md = textwrap.dedent("""\ + # Document + + As shown by [@smith2020], the approach works. + + ## References + + - [@smith2020]: Smith, J. *A Work*. 2020. + """) + _make_project(tmp_path, md) + m = load_manifest(tmp_path / "manifest.yaml") + result = build_document(m) + assert result.success + assert result.output_path.exists() + + def test_build_docx_contains_citation_marker(self, tmp_path: Path) -> None: + """The built DOCX should contain the citation text.""" + from docx import Document as DocxReader + + from markidocx.builder import build_document + from markidocx.manifest import load_manifest + + md = "# Doc\n\nSee [@smith2020].\n\n## References\n\n- [@smith2020]: Smith. *T*. 2020." + _make_project(tmp_path, md) + m = load_manifest(tmp_path / "manifest.yaml") + result = build_document(m) + assert result.success + + doc = DocxReader(str(result.output_path)) + texts = [p.text for p in doc.paragraphs] + citation_paras = [t for t in texts if "smith2020" in t] + assert citation_paras, f"No citation found in DOCX. 
Paragraphs: {texts}" + + def test_build_docx_contains_references_heading(self, tmp_path: Path) -> None: + """The built DOCX should have a References heading.""" + from docx import Document as DocxReader + + from markidocx.builder import build_document + from markidocx.manifest import load_manifest + + md = "# Doc\n\nText.\n\n## References\n\n- [@k1]: Author. *T*. 2020." + _make_project(tmp_path, md) + m = load_manifest(tmp_path / "manifest.yaml") + result = build_document(m) + assert result.success + + doc = DocxReader(str(result.output_path)) + texts = [p.text for p in doc.paragraphs] + assert "References" in texts, f"No References heading. Paragraphs: {texts}" + + def test_build_multi_citation_document(self, tmp_path: Path) -> None: + """Multiple citations and references entries all appear in DOCX.""" + from docx import Document as DocxReader + + from markidocx.builder import build_document + from markidocx.manifest import load_manifest + + md = textwrap.dedent("""\ + # Introduction + + According to [@smith2020] and [@jones2021], this is true. + + ## References + + - [@smith2020]: Smith, J. *Work A*. 2020. + - [@jones2021]: Jones, B. *Work B*. 2021. 
+ """) + _make_project(tmp_path, md) + m = load_manifest(tmp_path / "manifest.yaml") + result = build_document(m) + assert result.success + + doc = DocxReader(str(result.output_path)) + all_text = " ".join(p.text for p in doc.paragraphs) + assert "smith2020" in all_text + assert "jones2021" in all_text + + +# --------------------------------------------------------------------------- +# Importer: citations and references restoration (FR-536) +# --------------------------------------------------------------------------- + + +class TestImporterBibliography: + def test_roundtrip_preserves_citation(self, tmp_path: Path) -> None: + from markidocx.builder import build_document + from markidocx.importer import import_document + from markidocx.manifest import load_manifest + + md = "# Doc\n\nSee [@smith2020].\n\n## References\n\n- [@smith2020]: Smith. *T*. 2020." + _make_project(tmp_path, md) + m = load_manifest(tmp_path / "manifest.yaml") + + build_result = build_document(m) + assert build_result.success + + import_result = import_document(m, build_result.output_path) + assert import_result.success + + reimported = import_result.output_files[0].read_text(encoding="utf-8") + assert "smith2020" in reimported + + def test_roundtrip_preserves_reference_entry(self, tmp_path: Path) -> None: + from markidocx.builder import build_document + from markidocx.importer import import_document + from markidocx.manifest import load_manifest + + md = textwrap.dedent("""\ + # Doc + + See [@k1]. + + ## References + + - [@k1]: Author. *Title*. 2020. 
+ """) + _make_project(tmp_path, md) + m = load_manifest(tmp_path / "manifest.yaml") + + build_result = build_document(m) + assert build_result.success + + import_result = import_document(m, build_result.output_path) + assert import_result.success + + reimported = import_result.output_files[0].read_text(encoding="utf-8") + assert "k1" in reimported + + +# --------------------------------------------------------------------------- +# Differ: citation and bibliography comparison (FR-542) +# --------------------------------------------------------------------------- + + +class TestDifferBibliography: + def test_preserved_citation(self) -> None: + from markidocx.differ import compare + + text = "# Doc\n\nSee [@smith2020].\n\n## References\n\n- [@smith2020]: Smith. *T*. 2020." + report = compare(text, text) + assert any("citation:[@smith2020]" in p for p in report.preserved) + + def test_missing_citation_broken(self) -> None: + from markidocx.differ import compare + + original = "See [@smith2020]." + reimported = "See something." + report = compare(original, reimported) + assert any("citation:missing '[@smith2020]'" in b for b in report.broken) + assert report.has_drift + + def test_missing_reference_entry_degraded(self) -> None: + from markidocx.differ import compare + + original = textwrap.dedent("""\ + See [@k1]. + + ## References + + - [@k1]: Author. *T*. 2020. + """) + reimported = "See [@k1]." + report = compare(original, reimported) + assert any("reference-entry" in d for d in report.degraded) + + def test_unresolvable_citation_emits_warning(self) -> None: + """Missing citation in reimported emits citation-ambiguity warning.""" + from markidocx.bibliography import compare_citations + from markidocx.errors import WarningRecord + + original = "See [@missing]." + reimported = "See something." 
+ preserved: list[str] = [] + degraded: list[str] = [] + broken: list[str] = [] + warning_records: list[WarningRecord] = [] + + compare_citations(original, reimported, preserved, degraded, broken, warning_records) + + ambiguity = [w for w in warning_records if w.reason == "citation-ambiguity"] + assert ambiguity, "Expected citation-ambiguity warning" + assert ambiguity[0].construct == "@missing" + + +# --------------------------------------------------------------------------- +# Single citation round-trip +# --------------------------------------------------------------------------- + + +class TestCitationRoundTrip: + def test_single_citation_roundtrip(self, tmp_path: Path) -> None: + from markidocx.builder import build_document + from markidocx.differ import compare + from markidocx.importer import import_document + from markidocx.manifest import load_manifest + + md = textwrap.dedent("""\ + # Introduction + + According to [@smith2020], things are good. + + ## References + + - [@smith2020]: Smith, J. *Good Stuff*. 2020. + """) + _make_project(tmp_path, md) + m = load_manifest(tmp_path / "manifest.yaml") + + build_result = build_document(m) + assert build_result.success + + import_result = import_document(m, build_result.output_path) + assert import_result.success + + reimported = import_result.output_files[0].read_text(encoding="utf-8") + report = compare(md, reimported) + + broken_citations = [b for b in report.broken if "citation" in b] + assert not broken_citations, f"Broken citations: {broken_citations}" + + def test_multi_citation_document(self, tmp_path: Path) -> None: + from markidocx.builder import build_document + from markidocx.importer import import_document + from markidocx.manifest import load_manifest + + md = textwrap.dedent("""\ + # Paper + + First point from [@a2020]. Second from [@b2021]. + + ## References + + - [@a2020]: A. *Work A*. 2020. + - [@b2021]: B. *Work B*. 2021. 
+ """) + _make_project(tmp_path, md) + m = load_manifest(tmp_path / "manifest.yaml") + + build_result = build_document(m) + assert build_result.success + + import_result = import_document(m, build_result.output_path) + assert import_result.success + + reimported = import_result.output_files[0].read_text(encoding="utf-8") + assert "a2020" in reimported + assert "b2021" in reimported diff --git a/tests/test_level3_diagrams.py b/tests/test_level3_diagrams.py new file mode 100644 index 0000000..432d043 --- /dev/null +++ b/tests/test_level3_diagrams.py @@ -0,0 +1,231 @@ +"""Tests for LEVEL3 auto-diagram support (FR-533, FR-534).""" + +from __future__ import annotations + +import textwrap +from pathlib import Path + +LEVEL3_MANIFEST = textwrap.dedent("""\ + project: + name: diag-test + feature_level: level3 + family: article + sources: + - path: doc.md + output: + dir: ./dist +""") + + +def _make_project(tmp_path: Path, markdown: str) -> Path: + (tmp_path / "doc.md").write_text(markdown, encoding="utf-8") + (tmp_path / "manifest.yaml").write_text(LEVEL3_MANIFEST, encoding="utf-8") + return tmp_path + + +# --------------------------------------------------------------------------- +# diagrams module helpers +# --------------------------------------------------------------------------- + + +class TestDiagramHelpers: + def test_is_diagram_info_mermaid(self) -> None: + from markidocx.diagrams import is_diagram_info + + assert is_diagram_info("mermaid") + + def test_is_diagram_info_graphviz(self) -> None: + from markidocx.diagrams import is_diagram_info + + assert is_diagram_info("graphviz") + + def test_is_diagram_info_plantuml(self) -> None: + from markidocx.diagrams import is_diagram_info + + assert is_diagram_info("plantuml") + + def test_is_diagram_info_python_false(self) -> None: + from markidocx.diagrams import is_diagram_info + + assert not is_diagram_info("python") + assert not is_diagram_info("") + assert not is_diagram_info(None) + + def 
test_is_diagram_source_marker(self) -> None: + from markidocx.diagrams import is_diagram_source_marker + + assert is_diagram_source_marker("diagram-source:mermaid\ngraph TD\nA-->B") + assert not is_diagram_source_marker("normal text") + + def test_parse_diagram_source_marker(self) -> None: + from markidocx.diagrams import parse_diagram_source_marker + + source = "graph TD\nA-->B" + result = parse_diagram_source_marker(f"diagram-source:mermaid\n{source}") + assert result is not None + diagram_type, parsed_source = result + assert diagram_type == "mermaid" + assert parsed_source == source + + def test_reconstruct_diagram_md(self) -> None: + from markidocx.diagrams import reconstruct_diagram_md + + result = reconstruct_diagram_md("mermaid", "graph TD\nA-->B") + assert result.startswith("```mermaid") + assert "graph TD" in result + assert result.endswith("```") + + +# --------------------------------------------------------------------------- +# Builder: diagram blocks → source-only path (no renderer in test env) (FR-533) +# --------------------------------------------------------------------------- + + +class TestBuilderDiagrams: + def test_build_with_mermaid_block_succeeds(self, tmp_path: Path) -> None: + """Mermaid block builds without error (source-only path).""" + from markidocx.builder import build_document + from markidocx.manifest import load_manifest + + md = textwrap.dedent("""\ + # Document + + ```mermaid + graph TD + A --> B --> C + ``` + + Some text. 
+ """) + _make_project(tmp_path, md) + m = load_manifest(tmp_path / "manifest.yaml") + result = build_document(m) + assert result.success + + def test_build_emits_warning_for_unavailable_renderer( + self, tmp_path: Path, monkeypatch + ) -> None: + """Warns about missing diagram renderer (FR-538).""" + import shutil + + from markidocx.builder import build_document + from markidocx.manifest import load_manifest + + monkeypatch.setattr(shutil, "which", lambda _cmd: None) + md = "```mermaid\ngraph TD\nA-->B\n```" + _make_project(tmp_path, md) + m = load_manifest(tmp_path / "manifest.yaml") + result = build_document(m) + assert result.success + dep_warnings = [ + w for w in result.warning_records + if w.reason == "processor-dependency-unavailable" + ] + assert dep_warnings + + def test_build_docx_contains_source_marker( + self, tmp_path: Path, monkeypatch + ) -> None: + """DOCX contains diagram-source marker for round-trip.""" + import shutil + + from docx import Document as DocxReader + + from markidocx.builder import build_document + from markidocx.manifest import load_manifest + + monkeypatch.setattr(shutil, "which", lambda _cmd: None) + md = "```mermaid\ngraph TD\nA-->B\n```" + _make_project(tmp_path, md) + m = load_manifest(tmp_path / "manifest.yaml") + result = build_document(m) + assert result.success + + doc = DocxReader(str(result.output_path)) + texts = [p.text for p in doc.paragraphs] + marker_texts = [t for t in texts if t.startswith("diagram-source:")] + assert marker_texts, f"No diagram-source marker found. 
Paragraphs: {texts}" + + def test_build_graphviz_block_succeeds(self, tmp_path: Path) -> None: + from markidocx.builder import build_document + from markidocx.manifest import load_manifest + + md = "```graphviz\ndigraph G { A -> B }\n```" + _make_project(tmp_path, md) + m = load_manifest(tmp_path / "manifest.yaml") + result = build_document(m) + assert result.success + + def test_non_diagram_code_block_not_warned( + self, tmp_path: Path + ) -> None: + """Python code blocks don't trigger diagram warnings.""" + from markidocx.builder import build_document + from markidocx.manifest import load_manifest + + md = "```python\nprint('hello')\n```" + _make_project(tmp_path, md) + m = load_manifest(tmp_path / "manifest.yaml") + result = build_document(m) + dep_warnings = [ + w for w in result.warning_records + if w.reason == "processor-dependency-unavailable" + ] + # Only level3 diagram types trigger this warning, not python + # (may still warn for mmdc/dot if level3 partial check fires, but not for python block) + python_warnings = [w for w in dep_warnings if "python" in w.construct] + assert not python_warnings + + +# --------------------------------------------------------------------------- +# Importer: diagram source-intent marker → fenced block (FR-534) +# --------------------------------------------------------------------------- + + +class TestImporterDiagrams: + def test_roundtrip_source_only_path(self, tmp_path: Path, monkeypatch) -> None: + """Source-only round-trip: diagram source is preserved in reimported MD.""" + import shutil + + from markidocx.builder import build_document + from markidocx.importer import import_document + from markidocx.manifest import load_manifest + + monkeypatch.setattr(shutil, "which", lambda _cmd: None) + diagram_source = "graph TD\nA --> B --> C" + md = f"# Document\n\n```mermaid\n{diagram_source}\n```\n\nText." 
+ _make_project(tmp_path, md) + m = load_manifest(tmp_path / "manifest.yaml") + + build_result = build_document(m) + assert build_result.success + + import_result = import_document(m, build_result.output_path) + assert import_result.success + + reimported = import_result.output_files[0].read_text(encoding="utf-8") + assert "mermaid" in reimported + assert "graph TD" in reimported + + def test_no_source_discarded(self, tmp_path: Path, monkeypatch) -> None: + """Diagram source is never silently dropped (FR-1205).""" + import shutil + + from markidocx.builder import build_document + from markidocx.importer import import_document + from markidocx.manifest import load_manifest + + monkeypatch.setattr(shutil, "which", lambda _cmd: None) + md = "```plantuml\n@startuml\nAlice -> Bob: Hi\n@enduml\n```" + _make_project(tmp_path, md) + m = load_manifest(tmp_path / "manifest.yaml") + + build_result = build_document(m) + assert build_result.success + + import_result = import_document(m, build_result.output_path) + assert import_result.success + + reimported = import_result.output_files[0].read_text(encoding="utf-8") + # Source content must be present somewhere in the reimported text + assert "plantuml" in reimported or "@startuml" in reimported diff --git a/tests/test_level3_figures.py b/tests/test_level3_figures.py new file mode 100644 index 0000000..eff82cf --- /dev/null +++ b/tests/test_level3_figures.py @@ -0,0 +1,342 @@ +"""Tests for LEVEL3 numbered figure support (FR-532, FR-541).""" + +from __future__ import annotations + +import textwrap +from pathlib import Path + +LEVEL3_MANIFEST = textwrap.dedent("""\ + project: + name: fig-test + feature_level: level3 + family: article + sources: + - path: doc.md + output: + dir: ./dist +""") + + +def _make_project(tmp_path: Path, markdown: str) -> Path: + (tmp_path / "doc.md").write_text(markdown, encoding="utf-8") + (tmp_path / "manifest.yaml").write_text(LEVEL3_MANIFEST, encoding="utf-8") + return tmp_path + + +# 
--------------------------------------------------------------------------- +# figures module helpers +# --------------------------------------------------------------------------- + + +class TestFigureHelpers: + def test_is_figure_paragraph_true(self) -> None: + from markidocx.figures import is_figure_paragraph + + assert is_figure_paragraph("![My Caption](img/photo.png){#fig:photo}") + + def test_is_figure_paragraph_false(self) -> None: + from markidocx.figures import is_figure_paragraph + + assert not is_figure_paragraph("Normal paragraph text.") + assert not is_figure_paragraph("![alt](img.png)") # no {#fig:} label + + def test_parse_figure(self) -> None: + from markidocx.figures import parse_figure + + result = parse_figure("![Architecture Diagram](arch.png){#fig:arch}") + assert result is not None + caption, path, label = result + assert caption == "Architecture Diagram" + assert path == "arch.png" + assert label == "fig:arch" + + def test_extract_figures_from_md(self) -> None: + from markidocx.figures import extract_figures_from_md + + md = textwrap.dedent("""\ + # Title + + Some text. + + ![Figure One](fig1.png){#fig:f1} + + More text. 
+ + ![Figure Two](fig2.png){#fig:f2} + """) + figs = extract_figures_from_md(md) + assert len(figs) == 2 + assert figs[0] == ("Figure One", "fig1.png", "fig:f1") + assert figs[1] == ("Figure Two", "fig2.png", "fig:f2") + + def test_extract_figure_labels(self) -> None: + from markidocx.figures import extract_figure_labels + + md = "![Cap1](a.png){#fig:f1}\n\n![Cap2](b.png){#fig:f2}" + labels = extract_figure_labels(md) + assert labels == {"fig:f1", "fig:f2"} + + def test_is_caption_paragraph(self) -> None: + from markidocx.figures import is_caption_paragraph + + assert is_caption_paragraph("Figure 1 — My Caption") + assert is_caption_paragraph("Figure 3 - Another Caption") + assert not is_caption_paragraph("Some normal text") + + def test_reconstruct_figure_md(self) -> None: + from markidocx.figures import reconstruct_figure_md + + result = reconstruct_figure_md("My Caption", "img/photo.png", "fig:photo") + assert result == "![My Caption](img/photo.png){#fig:photo}" + + +# --------------------------------------------------------------------------- +# Builder: figure declaration → DOCX caption paragraph (FR-532) +# --------------------------------------------------------------------------- + + +class TestBuilderFigures: + def test_build_with_figure_succeeds(self, tmp_path: Path) -> None: + from markidocx.builder import build_document + from markidocx.manifest import load_manifest + + md = textwrap.dedent("""\ + # Document {#doc} + + Introduction. + + ![Architecture Diagram](arch.png){#fig:arch} + + More text. 
+ """) + _make_project(tmp_path, md) + m = load_manifest(tmp_path / "manifest.yaml") + result = build_document(m) + assert result.success + assert result.output_path.exists() + + def test_build_docx_contains_figure_caption(self, tmp_path: Path) -> None: + """The built DOCX should contain a caption paragraph with 'Figure 1'.""" + from docx import Document as DocxReader + + from markidocx.builder import build_document + from markidocx.manifest import load_manifest + + md = "![My Diagram](diag.png){#fig:diag}\n\nSome text." + _make_project(tmp_path, md) + m = load_manifest(tmp_path / "manifest.yaml") + result = build_document(m) + assert result.success + + doc = DocxReader(str(result.output_path)) + texts = [p.text for p in doc.paragraphs] + caption_paras = [t for t in texts if t.startswith("Figure 1")] + assert caption_paras, f"No 'Figure 1' caption found. Paragraphs: {texts}" + + def test_multiple_figures_numbered_sequentially(self, tmp_path: Path) -> None: + """Multiple figures get Figure 1, Figure 2, Figure 3.""" + from docx import Document as DocxReader + + from markidocx.builder import build_document + from markidocx.manifest import load_manifest + + md = textwrap.dedent("""\ + # Doc + + ![First](a.png){#fig:a} + + Some text. + + ![Second](b.png){#fig:b} + + More text. 
+ + ![Third](c.png){#fig:c} + """) + _make_project(tmp_path, md) + m = load_manifest(tmp_path / "manifest.yaml") + result = build_document(m) + assert result.success + + doc = DocxReader(str(result.output_path)) + texts = [p.text for p in doc.paragraphs] + assert any("Figure 1" in t for t in texts) + assert any("Figure 2" in t for t in texts) + assert any("Figure 3" in t for t in texts) + + def test_figure_not_activated_for_level1(self, tmp_path: Path) -> None: + """LEVEL1: figure syntax is not stripped (no caption paragraphs added).""" + from docx import Document as DocxReader + + from markidocx.builder import build_document + from markidocx.manifest import load_manifest + + manifest_yaml = textwrap.dedent("""\ + project: + name: l1-fig + feature_level: level1 + family: article + sources: + - path: doc.md + output: + dir: ./dist + """) + (tmp_path / "doc.md").write_text( + "# Title\n\n![My Diagram](diag.png){#fig:diag}", encoding="utf-8" + ) + (tmp_path / "manifest.yaml").write_text(manifest_yaml, encoding="utf-8") + m = load_manifest(tmp_path / "manifest.yaml") + result = build_document(m) + assert result.success + doc = DocxReader(str(result.output_path)) + texts = [p.text for p in doc.paragraphs] + # No "Figure N" captions in LEVEL1 output + assert not any(t.startswith("Figure ") for t in texts) + + +# --------------------------------------------------------------------------- +# Importer: DOCX caption paragraphs → figure markdown (FR-532) +# --------------------------------------------------------------------------- + + +class TestImporterFigures: + def test_roundtrip_preserves_figure_caption(self, tmp_path: Path) -> None: + from markidocx.builder import build_document + from markidocx.importer import import_document + from markidocx.manifest import load_manifest + + md = "# Title\n\n![Architecture](arch.png){#fig:arch}\n\nText." 
+ _make_project(tmp_path, md) + m = load_manifest(tmp_path / "manifest.yaml") + + build_result = build_document(m) + assert build_result.success + + import_result = import_document(m, build_result.output_path) + assert import_result.success + + reimported = import_result.output_files[0].read_text(encoding="utf-8") + assert "Architecture" in reimported + assert "fig:arch" in reimported + + def test_roundtrip_preserves_figure_label(self, tmp_path: Path) -> None: + from markidocx.builder import build_document + from markidocx.importer import import_document + from markidocx.manifest import load_manifest + + md = "![Cap](img.png){#fig:myimg}\n\nText." + _make_project(tmp_path, md) + m = load_manifest(tmp_path / "manifest.yaml") + + build_result = build_document(m) + assert build_result.success + + import_result = import_document(m, build_result.output_path) + assert import_result.success + + reimported = import_result.output_files[0].read_text(encoding="utf-8") + assert "{#fig:myimg}" in reimported + + +# --------------------------------------------------------------------------- +# Differ: figure identity coherence (FR-541) +# --------------------------------------------------------------------------- + + +class TestDifferFigures: + def test_preserved_figure_label(self) -> None: + from markidocx.differ import compare + + text = "# Title\n\n![Cap](img.png){#fig:img}\n\nText." + report = compare(text, text) + assert any("figure-label:fig:img" in p for p in report.preserved) + + def test_missing_figure_label_broken(self) -> None: + from markidocx.differ import compare + + original = "![Cap](img.png){#fig:img}\n\nText." + reimported = "Text." 
+ report = compare(original, reimported) + assert any("figure-label:missing 'fig:img'" in b for b in report.broken) + assert report.has_drift + + def test_missing_caption_degraded(self) -> None: + from markidocx.differ import compare + + original = "![My Caption](img.png){#fig:img}" + reimported = "![Different Caption](img.png){#fig:img}" + report = compare(original, reimported) + assert any("figure-caption" in d for d in report.degraded) + + def test_preserved_caption(self) -> None: + from markidocx.differ import compare + + text = "![Same Caption](img.png){#fig:img}" + report = compare(text, text) + assert any("figure-caption" in p for p in report.preserved) + + +# --------------------------------------------------------------------------- +# Full figure round-trip +# --------------------------------------------------------------------------- + + +class TestFigureRoundTrip: + def test_single_figure_roundtrip(self, tmp_path: Path) -> None: + from markidocx.builder import build_document + from markidocx.differ import compare + from markidocx.importer import import_document + from markidocx.manifest import load_manifest + + md = textwrap.dedent("""\ + # Document + + Introduction. + + ![System Architecture](arch.png){#fig:arch} + + Conclusion. 
+ """) + _make_project(tmp_path, md) + m = load_manifest(tmp_path / "manifest.yaml") + + build_result = build_document(m) + assert build_result.success + + import_result = import_document(m, build_result.output_path) + assert import_result.success + + reimported = import_result.output_files[0].read_text(encoding="utf-8") + report = compare(md, reimported) + + # No broken figures + broken_figs = [b for b in report.broken if "figure" in b] + assert not broken_figs, f"Broken figures found: {broken_figs}" + + def test_multiple_figures_identity_coherent(self, tmp_path: Path) -> None: + """Multiple figures survive round-trip with correct labels.""" + from markidocx.builder import build_document + from markidocx.importer import import_document + from markidocx.manifest import load_manifest + + md = textwrap.dedent("""\ + # Doc + + ![Figure One Caption](fig1.png){#fig:f1} + + Text between figures. + + ![Figure Two Caption](fig2.png){#fig:f2} + """) + _make_project(tmp_path, md) + m = load_manifest(tmp_path / "manifest.yaml") + + build_result = build_document(m) + assert build_result.success + + import_result = import_document(m, build_result.output_path) + assert import_result.success + + reimported = import_result.output_files[0].read_text(encoding="utf-8") + assert "{#fig:f1}" in reimported + assert "{#fig:f2}" in reimported diff --git a/tests/test_level3_plumbing.py b/tests/test_level3_plumbing.py new file mode 100644 index 0000000..3791281 --- /dev/null +++ b/tests/test_level3_plumbing.py @@ -0,0 +1,271 @@ +"""Tests for LEVEL3 plumbing — feature-level gating & disclosure (FR-537–539).""" + +from __future__ import annotations + +import textwrap +from pathlib import Path + +from markidocx.level3 import ( + Level3Support, + ProcessorDependency, + capabilities_entry, + check_level3_support, +) +from markidocx.manifest import FeatureLevel, load_manifest + +# --------------------------------------------------------------------------- +# Level3 support detection (FR-537, 
FR-538) +# --------------------------------------------------------------------------- + + +class TestCheckLevel3Support: + def test_returns_level3_support(self) -> None: + support = check_level3_support() + assert isinstance(support, Level3Support) + + def test_always_available(self) -> None: + support = check_level3_support() + assert support.available is True + + def test_dependencies_are_processor_dependency_instances(self) -> None: + support = check_level3_support() + for dep in support.dependencies: + assert isinstance(dep, ProcessorDependency) + assert dep.name in ("mmdc", "dot", "plantuml") + assert isinstance(dep.available, bool) + assert dep.description + + def test_partial_when_no_diagram_tools(self, monkeypatch) -> None: + """When no diagram tool is found, partial=True and missing_coverage is populated.""" + import shutil + + monkeypatch.setattr(shutil, "which", lambda _cmd: None) + support = check_level3_support() + assert support.partial is True + assert len(support.missing_coverage) > 0 + assert any("diagram" in m for m in support.missing_coverage) + + def test_not_partial_when_diagram_tool_present(self, monkeypatch) -> None: + """When at least one diagram tool is found, partial=False.""" + import shutil + + def fake_which(cmd: str) -> str | None: + return "/usr/bin/mmdc" if cmd == "mmdc" else None + + monkeypatch.setattr(shutil, "which", fake_which) + support = check_level3_support() + assert support.partial is False + assert support.missing_coverage == [] + + +# --------------------------------------------------------------------------- +# capabilities_entry (FR-537) +# --------------------------------------------------------------------------- + + +class TestCapabilitiesEntry: + def test_returns_dict_with_level(self) -> None: + entry = capabilities_entry() + assert entry["level"] == "level3" + + def test_available_is_true(self) -> None: + entry = capabilities_entry() + assert entry["available"] is True + + def test_has_dependencies_list(self) -> 
None: + entry = capabilities_entry() + assert isinstance(entry["dependencies"], list) + for dep in entry["dependencies"]: + assert "name" in dep + assert "available" in dep + assert "description" in dep + + def test_has_partial_and_missing_coverage(self) -> None: + entry = capabilities_entry() + assert "partial" in entry + assert "missing_coverage" in entry + + +# --------------------------------------------------------------------------- +# Manifest accepts feature_level: level3 (FR-537) +# --------------------------------------------------------------------------- + + +class TestManifestLevel3: + def test_level3_accepted(self, tmp_path: Path) -> None: + (tmp_path / "doc.md").write_text("# Hello", encoding="utf-8") + (tmp_path / "manifest.yaml").write_text( + textwrap.dedent("""\ + project: + name: test + feature_level: level3 + family: article + sources: + - path: doc.md + output: + dir: ./dist + """), + encoding="utf-8", + ) + m = load_manifest(tmp_path / "manifest.yaml") + assert m.project.feature_level == FeatureLevel.LEVEL3 + + def test_level3_routes_to_level3_processing(self, tmp_path: Path) -> None: + """Building with feature_level: level3 succeeds (processing path reached).""" + from markidocx.builder import build_document + + (tmp_path / "doc.md").write_text("# Hello\n\nContent.", encoding="utf-8") + (tmp_path / "manifest.yaml").write_text( + textwrap.dedent("""\ + project: + name: test-l3 + feature_level: level3 + family: article + sources: + - path: doc.md + output: + dir: ./dist + """), + encoding="utf-8", + ) + m = load_manifest(tmp_path / "manifest.yaml") + result = build_document(m) + assert result.success + assert result.feature_level == "level3" + + +# --------------------------------------------------------------------------- +# partial_level3 flag and processor-dependency disclosure (FR-538, FR-539) +# --------------------------------------------------------------------------- + + +class TestPartialLevel3Flag: + def 
test_partial_level3_set_when_no_diagram_tools( + self, tmp_path: Path, monkeypatch + ) -> None: + import shutil + + from markidocx.builder import build_document + + monkeypatch.setattr(shutil, "which", lambda _cmd: None) + (tmp_path / "doc.md").write_text("# Hello\n\nContent.", encoding="utf-8") + (tmp_path / "manifest.yaml").write_text( + textwrap.dedent("""\ + project: + name: test-partial + feature_level: level3 + family: article + sources: + - path: doc.md + output: + dir: ./dist + """), + encoding="utf-8", + ) + m = load_manifest(tmp_path / "manifest.yaml") + result = build_document(m) + assert result.success + assert result.partial_level3 is True + assert len(result.missing_coverage) > 0 + + def test_partial_level3_false_for_level1(self, tmp_path: Path) -> None: + from markidocx.builder import build_document + + (tmp_path / "doc.md").write_text("# Hello\n\nContent.", encoding="utf-8") + (tmp_path / "manifest.yaml").write_text( + textwrap.dedent("""\ + project: + name: test-l1 + feature_level: level1 + family: article + sources: + - path: doc.md + output: + dir: ./dist + """), + encoding="utf-8", + ) + m = load_manifest(tmp_path / "manifest.yaml") + result = build_document(m) + assert result.partial_level3 is False + assert result.missing_coverage == [] + + def test_dependency_warning_emitted_for_unavailable_tool( + self, tmp_path: Path, monkeypatch + ) -> None: + import shutil + + from markidocx.builder import build_document + from markidocx.errors import Severity + + monkeypatch.setattr(shutil, "which", lambda _cmd: None) + (tmp_path / "doc.md").write_text("# Hello", encoding="utf-8") + (tmp_path / "manifest.yaml").write_text( + textwrap.dedent("""\ + project: + name: t + feature_level: level3 + family: article + sources: + - path: doc.md + output: + dir: ./dist + """), + encoding="utf-8", + ) + m = load_manifest(tmp_path / "manifest.yaml") + result = build_document(m) + dep_warnings = [ + w for w in result.warning_records + if w.reason == 
"processor-dependency-unavailable" + ] + assert dep_warnings, "Expected processor-dependency-unavailable warning" + assert all(w.severity == Severity.WARNING for w in dep_warnings) + + +# --------------------------------------------------------------------------- +# REST capabilities includes level3 (FR-537) +# --------------------------------------------------------------------------- + + +class TestRestCapabilitiesLevel3: + def test_capabilities_includes_level3(self) -> None: + from fastapi.testclient import TestClient + + from markidocx.rest import create_app + + client = TestClient(create_app()) + resp = client.get("/capabilities") + assert resp.status_code == 200 + body = resp.json() + outputs = body["outputs"] + assert "level3" in outputs + assert outputs["level3"]["level"] == "level3" + assert outputs["level3"]["available"] is True + assert "dependencies" in outputs["level3"] + + +# --------------------------------------------------------------------------- +# MCP validate_project includes level3 in context (FR-537) +# --------------------------------------------------------------------------- + + +class TestMcpLevel3: + def test_validate_project_includes_level3(self) -> None: + from markidocx.mcp_server import validate_project + + manifest_yaml = textwrap.dedent("""\ + project: + name: test + feature_level: level3 + family: article + sources: + - path: doc.md + output: + dir: ./dist + """) + result = validate_project(manifest_yaml) + assert result["status"] == "ok" + assert result["feature_level"] == "level3" + assert "level3" in result["context"] + assert result["context"]["level3"]["available"] is True diff --git a/tests/test_level3_xref.py b/tests/test_level3_xref.py new file mode 100644 index 0000000..a322f42 --- /dev/null +++ b/tests/test_level3_xref.py @@ -0,0 +1,326 @@ +"""Tests for LEVEL3 cross-reference support (FR-531, FR-540).""" + +from __future__ import annotations + +import textwrap +from pathlib import Path + +# 
--------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +LEVEL3_MANIFEST = textwrap.dedent("""\ + project: + name: xref-test + feature_level: level3 + family: article + sources: + - path: doc.md + output: + dir: ./dist +""") + + +def _make_project(tmp_path: Path, markdown: str, manifest_yaml: str = LEVEL3_MANIFEST) -> Path: + (tmp_path / "doc.md").write_text(markdown, encoding="utf-8") + (tmp_path / "manifest.yaml").write_text(manifest_yaml, encoding="utf-8") + return tmp_path + + +# --------------------------------------------------------------------------- +# xref module helpers +# --------------------------------------------------------------------------- + + +class TestXrefHelpers: + def test_extract_anchor_from_heading_plain(self) -> None: + from markidocx.xref import extract_anchor_from_heading + + clean, anchor = extract_anchor_from_heading("Introduction {#intro}") + assert clean == "Introduction" + assert anchor == "intro" + + def test_extract_anchor_from_heading_no_anchor(self) -> None: + from markidocx.xref import extract_anchor_from_heading + + clean, anchor = extract_anchor_from_heading("Introduction") + assert clean == "Introduction" + assert anchor is None + + def test_extract_anchors_from_text(self) -> None: + from markidocx.xref import extract_anchors + + text = "# Section {#sec1}\n\n## Subsection {#sec2}\n\nNormal." + anchors = extract_anchors(text) + assert anchors == {"sec1", "sec2"} + + def test_extract_xref_links(self) -> None: + from markidocx.xref import extract_xref_links + + text = "See [Section One][sec1] and [Section Two][sec2]." 
+ links = extract_xref_links(text) + assert ("Section One", "sec1") in links + assert ("Section Two", "sec2") in links + + def test_has_xref_links_true(self) -> None: + from markidocx.xref import has_xref_links + + assert has_xref_links("See [Intro][intro] for details.") + + def test_has_xref_links_false(self) -> None: + from markidocx.xref import has_xref_links + + assert not has_xref_links("Normal paragraph without refs.") + + +# --------------------------------------------------------------------------- +# Builder: headings with anchors → DOCX bookmarks (FR-531) +# --------------------------------------------------------------------------- + + +class TestBuilderXref: + def test_build_with_anchor_succeeds(self, tmp_path: Path) -> None: + from markidocx.builder import build_document + from markidocx.manifest import load_manifest + + md = "# Introduction {#intro}\n\nSome text.\n\n## Section One {#sec1}\n\nContent." + _make_project(tmp_path, md) + m = load_manifest(tmp_path / "manifest.yaml") + result = build_document(m) + assert result.success + assert result.output_path.exists() + + def test_build_docx_contains_bookmark(self, tmp_path: Path) -> None: + """The built DOCX XML should contain a bookmarkStart for {#intro}.""" + from docx import Document as DocxReader + + from markidocx.builder import build_document + from markidocx.manifest import load_manifest + + md = "# Introduction {#intro}\n\nContent." 
+ _make_project(tmp_path, md) + m = load_manifest(tmp_path / "manifest.yaml") + result = build_document(m) + assert result.success + + doc = DocxReader(str(result.output_path)) + _W = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + bookmarks = [ + elem.get(f"{{{_W}}}name") + for elem in doc.element.body.iter(f"{{{_W}}}bookmarkStart") + if elem.get(f"{{{_W}}}name") and not elem.get(f"{{{_W}}}name", "").startswith("_") + ] + assert "intro" in bookmarks + + def test_build_with_cross_ref_link(self, tmp_path: Path) -> None: + """Cross-ref links [text][anchor] render without errors.""" + from markidocx.builder import build_document + from markidocx.manifest import load_manifest + + md = textwrap.dedent("""\ + # Introduction {#intro} + + Some text. + + # Methodology {#method} + + See [Introduction][intro] for background. + """) + _make_project(tmp_path, md) + m = load_manifest(tmp_path / "manifest.yaml") + result = build_document(m) + assert result.success + assert result.output_path.exists() + + def test_build_xref_not_activated_for_level1(self, tmp_path: Path) -> None: + """Level1 build: {#anchor} syntax is treated as literal heading text.""" + from markidocx.builder import build_document + from markidocx.manifest import load_manifest + + manifest_yaml = textwrap.dedent("""\ + project: + name: l1-test + feature_level: level1 + family: article + sources: + - path: doc.md + output: + dir: ./dist + """) + # In LEVEL1, {#anchor} is not stripped and no bookmark is added + md = "# Introduction {#intro}\n\nContent." 
+ _make_project(tmp_path, md, manifest_yaml) + m = load_manifest(tmp_path / "manifest.yaml") + result = build_document(m) + assert result.success + # No cross-ref warnings + xref_warnings = [w for w in result.warning_records if "xref" in w.reason.lower()] + assert not xref_warnings + + +# --------------------------------------------------------------------------- +# Importer: DOCX bookmarks → {#anchor} labels (FR-531) +# --------------------------------------------------------------------------- + + +class TestImporterXref: + def test_roundtrip_preserves_anchor(self, tmp_path: Path) -> None: + """Build LEVEL3 doc with {#anchor}, import back → heading has {#anchor}.""" + from markidocx.builder import build_document + from markidocx.importer import import_document + from markidocx.manifest import load_manifest + + md = "# Introduction {#intro}\n\nSome text." + _make_project(tmp_path, md) + m = load_manifest(tmp_path / "manifest.yaml") + + build_result = build_document(m) + assert build_result.success + + import_result = import_document(m, build_result.output_path) + assert import_result.success + + reimported = import_result.output_files[0].read_text(encoding="utf-8") + assert "{#intro}" in reimported + + def test_roundtrip_preserves_cross_ref_link(self, tmp_path: Path) -> None: + """Cross-ref link [text][anchor] survives a round trip.""" + from markidocx.builder import build_document + from markidocx.importer import import_document + from markidocx.manifest import load_manifest + + md = textwrap.dedent("""\ + # Introduction {#intro} + + Some intro text. + + # Methodology {#method} + + See [Introduction][intro] for background. 
+ """) + _make_project(tmp_path, md) + m = load_manifest(tmp_path / "manifest.yaml") + + build_result = build_document(m) + assert build_result.success + + import_result = import_document(m, build_result.output_path) + assert import_result.success + + reimported = import_result.output_files[0].read_text(encoding="utf-8") + assert "{#intro}" in reimported + assert "[Introduction][intro]" in reimported + + +# --------------------------------------------------------------------------- +# Differ: cross-ref detection (FR-540) +# --------------------------------------------------------------------------- + + +class TestDifferXref: + def test_preserved_anchor_reported(self) -> None: + from markidocx.differ import compare + + original = "# Introduction {#intro}\n\nText." + reimported = "# Introduction {#intro}\n\nText." + report = compare(original, reimported) + assert any("xref-anchor:intro" in p for p in report.preserved) + assert not any("xref-anchor" in b for b in report.broken) + + def test_missing_anchor_reported_as_broken(self) -> None: + from markidocx.differ import compare + + original = "# Introduction {#intro}\n\nText." + reimported = "# Introduction\n\nText." + report = compare(original, reimported) + assert any("xref-anchor:missing 'intro'" in b for b in report.broken) + assert report.has_drift + + def test_preserved_xref_link(self) -> None: + from markidocx.differ import compare + + text = "# Intro {#intro}\n\nSee [Intro][intro]." + report = compare(text, text) + assert any("xref-link" in p for p in report.preserved) + + def test_broken_xref_link_target_missing(self) -> None: + from markidocx.differ import compare + + original = "# Intro {#intro}\n\nSee [Intro][intro]." + reimported = "# Intro\n\nSee something." 
+ report = compare(original, reimported) + # anchor missing → broken xref link + broken_xref = [b for b in report.broken if "xref" in b] + assert broken_xref + + +# --------------------------------------------------------------------------- +# Full single-file xref round-trip +# --------------------------------------------------------------------------- + + +class TestXrefRoundTrip: + def test_single_file_xref_roundtrip(self, tmp_path: Path) -> None: + from markidocx.builder import build_document + from markidocx.differ import compare + from markidocx.importer import import_document + from markidocx.manifest import load_manifest + + md = textwrap.dedent("""\ + # Introduction {#intro} + + Welcome. + + # Background {#bg} + + See [Introduction][intro] and [Background][bg]. + """) + _make_project(tmp_path, md) + m = load_manifest(tmp_path / "manifest.yaml") + + build_result = build_document(m) + assert build_result.success + + import_result = import_document(m, build_result.output_path) + assert import_result.success + + reimported = import_result.output_files[0].read_text(encoding="utf-8") + report = compare(md, reimported) + + # No broken cross-refs + broken_xrefs = [b for b in report.broken if "xref" in b] + assert not broken_xrefs, f"Broken xrefs found: {broken_xrefs}" + + def test_multi_ref_document(self, tmp_path: Path) -> None: + """Document with multiple anchors and refs doesn't produce broken xrefs.""" + from markidocx.builder import build_document + from markidocx.importer import import_document + from markidocx.manifest import load_manifest + + md = textwrap.dedent("""\ + # Chapter One {#ch1} + + Opening. + + # Chapter Two {#ch2} + + See [Chapter One][ch1]. + + # Chapter Three {#ch3} + + Refers to [Chapter One][ch1] and [Chapter Two][ch2]. 
+ """) + _make_project(tmp_path, md) + m = load_manifest(tmp_path / "manifest.yaml") + + build_result = build_document(m) + assert build_result.success + + import_result = import_document(m, build_result.output_path) + assert import_result.success + + reimported = import_result.output_files[0].read_text(encoding="utf-8") + # All three anchors should be in reimported + assert "{#ch1}" in reimported + assert "{#ch2}" in reimported + assert "{#ch3}" in reimported diff --git a/workplans/MRKD-WP-0003-level3-advanced-features.md b/workplans/MRKD-WP-0003-level3-advanced-features.md index 8619675..d87c08a 100644 --- a/workplans/MRKD-WP-0003-level3-advanced-features.md +++ b/workplans/MRKD-WP-0003-level3-advanced-features.md @@ -3,7 +3,7 @@ id: MRKD-WP-0003 type: workplan domain: markitect repo: marki-docx -status: active +status: done state_hub_workstream_id: b04fe706-6e4e-48a8-b6c1-194d9e308215 created: 2026-03-17 updated: 2026-03-17 @@ -28,7 +28,7 @@ through the existing interface layer once the core modules support them. ```task id: MRKD-WP-0003-T01 -status: todo +status: done priority: high state_hub_task_id: 51e1b53e-a62f-496b-892d-615513c35d67 ``` @@ -53,7 +53,7 @@ Deliverable: `pytest tests/test_level3_plumbing.py` passes; level3 appears in ca ```task id: MRKD-WP-0003-T02 -status: todo +status: done priority: high state_hub_task_id: f4010618-9d35-4c04-bc1c-c599f254edff ``` @@ -81,7 +81,7 @@ Deliverable: `pytest tests/test_error_framework.py` passes; all modules emit str ```task id: MRKD-WP-0003-T03 -status: todo +status: done priority: high state_hub_task_id: 0bb9c7ce-5eb8-4997-833f-c801e37f282c ``` @@ -104,7 +104,7 @@ Deliverable: `pytest tests/test_level3_xref.py` passes. ```task id: MRKD-WP-0003-T04 -status: todo +status: done priority: high state_hub_task_id: af6b82b7-da44-4ef8-8976-6e40fee5f73c ``` @@ -128,7 +128,7 @@ Deliverable: `pytest tests/test_level3_figures.py` passes. 
```task id: MRKD-WP-0003-T05 -status: todo +status: done priority: medium state_hub_task_id: 3700e0e4-cc3b-4ef3-8b85-6cef24c35fc0 ``` @@ -151,7 +151,7 @@ Deliverable: `pytest tests/test_level3_diagrams.py` passes. ```task id: MRKD-WP-0003-T06 -status: todo +status: done priority: medium state_hub_task_id: 7c0acbd3-65f0-440b-9ad4-a5f09fabef3c ``` @@ -175,7 +175,7 @@ Deliverable: `pytest tests/test_level3_bibliography.py` passes. ```task id: MRKD-WP-0003-T07 -status: todo +status: done priority: medium state_hub_task_id: b26241b9-0fff-45a2-a95c-dd886a449038 ```