generated from coulomb/repo-seed
T01: markidocx inspect (FR-806) and markidocx test (FR-810) CLI commands
T02: markidocx evidence get/list CLI commands (FR-1409, FR-814)
T03: list_styles() / GET /styles / MCP list_styles with real style data (FR-907)
T04: Evidence assembly — EvidenceSet summary via REST and MCP (FR-1406–1408)
T05: LEVEL3 edge-case tests — diagram mutation, renderer version check,
bibliography duplicate keys / missing refs / special chars (FR-534, FR-538, FR-542)
T06: markidocx template extract + Word-first round-trip regression test (FR-606)
New: differ._compare_diagram_blocks tracks fenced diagram source drift (FR-534)
New: diagrams.check_renderer_version emits warning for outdated renderers (FR-538)
New: bibliography.validate_citations detects duplicate keys and missing entries (FR-542)
New: templates.extract_template / TemplateExtractionResult / list_styles / StyleEntry
New: REST POST /template/extract; MCP extract_template tool
278 tests pass, ruff+mypy clean.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
243 lines
8.1 KiB
Python
243 lines
8.1 KiB
Python
"""Structural drift detection for markidocx (FR-700)."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from dataclasses import dataclass, field
|
|
|
|
from markidocx.errors import OutputState
|
|
|
|
# Patterns for the structural elements inspected by compare().
# With MULTILINE, ^ and $ anchor at the start and end of every line.

# ATX heading: one to six '#' characters, whitespace, then the title text.
HEADING_RE = re.compile(r"^(#{1,6})\s+(.+)$", flags=re.MULTILINE)

# List item: a '-', '*' or '+' bullet, or an "N." marker, then the item text.
LIST_ITEM_RE = re.compile(r"^(\s*[-*+]|\s*\d+\.)\s+(.+)$", flags=re.MULTILINE)

# Pipe-table row: a line that both starts and ends with '|'.
TABLE_ROW_RE = re.compile(r"^\|.+\|$", flags=re.MULTILINE)

# Footnote marker such as [^1] or [^note].
FOOTNOTE_RE = re.compile(r"\[\^[^\]]+\]")

# Inline link of the form [text](target).
LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")
|
|
|
|
|
|
@dataclass
class DriftReport:
    """Result of a structural comparison, as produced by ``compare()``.

    The four lists hold short text entries describing individual structural
    elements (for example ``"tables:2"`` or ``"footnote:[^1]"``).
    """

    # compare() sets this to True when `degraded` or `broken` is non-empty.
    has_drift: bool
    # Elements found identical in both documents.
    preserved: list[str] = field(default_factory=list)
    # Elements still present after the round-trip, but modified.
    degraded: list[str] = field(default_factory=list)
    # Elements present in the original and missing after re-import.
    broken: list[str] = field(default_factory=list)
    # Constructs the round-trip does not support.
    unsupported: list[str] = field(default_factory=list)
    # Overall classification; a freshly built report defaults to FINAL.
    output_state: OutputState = OutputState.FINAL
|
|
|
|
|
|
def compare(original: str, reimported: str) -> DriftReport:
    """Compare *original* Markdown against *reimported* Markdown.

    Classifies each structural element as:

    - preserved: identical in both
    - degraded: present but modified
    - broken: present in original, missing in reimported
    - unsupported: construct not supported by the round-trip

    Footnote and link entries are emitted in order of first appearance in
    *original*, so two runs over the same input yield the same report.

    Args:
        original: Markdown source before the round-trip.
        reimported: Markdown obtained after re-importing the converted output.

    Returns:
        A DriftReport. ``has_drift`` is True when anything is degraded or
        broken; ``output_state`` is PARTIAL if anything is broken, DEGRADED
        if something is only degraded, and FINAL otherwise.
    """
    from markidocx.bibliography import compare_citations

    preserved: list[str] = []
    degraded: list[str] = []
    broken: list[str] = []
    # No check below populates this list yet; it is still passed through so
    # the report always carries all four categories.
    unsupported: list[str] = []

    # --- Headings (FR-501) ---
    _compare_sets(
        "heading",
        _extract_headings(original),
        _extract_headings(reimported),
        preserved,
        degraded,
        broken,
    )

    # --- Lists (FR-502) ---
    _compare_sets(
        "list_item",
        _extract_list_items(original),
        _extract_list_items(reimported),
        preserved,
        degraded,
        broken,
    )

    # --- Tables (FR-503) ---
    # Only the number of tables is compared, not their content.
    orig_tables = _count_tables(original)
    reim_tables = _count_tables(reimported)
    if orig_tables == reim_tables:
        if orig_tables > 0:
            preserved.append(f"tables:{orig_tables}")
    elif reim_tables < orig_tables:
        broken.append(f"tables:missing {orig_tables - reim_tables} of {orig_tables}")
    else:
        degraded.append(f"tables:count changed {orig_tables}→{reim_tables}")

    # --- Footnotes (FR-504) ---
    # dict.fromkeys de-duplicates while keeping document order; iterating a
    # set of strings here would reorder entries from run to run.
    reim_footnotes = set(FOOTNOTE_RE.findall(reimported))
    for marker in dict.fromkeys(FOOTNOTE_RE.findall(original)):
        if marker in reim_footnotes:
            preserved.append(f"footnote:{marker}")
        else:
            broken.append(f"footnote:{marker}")

    # --- Links (FR-506) ---
    # A link counts as preserved only if its full "[text](target)" form
    # reappears verbatim; entries are truncated to 40 characters.
    reim_links = {m.group(0) for m in LINK_RE.finditer(reimported)}
    for link in dict.fromkeys(m.group(0) for m in LINK_RE.finditer(original)):
        if link in reim_links:
            preserved.append(f"link:{link[:40]}")
        else:
            degraded.append(f"link:lost {link[:40]}")

    # --- Cross-references (FR-531, FR-540) ---
    _compare_xrefs(original, reimported, preserved, degraded, broken)

    # --- Figures (FR-532, FR-541) ---
    _compare_figures(original, reimported, preserved, degraded, broken)

    # --- Diagram source blocks (FR-534) ---
    _compare_diagram_blocks(original, reimported, preserved, degraded, broken)

    # --- Citations & Bibliography (FR-535, FR-542) ---
    compare_citations(original, reimported, preserved, degraded, broken)

    has_drift = bool(degraded or broken)
    if broken:
        output_state = OutputState.PARTIAL
    elif degraded:
        output_state = OutputState.DEGRADED
    else:
        output_state = OutputState.FINAL

    return DriftReport(
        has_drift=has_drift,
        preserved=preserved,
        degraded=degraded,
        broken=broken,
        unsupported=unsupported,
        output_state=output_state,
    )
|
|
|
|
|
|
def _extract_headings(text: str) -> list[str]:
    """Return each heading matched by HEADING_RE as ``"<hashes> <title>"``.

    The title is stripped of surrounding whitespace; document order is kept.
    """
    return [
        f"{'#' * len(hashes)} {title.strip()}"
        for hashes, title in HEADING_RE.findall(text)
    ]
|
|
|
|
|
|
def _extract_list_items(text: str) -> list[str]:
    """Return the stripped text of every list item matched by LIST_ITEM_RE.

    The bullet or number marker is dropped; document order is kept.
    """
    return [body.strip() for _marker, body in LIST_ITEM_RE.findall(text)]
|
|
|
|
|
|
def _count_tables(text: str) -> int:
|
|
rows = TABLE_ROW_RE.findall(text)
|
|
if not rows:
|
|
return 0
|
|
# Count separator rows as table boundaries
|
|
sep_re = re.compile(r"^\|[-| :]+\|$")
|
|
count = sum(1 for r in rows if sep_re.match(r))
|
|
return count
|
|
|
|
|
|
def _compare_figures(
    original: str,
    reimported: str,
    preserved: list[str],
    degraded: list[str],
    broken: list[str],
) -> None:
    """Compare figure labels and captions (FR-532, FR-541).

    A label missing after re-import is recorded as broken; a caption that
    does not reappear verbatim is recorded as degraded. Results are appended
    to the three lists passed in; nothing is returned.

    Args:
        original: Markdown source before the round-trip.
        reimported: Markdown obtained after re-import.
        preserved: Receives entries for labels/captions found in both.
        degraded: Receives entries for lost captions.
        broken: Receives entries for missing labels.
    """
    from markidocx.figures import extract_figure_captions, extract_figure_labels

    orig_labels = extract_figure_labels(original)
    reim_labels = extract_figure_labels(reimported)
    for label in orig_labels:
        if label in reim_labels:
            preserved.append(f"figure-label:{label}")
        else:
            broken.append(f"figure-label:missing '{label}'")

    orig_captions = extract_figure_captions(original)
    reim_captions = set(extract_figure_captions(reimported))
    # dict.fromkeys drops duplicate captions but keeps the order in which the
    # extractor produced them; iterating a set of strings would reorder the
    # entries from run to run.
    for caption in dict.fromkeys(orig_captions):
        if caption in reim_captions:
            preserved.append(f"figure-caption:{caption[:40]}")
        else:
            degraded.append(f"figure-caption:lost '{caption[:40]}'")
|
|
|
|
|
|
def _compare_xrefs(
    original: str,
    reimported: str,
    preserved: list[str],
    degraded: list[str],
    broken: list[str],
) -> None:
    """Compare cross-reference anchors and links (FR-531, FR-540).

    Anchors absent after re-import are broken. A cross-reference link is
    preserved when the same (text, anchor) pair reappears, broken when its
    anchor no longer exists, and degraded otherwise. Results are appended to
    the lists passed in.
    """
    from markidocx.xref import extract_anchors, extract_xref_links

    source_anchors = extract_anchors(original)
    roundtrip_anchors = extract_anchors(reimported)
    for name in source_anchors:
        if name not in roundtrip_anchors:
            broken.append(f"xref-anchor:missing '{name}'")
            continue
        preserved.append(f"xref-anchor:{name}")

    source_links = extract_xref_links(original)
    roundtrip_links = extract_xref_links(reimported)
    for text, target in source_links:
        if (text, target) in roundtrip_links:
            preserved.append(f"xref-link:[{text}][{target}]")
            continue
        if target in roundtrip_anchors:
            # The target survived, so only the link itself changed.
            degraded.append(f"xref-link:degraded [{text}][{target}]")
        else:
            broken.append(f"xref-link:broken-target [{text}][{target}]")
|
|
|
|
|
|
_FENCED_BLOCK_RE = re.compile(r"```(\w+)\n(.*?)```", re.DOTALL)
|
|
|
|
|
|
def _extract_fenced_blocks(text: str) -> list[tuple[str, str]]:
|
|
"""Extract all fenced code blocks as (language, source) pairs."""
|
|
return [(m.group(1).strip().lower(), m.group(2).rstrip()) for m in _FENCED_BLOCK_RE.finditer(text)]
|
|
|
|
|
|
def _compare_diagram_blocks(
    original: str,
    reimported: str,
    preserved: list[str],
    degraded: list[str],
    broken: list[str],
) -> None:
    """Compare diagram fenced blocks for source-content drift (FR-534).

    Diagram blocks are paired by position. The i-th original block is
    preserved when the i-th re-imported block has the same language and
    source, degraded when it differs, and broken when there is no i-th
    re-imported block. Results are appended to the lists passed in.
    """
    from markidocx.diagrams import DIAGRAM_TYPES

    def _diagrams(text: str) -> list[tuple[str, str]]:
        # Keep only fenced blocks whose language tag is a known diagram type.
        return [block for block in _extract_fenced_blocks(text) if block[0] in DIAGRAM_TYPES]

    source_blocks = _diagrams(original)
    roundtrip_blocks = _diagrams(reimported)
    available = len(roundtrip_blocks)

    for index, block in enumerate(source_blocks):
        language = block[0]
        if index >= available:
            broken.append(f"diagram:{language}[{index}]:missing")
        elif block == roundtrip_blocks[index]:
            preserved.append(f"diagram:{language}[{index}]")
        else:
            degraded.append(f"diagram:{language}[{index}]:source-mutated")
|
|
|
|
|
|
def _compare_sets(
|
|
kind: str,
|
|
orig: list[str],
|
|
reim: list[str],
|
|
preserved: list[str],
|
|
degraded: list[str],
|
|
broken: list[str],
|
|
) -> None:
|
|
orig_counts: dict[str, int] = {}
|
|
for item in orig:
|
|
orig_counts[item] = orig_counts.get(item, 0) + 1
|
|
|
|
reim_counts: dict[str, int] = {}
|
|
for item in reim:
|
|
reim_counts[item] = reim_counts.get(item, 0) + 1
|
|
|
|
for item, count in orig_counts.items():
|
|
reim_count = reim_counts.get(item, 0)
|
|
if reim_count >= count:
|
|
preserved.append(f"{kind}:{item[:60]}")
|
|
elif reim_count > 0:
|
|
degraded.append(f"{kind}:partial '{item[:60]}' ({reim_count}/{count})")
|
|
else:
|
|
broken.append(f"{kind}:missing '{item[:60]}'")
|