Files
marki-docx/tests/regression/test_word_first_roundtrip.py
Bernd Worsch 9fe64bcd7f feat: WP-0007 — Interface Completeness & Evidence
T01: markidocx inspect (FR-806) and markidocx test (FR-810) CLI commands
T02: markidocx evidence get/list CLI commands (FR-1409, FR-814)
T03: list_styles() / GET /styles / MCP list_styles with real style data (FR-907)
T04: Evidence assembly — EvidenceSet summary via REST and MCP (FR-1406–1408)
T05: LEVEL3 edge-case tests — diagram mutation, renderer version check,
     bibliography duplicate keys / missing refs / special chars (FR-534, FR-538, FR-542)
T06: markidocx template extract + Word-first round-trip regression test (FR-606)

New: differ._compare_diagram_blocks tracks fenced diagram source drift (FR-534)
New: diagrams.check_renderer_version emits warning for outdated renderers (FR-538)
New: bibliography.validate_citations detects duplicate keys and missing entries (FR-542)
New: templates.extract_template / TemplateExtractionResult / list_styles / StyleEntry
New: REST POST /template/extract; MCP extract_template tool

278 tests pass, ruff+mypy clean.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-17 19:30:09 +00:00

237 lines
8.8 KiB
Python

"""T06 — End-to-end Word-first round-trip: template extraction and rebuild verification."""
from __future__ import annotations
import textwrap
from pathlib import Path
import pytest
# Path of the source.docx fixture, located relative to this test module.
FIXTURE_DOCX = Path(__file__).parent / "fixtures" / "word_first" / "source.docx"
@pytest.fixture(scope="module")
def source_docx() -> Path:
    """Provide the committed ``source.docx`` fixture, skipping when it is missing."""
    if FIXTURE_DOCX.exists():
        return FIXTURE_DOCX

    # pytest.skip raises, so nothing is returned when the fixture file is absent.
    missing_hint = (
        f"Fixture not found: {FIXTURE_DOCX}. "
        "Run tests/regression/fixtures/word_first/generate.py to create it."
    )
    pytest.skip(missing_hint)
# ---------------------------------------------------------------------------
# Template extraction tests
# ---------------------------------------------------------------------------
class TestTemplateExtraction:
    """Checks on ``extract_template`` run against the source.docx fixture."""

    def test_extract_produces_template_file(self, source_docx: Path, tmp_path: Path) -> None:
        """Extraction writes a non-empty file and reports that file's path."""
        from markidocx.templates import extract_template

        template_out = tmp_path / "template.docx"
        result = extract_template(source_docx, template_out)

        assert template_out.exists()
        assert template_out.stat().st_size > 0
        assert result.template_path == template_out

    def test_extracted_template_has_zero_body_paragraphs(
        self, source_docx: Path, tmp_path: Path
    ) -> None:
        """No paragraph in the extracted template carries visible text."""
        from docx import Document
        from markidocx.templates import extract_template

        template_out = tmp_path / "template.docx"
        extract_template(source_docx, template_out)

        doc = Document(str(template_out))
        # Empty paragraphs are tolerated: per the original note, one empty
        # paragraph is inserted for validity. Collect the text (not the
        # Paragraph objects) so a failure shows what was left behind.
        leftover_text = [p.text for p in doc.paragraphs if p.text.strip()]
        assert leftover_text == [], f"Expected no content paragraphs, found: {leftover_text}"

    def test_extracted_template_preserves_styles(
        self, source_docx: Path, tmp_path: Path
    ) -> None:
        """The template exposes the same set of style names as the source."""
        from docx import Document
        from markidocx.templates import extract_template

        template_out = tmp_path / "template.docx"
        result = extract_template(source_docx, template_out)

        # The reported count must be positive.
        assert result.styles_preserved > 0

        # Verify the styles are actually present in the written file.
        source_doc = Document(str(source_docx))
        template_doc = Document(str(template_out))
        source_styles = {s.name for s in source_doc.styles}
        template_styles = {s.name for s in template_doc.styles}
        assert source_styles == template_styles

    def test_extract_styles_preserved_count(
        self, source_docx: Path, tmp_path: Path
    ) -> None:
        """``styles_preserved`` equals the number of styles in the source document."""
        from docx import Document
        from markidocx.templates import extract_template

        template_out = tmp_path / "template.docx"
        result = extract_template(source_docx, template_out)

        source_doc = Document(str(source_docx))
        assert result.styles_preserved == len(list(source_doc.styles))

    def test_extraction_idempotent(self, source_docx: Path, tmp_path: Path) -> None:
        """Re-extracting from an extracted template yields the same style names."""
        from docx import Document
        from markidocx.templates import extract_template

        template_a = tmp_path / "template_a.docx"
        template_b = tmp_path / "template_b.docx"

        # Second extraction takes the first extraction's output as its input.
        extract_template(source_docx, template_a)
        extract_template(template_a, template_b)

        doc_a = Document(str(template_a))
        doc_b = Document(str(template_b))
        styles_a = {s.name for s in doc_a.styles}
        styles_b = {s.name for s in doc_b.styles}
        assert styles_a == styles_b
# ---------------------------------------------------------------------------
# Content extraction via import
# ---------------------------------------------------------------------------
class TestContentExtraction:
    """Checks the Markdown produced by importing the source.docx fixture."""

    def test_import_extracts_headings(self, source_docx: Path, tmp_path: Path) -> None:
        """Importing the fixture DOCX produces Markdown that mentions Introduction."""
        from markidocx.importer import import_document
        from markidocx.manifest import load_manifest

        # NOTE(review): manifest keys are kept at the nesting seen during review;
        # confirm the indentation against the manifest schema.
        manifest_text = textwrap.dedent("""\
            project:
            name: "word-first-test"
            feature_level: level1
            family: article
            sources:
            - path: doc.md
            output:
            dir: ./dist
            """)
        (tmp_path / "doc.md").write_text("", encoding="utf-8")
        (tmp_path / "manifest.yaml").write_text(manifest_text, encoding="utf-8")
        (tmp_path / "dist").mkdir()

        m = load_manifest(tmp_path / "manifest.yaml")
        result = import_document(m, source_docx)
        assert result.success
        # Fail with a readable message instead of an IndexError on the next line.
        assert result.output_files, "Import reported success but produced no output files"

        content_md = result.output_files[0].read_text(encoding="utf-8")
        # Case-insensitive substring check; it does not verify heading markup.
        assert "introduction" in content_md.lower()
# ---------------------------------------------------------------------------
# Full word-first round-trip
# ---------------------------------------------------------------------------
class TestWordFirstRoundTrip:
    """End-to-end check: extract, import, build, re-import, then compare for drift."""

    @staticmethod
    def _write_project(project_dir: Path, name: str, content: str = "") -> Path:
        """Create a minimal project in *project_dir* and return its manifest path.

        Creates the directory, writes ``content.md`` holding *content*, writes a
        ``manifest.yaml`` whose project name is *name*, and creates an empty
        ``dist`` directory.
        """
        # NOTE(review): manifest keys are kept at the nesting seen during review;
        # confirm the indentation against the manifest schema.
        manifest_text = textwrap.dedent(f"""\
            project:
            name: "{name}"
            feature_level: level1
            family: article
            sources:
            - path: content.md
            output:
            dir: ./dist
            """)
        project_dir.mkdir()
        (project_dir / "content.md").write_text(content, encoding="utf-8")
        manifest_path = project_dir / "manifest.yaml"
        manifest_path.write_text(manifest_text, encoding="utf-8")
        (project_dir / "dist").mkdir()
        return manifest_path

    def test_word_first_roundtrip(self, source_docx: Path, tmp_path: Path) -> None:
        """Extract template and content, rebuild, re-import, and check for drift.

        Only heading and citation entries in the comparison report fail the test.
        """
        from docx import Document
        from markidocx.builder import build_document
        from markidocx.differ import compare
        from markidocx.importer import import_document
        from markidocx.manifest import load_manifest
        from markidocx.templates import extract_template

        # Step 1: extract template
        template_out = tmp_path / "template.docx"
        extract_template(source_docx, template_out)
        assert template_out.exists()

        # Step 2: assert template has zero body content paragraphs
        template_doc = Document(str(template_out))
        # Compare paragraph text so a failure shows the leftover content.
        leftover_text = [p.text for p in template_doc.paragraphs if p.text.strip()]
        assert leftover_text == [], f"Template still has content: {leftover_text}"

        # Step 3: import source to get content.md
        import_manifest = self._write_project(tmp_path / "import_step", "word-first-import")
        m_import = load_manifest(import_manifest)
        import_result = import_document(m_import, source_docx)
        assert import_result.success
        # Fail with a readable message instead of an IndexError on the next line.
        assert import_result.output_files, "Import produced no output files"
        content_md = import_result.output_files[0].read_text(encoding="utf-8")
        assert content_md.strip(), "Extracted content must be non-empty"

        # Step 4: assert content.md has expected headings (deliberately lenient:
        # any one of these markers is accepted)
        assert "Introduction" in content_md or "Heading" in content_md or "#" in content_md

        # Step 5: build with content + template → rebuilt.docx
        # NOTE(review): template_out is never referenced by the build manifest, so
        # the rebuild does not appear to use the extracted template. Confirm
        # whether the manifest should point at it.
        build_manifest = self._write_project(
            tmp_path / "build_step", "word-first-build", content_md
        )
        m_build = load_manifest(build_manifest)
        build_result = build_document(m_build)
        assert build_result.success
        rebuilt_docx = Path(build_result.output_path)

        # Step 6: reimport rebuilt.docx → reimported.md
        # The re-import project reuses the import step's project name, as before.
        reimport_manifest = self._write_project(
            tmp_path / "reimport_step", "word-first-import"
        )
        m_reimport = load_manifest(reimport_manifest)
        reimport_result = import_document(m_reimport, rebuilt_docx)
        assert reimport_result.success
        assert reimport_result.output_files, "Re-import produced no output files"
        reimported_md = reimport_result.output_files[0].read_text(encoding="utf-8")

        # Step 7: assert no heading or citation structural drift
        report = compare(content_md, reimported_md)
        # Only fail on heading/citation drift (not table/misc artefacts from fixture)
        heading_broken = [b for b in report.broken if b.startswith("heading:")]
        citation_broken = [b for b in report.broken if b.startswith("citation:")]
        assert not heading_broken, f"Heading drift: {heading_broken}"
        assert not citation_broken, f"Citation drift: {citation_broken}"