generated from coulomb/repo-seed
T01: markidocx inspect (FR-806) and markidocx test (FR-810) CLI commands
T02: markidocx evidence get/list CLI commands (FR-1409, FR-814)
T03: list_styles() / GET /styles / MCP list_styles with real style data (FR-907)
T04: Evidence assembly — EvidenceSet summary via REST and MCP (FR-1406–1408)
T05: LEVEL3 edge-case tests — diagram mutation, renderer version check,
bibliography duplicate keys / missing refs / special chars (FR-534, FR-538, FR-542)
T06: markidocx template extract + Word-first round-trip regression test (FR-606)
New: differ._compare_diagram_blocks tracks fenced diagram source drift (FR-534)
New: diagrams.check_renderer_version emits warning for outdated renderers (FR-538)
New: bibliography.validate_citations detects duplicate keys and missing entries (FR-542)
New: templates.extract_template / TemplateExtractionResult / list_styles / StyleEntry
New: REST POST /template/extract; MCP extract_template tool
278 tests pass, ruff+mypy clean.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
237 lines
8.8 KiB
Python
237 lines
8.8 KiB
Python
"""T06 — End-to-end Word-first round-trip: template extraction and rebuild verification."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import textwrap
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
# Location of the Word-first source document fixture, resolved relative to this test file.
FIXTURE_DOCX = Path(__file__).parent / "fixtures" / "word_first" / "source.docx"
|
|
|
|
|
|
@pytest.fixture(scope="module")
def source_docx() -> Path:
    """Provide the path to the committed source.docx, skipping when it is absent."""
    if FIXTURE_DOCX.exists():
        return FIXTURE_DOCX

    # The fixture file is missing: skip rather than fail, and say how to create it.
    skip_reason = (
        f"Fixture not found: {FIXTURE_DOCX}. "
        "Run tests/regression/fixtures/word_first/generate.py to create it."
    )
    pytest.skip(skip_reason)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Template extraction tests
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestTemplateExtraction:
    """Tests for ``extract_template``: output file, empty body, and preserved styles."""

    def test_extract_produces_template_file(self, source_docx: Path, tmp_path: Path) -> None:
        """Extraction writes a non-empty file and reports its path on the result."""
        from markidocx.templates import extract_template

        template_out = tmp_path / "template.docx"
        result = extract_template(source_docx, template_out)

        assert template_out.exists()
        assert template_out.stat().st_size > 0
        assert result.template_path == template_out

    def test_extracted_template_has_zero_body_paragraphs(
        self, source_docx: Path, tmp_path: Path
    ) -> None:
        """The extracted template contains no paragraph with visible text."""
        from docx import Document

        from markidocx.templates import extract_template

        template_out = tmp_path / "template.docx"
        extract_template(source_docx, template_out)

        doc = Document(str(template_out))
        # Blank paragraphs are ignored — extraction is expected to leave one empty
        # paragraph in place so the document stays valid.
        leftover_text = [p.text for p in doc.paragraphs if p.text.strip()]
        # Report the paragraph text (not Paragraph objects) so a failure is readable.
        assert leftover_text == [], f"Expected no content paragraphs, found: {leftover_text}"

    def test_extracted_template_preserves_styles(
        self, source_docx: Path, tmp_path: Path
    ) -> None:
        """The template carries exactly the same style names as the source document."""
        from docx import Document

        from markidocx.templates import extract_template

        template_out = tmp_path / "template.docx"
        result = extract_template(source_docx, template_out)

        # The style count should be preserved
        assert result.styles_preserved > 0

        # Verify styles are actually in the output
        source_doc = Document(str(source_docx))
        template_doc = Document(str(template_out))
        source_styles = {s.name for s in source_doc.styles}
        template_styles = {s.name for s in template_doc.styles}

        assert source_styles == template_styles

    def test_extract_styles_preserved_count(
        self, source_docx: Path, tmp_path: Path
    ) -> None:
        """``styles_preserved`` equals the number of styles in the source document."""
        from docx import Document

        from markidocx.templates import extract_template

        template_out = tmp_path / "template.docx"
        result = extract_template(source_docx, template_out)

        source_doc = Document(str(source_docx))
        assert result.styles_preserved == len(list(source_doc.styles))

    def test_extraction_idempotent(self, source_docx: Path, tmp_path: Path) -> None:
        """Extracting an already-empty template is a no-op (same style set)."""
        from docx import Document

        from markidocx.templates import extract_template

        template_a = tmp_path / "template_a.docx"
        template_b = tmp_path / "template_b.docx"

        # Second extraction takes the first extraction's output as its input.
        extract_template(source_docx, template_a)
        extract_template(template_a, template_b)

        doc_a = Document(str(template_a))
        doc_b = Document(str(template_b))

        styles_a = {s.name for s in doc_a.styles}
        styles_b = {s.name for s in doc_b.styles}
        assert styles_a == styles_b
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Content extraction via import
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestContentExtraction:
    """Tests that importing the fixture DOCX yields Markdown content."""

    def test_import_extracts_headings(self, source_docx: Path, tmp_path: Path) -> None:
        """Importing the fixture DOCX produces Markdown with expected headings."""
        from markidocx.importer import import_document
        from markidocx.manifest import load_manifest

        manifest_text = textwrap.dedent("""\
            project:
              name: "word-first-test"
              feature_level: level1
              family: article
            sources:
              - path: doc.md
            output:
              dir: ./dist
            """)
        # Lay out a minimal project: an empty source file, the manifest, and the output dir.
        (tmp_path / "doc.md").write_text("", encoding="utf-8")
        (tmp_path / "manifest.yaml").write_text(manifest_text, encoding="utf-8")
        (tmp_path / "dist").mkdir()
        m = load_manifest(tmp_path / "manifest.yaml")

        result = import_document(m, source_docx)
        assert result.success
        # Fail with a clear message instead of an IndexError on the lookup below.
        assert result.output_files, "Import reported success but produced no output files"

        content_md = result.output_files[0].read_text(encoding="utf-8")
        # Case-insensitive match; this also covers the exact-case "Introduction".
        assert "introduction" in content_md.lower()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Full word-first round-trip
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class TestWordFirstRoundTrip:
    """End-to-end Word-first regression: extract, import, build, reimport, compare."""

    @staticmethod
    def _write_project(root: Path, name: str, content: str = "") -> Path:
        """Create a one-source project in the new directory *root*.

        Writes ``content.md`` (holding *content*), ``manifest.yaml`` (with the
        project named *name*) and an empty ``dist`` directory, then returns the
        path of the manifest file.
        """
        manifest_text = textwrap.dedent(f"""\
            project:
              name: "{name}"
              feature_level: level1
              family: article
            sources:
              - path: content.md
            output:
              dir: ./dist
            """)
        root.mkdir()
        (root / "content.md").write_text(content, encoding="utf-8")
        manifest_path = root / "manifest.yaml"
        manifest_path.write_text(manifest_text, encoding="utf-8")
        (root / "dist").mkdir()
        return manifest_path

    def test_word_first_roundtrip(self, source_docx: Path, tmp_path: Path) -> None:
        """Run the full Word-first round-trip and check for structural drift.

        Extracts the template, imports the source to Markdown, rebuilds a DOCX
        from that Markdown, reimports the rebuilt DOCX, and asserts that the
        comparison reports no broken headings or citations.
        """
        from docx import Document

        from markidocx.builder import build_document
        from markidocx.differ import compare
        from markidocx.importer import import_document
        from markidocx.manifest import load_manifest
        from markidocx.templates import extract_template

        # Step 1: extract template
        template_out = tmp_path / "template.docx"
        extract_template(source_docx, template_out)
        assert template_out.exists()

        # Step 2: assert template has zero body content paragraphs
        template_doc = Document(str(template_out))
        non_empty = [p for p in template_doc.paragraphs if p.text.strip()]
        assert non_empty == []

        # Step 3: import source to get content.md
        import_manifest = self._write_project(tmp_path / "import_step", "word-first-import")
        m_import = load_manifest(import_manifest)

        import_result = import_document(m_import, source_docx)
        assert import_result.success
        content_md = import_result.output_files[0].read_text(encoding="utf-8")
        assert content_md.strip(), "Extracted content must be non-empty"

        # Step 4: assert content.md has expected headings
        assert "Introduction" in content_md or "Heading" in content_md or "#" in content_md

        # Step 5: build with content + template → rebuilt.docx
        # NOTE(review): the manifest written for this step does not reference
        # template_out, so the extracted template does not appear to reach the
        # build — confirm whether the manifest should point at it.
        build_manifest = self._write_project(
            tmp_path / "build_step", "word-first-build", content_md
        )
        m_build = load_manifest(build_manifest)

        build_result = build_document(m_build)
        assert build_result.success
        rebuilt_docx = Path(build_result.output_path)

        # Step 6: reimport rebuilt.docx → reimported.md
        # The reimport project deliberately reuses the import step's project name.
        reimport_manifest = self._write_project(
            tmp_path / "reimport_step", "word-first-import"
        )
        m_reimport = load_manifest(reimport_manifest)

        reimport_result = import_document(m_reimport, rebuilt_docx)
        assert reimport_result.success
        reimported_md = reimport_result.output_files[0].read_text(encoding="utf-8")

        # Step 7: assert no heading or citation structural drift
        report = compare(content_md, reimported_md)
        # Only fail on heading-level drift (not table/misc artefacts from fixture)
        heading_broken = [b for b in report.broken if b.startswith("heading:")]
        citation_broken = [b for b in report.broken if b.startswith("citation:")]
        assert not heading_broken, f"Heading drift: {heading_broken}"
        assert not citation_broken, f"Citation drift: {citation_broken}"
|