"""T06 — End-to-end Word-first round-trip: template extraction and rebuild verification."""

from __future__ import annotations

import textwrap
from pathlib import Path

import pytest

FIXTURE_DOCX = Path(__file__).parent / "fixtures" / "word_first" / "source.docx"

# Minimal manifest shared by every import/build step in this module.
# NOTE(review): the key nesting below (feature_level/family under ``project``,
# list item under ``sources``, ``dir`` under ``output``) was reconstructed from
# a whitespace-collapsed copy of this file — confirm it against the manifest
# schema accepted by ``markidocx.manifest.load_manifest``.
_MANIFEST_TEMPLATE = textwrap.dedent(
    """\
    project:
      name: "{name}"
      feature_level: level1
      family: article
    sources:
      - path: {source}
    output:
      dir: ./dist
    """
)


def _write_project(
    project_dir: Path, *, name: str, source: str, content: str = ""
) -> Path:
    """Create a minimal project directory and return the path of its manifest.

    Writes ``<project_dir>/<source>`` holding *content*, an empty ``dist``
    output directory, and a ``manifest.yaml`` rendered from
    ``_MANIFEST_TEMPLATE`` with the given project *name* and *source* path.
    *project_dir* is created when missing, so the helper works both for
    ``tmp_path`` itself and for fresh sub-directories.
    """
    project_dir.mkdir(parents=True, exist_ok=True)
    (project_dir / source).write_text(content, encoding="utf-8")
    manifest_path = project_dir / "manifest.yaml"
    manifest_path.write_text(
        _MANIFEST_TEMPLATE.format(name=name, source=source), encoding="utf-8"
    )
    (project_dir / "dist").mkdir(exist_ok=True)
    return manifest_path


def _content_paragraph_texts(docx_path: Path) -> list[str]:
    """Return the text of every paragraph in *docx_path* that is not blank."""
    from docx import Document

    document = Document(str(docx_path))
    return [p.text for p in document.paragraphs if p.text.strip()]


def _style_names(docx_path: Path) -> set[str]:
    """Return the set of style names defined in *docx_path*."""
    from docx import Document

    return {style.name for style in Document(str(docx_path)).styles}


@pytest.fixture(scope="module")
def source_docx() -> Path:
    """Return path to the committed source.docx fixture.

    Skips the requesting test when the fixture file is missing.
    """
    if not FIXTURE_DOCX.exists():
        pytest.skip(
            f"Fixture not found: {FIXTURE_DOCX}. "
            "Run tests/regression/fixtures/word_first/generate.py to create it."
        )
    return FIXTURE_DOCX


# ---------------------------------------------------------------------------
# Template extraction tests
# ---------------------------------------------------------------------------


class TestTemplateExtraction:
    """Checks on the template produced by ``extract_template``."""

    def test_extract_produces_template_file(
        self, source_docx: Path, tmp_path: Path
    ) -> None:
        """Extraction writes a non-empty file and reports its path."""
        from markidocx.templates import extract_template

        template_out = tmp_path / "template.docx"
        result = extract_template(source_docx, template_out)

        assert template_out.exists()
        assert template_out.stat().st_size > 0
        assert result.template_path == template_out

    def test_extracted_template_has_zero_body_paragraphs(
        self, source_docx: Path, tmp_path: Path
    ) -> None:
        """The extracted template carries no paragraph with visible text."""
        from markidocx.templates import extract_template

        template_out = tmp_path / "template.docx"
        extract_template(source_docx, template_out)

        # Blank paragraphs are tolerated (e.g. one inserted for validity);
        # only paragraphs with text count as leftover content.
        leftover = _content_paragraph_texts(template_out)
        assert leftover == [], f"Expected no content paragraphs, found: {leftover}"

    def test_extracted_template_preserves_styles(
        self, source_docx: Path, tmp_path: Path
    ) -> None:
        """The template defines exactly the style names of the source."""
        from markidocx.templates import extract_template

        template_out = tmp_path / "template.docx"
        result = extract_template(source_docx, template_out)

        # The reported count must be positive ...
        assert result.styles_preserved > 0
        # ... and the styles must actually be present in the output file.
        assert _style_names(source_docx) == _style_names(template_out)

    def test_extract_styles_preserved_count(
        self, source_docx: Path, tmp_path: Path
    ) -> None:
        """``styles_preserved`` equals the number of styles in the source."""
        from docx import Document
        from markidocx.templates import extract_template

        template_out = tmp_path / "template.docx"
        result = extract_template(source_docx, template_out)

        source_doc = Document(str(source_docx))
        assert result.styles_preserved == len(list(source_doc.styles))

    def test_extraction_idempotent(self, source_docx: Path, tmp_path: Path) -> None:
        """Extracting an already-empty template is a no-op (same style set)."""
        from markidocx.templates import extract_template

        template_a = tmp_path / "template_a.docx"
        template_b = tmp_path / "template_b.docx"
        extract_template(source_docx, template_a)
        extract_template(template_a, template_b)

        assert _style_names(template_a) == _style_names(template_b)


# ---------------------------------------------------------------------------
# Content extraction via import
# ---------------------------------------------------------------------------


class TestContentExtraction:
    """Checks on the Markdown produced by importing the fixture DOCX."""

    def test_import_extracts_headings(self, source_docx: Path, tmp_path: Path) -> None:
        """Importing the fixture DOCX produces Markdown with expected headings."""
        from markidocx.importer import import_document
        from markidocx.manifest import load_manifest

        manifest_path = _write_project(
            tmp_path, name="word-first-test", source="doc.md"
        )
        m = load_manifest(manifest_path)

        result = import_document(m, source_docx)
        assert result.success

        content_md = result.output_files[0].read_text(encoding="utf-8")
        # Case-insensitive match covers both "Introduction" and "introduction".
        assert "introduction" in content_md.lower()


# ---------------------------------------------------------------------------
# Full word-first round-trip
# ---------------------------------------------------------------------------


class TestWordFirstRoundTrip:
    """Extract, import, rebuild and reimport the fixture, then diff the Markdown."""

    def test_word_first_roundtrip(self, source_docx: Path, tmp_path: Path) -> None:
        """Full round-trip with zero heading/citation drift.

        Extract template + content, rebuild, reimport, and check that the
        reimported Markdown shows no structural drift in headings or
        citations relative to the first import.
        """
        from markidocx.builder import build_document
        from markidocx.differ import compare
        from markidocx.importer import import_document
        from markidocx.manifest import load_manifest
        from markidocx.templates import extract_template

        # Step 1: extract template
        template_out = tmp_path / "template.docx"
        extract_template(source_docx, template_out)
        assert template_out.exists()

        # Step 2: assert template has zero body content paragraphs
        leftover = _content_paragraph_texts(template_out)
        assert leftover == [], f"Template still has content paragraphs: {leftover}"

        # Step 3: import source to get content.md
        import_manifest = _write_project(
            tmp_path / "import_step", name="word-first-import", source="content.md"
        )
        m_import = load_manifest(import_manifest)
        import_result = import_document(m_import, source_docx)
        assert import_result.success
        content_md = import_result.output_files[0].read_text(encoding="utf-8")
        assert content_md.strip(), "Extracted content must be non-empty"

        # Step 4: assert content.md has expected headings
        assert (
            "Introduction" in content_md
            or "Heading" in content_md
            or "#" in content_md
        ), "Extracted content contains no recognisable heading"

        # Step 5: build with content + template → rebuilt.docx
        # NOTE(review): the build manifest written here does not reference
        # ``template_out``, so as written the extracted template is not an
        # explicit input of this build — confirm whether the manifest should
        # point at it.
        build_manifest = _write_project(
            tmp_path / "build_step",
            name="word-first-build",
            source="content.md",
            content=content_md,
        )
        m_build = load_manifest(build_manifest)
        build_result = build_document(m_build)
        assert build_result.success
        rebuilt_docx = Path(build_result.output_path)

        # Step 6: reimport rebuilt.docx → reimported.md
        # Reuses the import-step project name, as the original manifest did.
        reimport_manifest = _write_project(
            tmp_path / "reimport_step", name="word-first-import", source="content.md"
        )
        m_reimport = load_manifest(reimport_manifest)
        reimport_result = import_document(m_reimport, rebuilt_docx)
        assert reimport_result.success
        reimported_md = reimport_result.output_files[0].read_text(encoding="utf-8")

        # Step 7: assert no heading or citation structural drift
        report = compare(content_md, reimported_md)
        # Only fail on heading/citation drift (not table/misc artefacts from fixture)
        heading_broken = [b for b in report.broken if b.startswith("heading:")]
        citation_broken = [b for b in report.broken if b.startswith("citation:")]
        assert not heading_broken, f"Heading drift: {heading_broken}"
        assert not citation_broken, f"Citation drift: {citation_broken}"