marki-docx/tests/regression/test_word_first_roundtrip.py

"""T06 — End-to-end Word-first round-trip: template extraction and rebuild verification."""

from __future__ import annotations

import textwrap
from pathlib import Path

import pytest

# Word fixture shared by the tests in this module, resolved relative to this
# file so the tests do not depend on the current working directory.
FIXTURE_DOCX = Path(__file__).parent / "fixtures" / "word_first" / "source.docx"


@pytest.fixture(scope="module")
def source_docx() -> Path:
    """Provide the path of the ``source.docx`` fixture, skipping if it is absent.

    When the file does not exist the requesting test is skipped, and the skip
    reason names the generator script that recreates the fixture.
    """
    if FIXTURE_DOCX.exists():
        return FIXTURE_DOCX

    skip_reason = (
        f"Fixture not found: {FIXTURE_DOCX}. "
        "Run tests/regression/fixtures/word_first/generate.py to create it."
    )
    # pytest.skip raises, so nothing after this call is executed.
    pytest.skip(skip_reason)


# ---------------------------------------------------------------------------
# Template extraction tests
# ---------------------------------------------------------------------------


class TestTemplateExtraction:
    """Checks that ``extract_template`` drops body text while keeping styles."""

    @staticmethod
    def _style_names(docx_path: Path) -> set[str]:
        """Return the set of style names found in the document at *docx_path*."""
        from docx import Document

        return {style.name for style in Document(str(docx_path)).styles}

    def test_extract_produces_template_file(self, source_docx: Path, tmp_path: Path) -> None:
        """Extraction writes a non-empty file and reports its path on the result."""
        from markidocx.templates import extract_template

        template_out = tmp_path / "template.docx"
        result = extract_template(source_docx, template_out)

        assert template_out.exists()
        assert template_out.stat().st_size > 0
        assert result.template_path == template_out

    def test_extracted_template_has_zero_body_paragraphs(
        self, source_docx: Path, tmp_path: Path
    ) -> None:
        """The extracted template contains no paragraph with visible text."""
        from docx import Document

        from markidocx.templates import extract_template

        template_out = tmp_path / "template.docx"
        extract_template(source_docx, template_out)

        doc = Document(str(template_out))
        # Empty or whitespace-only paragraphs are tolerated (the extractor is
        # expected to leave one empty paragraph for validity); only paragraphs
        # with visible text count as leftover content.
        # Collect the text itself so a failure shows what leaked through rather
        # than opaque Paragraph object reprs.
        leftover_text = [p.text for p in doc.paragraphs if p.text.strip()]
        assert leftover_text == [], f"Expected no content paragraphs, found: {leftover_text}"

    def test_extracted_template_preserves_styles(
        self, source_docx: Path, tmp_path: Path
    ) -> None:
        """The template exposes exactly the same style names as the source."""
        from markidocx.templates import extract_template

        template_out = tmp_path / "template.docx"
        result = extract_template(source_docx, template_out)

        # The reported count must be positive...
        assert result.styles_preserved > 0

        # ...and the styles must actually be present in the written file.
        assert self._style_names(source_docx) == self._style_names(template_out)

    def test_extract_styles_preserved_count(
        self, source_docx: Path, tmp_path: Path
    ) -> None:
        """``styles_preserved`` equals the number of styles in the source document."""
        from docx import Document

        from markidocx.templates import extract_template

        template_out = tmp_path / "template.docx"
        result = extract_template(source_docx, template_out)

        source_doc = Document(str(source_docx))
        assert result.styles_preserved == len(list(source_doc.styles))

    def test_extraction_idempotent(self, source_docx: Path, tmp_path: Path) -> None:
        """Extracting an already-empty template is a no-op (same style set)."""
        from markidocx.templates import extract_template

        template_a = tmp_path / "template_a.docx"
        template_b = tmp_path / "template_b.docx"

        # Second extraction takes the first extraction's output as its input.
        extract_template(source_docx, template_a)
        extract_template(template_a, template_b)

        assert self._style_names(template_a) == self._style_names(template_b)


# ---------------------------------------------------------------------------
# Content extraction via import
# ---------------------------------------------------------------------------


class TestContentExtraction:
    """Checks the Markdown produced by importing the fixture DOCX."""

    def test_import_extracts_headings(self, source_docx: Path, tmp_path: Path) -> None:
        """Importing the fixture DOCX produces Markdown with expected headings."""

        from markidocx.importer import import_document
        from markidocx.manifest import load_manifest

        manifest_text = textwrap.dedent("""\
            project:
              name: "word-first-test"
              feature_level: level1
              family: article
            sources:
              - path: doc.md
            output:
              dir: ./dist
        """)
        # Minimal project on disk: an empty source file, the manifest, and the
        # output directory named in the manifest.
        (tmp_path / "doc.md").write_text("", encoding="utf-8")
        (tmp_path / "manifest.yaml").write_text(manifest_text, encoding="utf-8")
        (tmp_path / "dist").mkdir()
        m = load_manifest(tmp_path / "manifest.yaml")

        result = import_document(m, source_docx)
        assert result.success
        # Guard the index below so an empty result fails with a clear message
        # instead of an IndexError.
        assert result.output_files, "Import reported success but produced no output files"

        content_md = result.output_files[0].read_text(encoding="utf-8")
        # Case-insensitive match; this also covers the exact-case "Introduction".
        assert "introduction" in content_md.lower(), (
            f"'Introduction' not found in imported Markdown: {content_md[:200]!r}"
        )


# ---------------------------------------------------------------------------
# Full word-first round-trip
# ---------------------------------------------------------------------------


class TestWordFirstRoundTrip:
    """End-to-end check: extract template, import, build, re-import, compare."""

    @staticmethod
    def _make_project(project_dir: Path, project_name: str, content: str):
        """Create a single-source project in *project_dir* and return its manifest.

        Creates *project_dir*, writes ``content.md`` holding *content*, writes
        ``manifest.yaml`` with *project_name* as the project name, creates the
        ``dist`` output directory, and returns the result of ``load_manifest``.
        """
        from markidocx.manifest import load_manifest

        manifest_text = textwrap.dedent(f"""\
            project:
              name: "{project_name}"
              feature_level: level1
              family: article
            sources:
              - path: content.md
            output:
              dir: ./dist
        """)
        project_dir.mkdir()
        (project_dir / "content.md").write_text(content, encoding="utf-8")
        (project_dir / "manifest.yaml").write_text(manifest_text, encoding="utf-8")
        (project_dir / "dist").mkdir()
        return load_manifest(project_dir / "manifest.yaml")

    def test_word_first_roundtrip(self, source_docx: Path, tmp_path: Path) -> None:
        """Full round-trip: extract template + content, rebuild, reimport, check zero structural drift."""

        from docx import Document

        from markidocx.builder import build_document
        from markidocx.differ import compare
        from markidocx.importer import import_document
        from markidocx.templates import extract_template

        # Step 1: extract template
        template_out = tmp_path / "template.docx"
        extract_template(source_docx, template_out)
        assert template_out.exists()

        # Step 2: assert template has zero body content paragraphs
        template_doc = Document(str(template_out))
        leftover_text = [p.text for p in template_doc.paragraphs if p.text.strip()]
        assert leftover_text == [], f"Template still has content: {leftover_text}"

        # Step 3: import source to get content.md
        m_import = self._make_project(tmp_path / "import_step", "word-first-import", "")
        import_result = import_document(m_import, source_docx)
        assert import_result.success
        # Guard the index so an empty result fails clearly instead of raising IndexError.
        assert import_result.output_files, "Import produced no output files"
        content_md = import_result.output_files[0].read_text(encoding="utf-8")
        assert content_md.strip(), "Extracted content must be non-empty"

        # Step 4: assert content.md has expected headings
        # NOTE(review): this check is loose; a single "#" anywhere in the text
        # satisfies it. Tighten once the fixture's heading text is pinned down.
        assert "Introduction" in content_md or "Heading" in content_md or "#" in content_md

        # Step 5: build from the extracted content → rebuilt.docx
        # NOTE(review): the build manifest does not reference template_out, so
        # the template extracted in step 1 is not visibly fed into the build.
        # Confirm whether build_document should be pointed at it.
        m_build = self._make_project(tmp_path / "build_step", "word-first-build", content_md)
        build_result = build_document(m_build)
        assert build_result.success
        rebuilt_docx = Path(build_result.output_path)

        # Step 6: reimport rebuilt.docx → reimported.md
        m_reimport = self._make_project(tmp_path / "reimport_step", "word-first-import", "")
        reimport_result = import_document(m_reimport, rebuilt_docx)
        assert reimport_result.success
        assert reimport_result.output_files, "Re-import produced no output files"
        reimported_md = reimport_result.output_files[0].read_text(encoding="utf-8")

        # Step 7: assert no heading or citation structural drift
        report = compare(content_md, reimported_md)
        # Only fail on heading/citation drift (not table/misc artefacts from fixture)
        heading_broken = [b for b in report.broken if b.startswith("heading:")]
        citation_broken = [b for b in report.broken if b.startswith("citation:")]
        assert not heading_broken, f"Heading drift: {heading_broken}"
        assert not citation_broken, f"Citation drift: {citation_broken}"