marki-docx/src/markidocx/importer.py

"""DOCX→Markdown importer for markidocx (FR-300, FR-400)."""

from __future__ import annotations

import re
from dataclasses import dataclass, field
from pathlib import Path

from docx import Document
from docx.document import Document as DocxDocument
from docx.table import Table
from docx.text.paragraph import Paragraph

from markidocx.manifest import Manifest

HEADING_STYLE_RE = re.compile(r"^Heading (\d+)$", re.IGNORECASE)
LIST_BULLET_RE = re.compile(r"^List Bullet", re.IGNORECASE)
LIST_NUMBER_RE = re.compile(r"^List Number", re.IGNORECASE)


@dataclass
class ImportResult:
    """Outcome of a DOCX import, as returned by ``import_document``."""

    # False when the DOCX was missing or could not be opened; True once
    # Markdown has been written somewhere.
    success: bool
    # Every file written by the import (empty on failure).
    output_files: list[Path]
    mapping_status: str  # "redistributed" | "merged" | "failed"
    # Human-readable notes collected during the import; on failure this
    # holds the single error description.
    warnings: list[str] = field(default_factory=list)


def import_document(manifest: Manifest, docx_path: Path) -> ImportResult:
    """Convert *docx_path* to Markdown and write it into the project.

    Where the Markdown ends up depends on the manifest:

    * exactly one source file: that file is overwritten with the whole text;
    * several source files and the same number of H1-delimited sections:
      each section overwrites the source at the same position;
    * anything else: the whole text goes to ``imported_merged.md`` inside
      ``manifest.output_dir`` and a warning records the mismatch.

    A missing or unreadable DOCX yields a ``"failed"`` result instead of an
    exception; errors raised while writing files are not caught here.
    """

    def _failed(reason: str) -> ImportResult:
        # Failure results never carry output files, only the reason.
        return ImportResult(
            success=False,
            output_files=[],
            mapping_status="failed",
            warnings=[reason],
        )

    if not docx_path.exists():
        return _failed(f"DOCX file not found: {docx_path}")

    try:
        doc = Document(str(docx_path))
    except Exception as exc:
        return _failed(f"Could not open DOCX: {exc}")

    notes: list[str] = []
    markdown = _docx_to_markdown(doc, notes)

    # The output directory is created up front, even when the text ends up
    # in the source files rather than in the merged fallback.
    manifest.output_dir.mkdir(parents=True, exist_ok=True)

    sources = manifest.sources
    source_count = len(sources)

    if source_count == 1:
        # Single source: no splitting needed (FR-305, FR-405).
        target = sources[0].path
        target.write_text(markdown, encoding="utf-8")
        written = [target]
        status = "redistributed"
    else:
        sections = _split_by_h1(markdown)
        if len(sections) == source_count:
            # One H1 section per source, matched by position.
            written = []
            for source, body in zip(sources, sections, strict=True):
                source.path.write_text(body, encoding="utf-8")
                written.append(source.path)
            status = "redistributed"
        else:
            # Section count does not line up with the sources (FR-406).
            notes.append(
                f"Could not redistribute to {source_count} source files "
                f"(found {len(sections)} H1 sections); writing merged output"
            )
            merged_file = manifest.output_dir / "imported_merged.md"
            merged_file.write_text(markdown, encoding="utf-8")
            written = [merged_file]
            status = "merged"

    return ImportResult(
        success=True,
        output_files=written,
        mapping_status=status,
        warnings=notes,
    )


# ---------------------------------------------------------------------------
# DOCX → Markdown conversion
# ---------------------------------------------------------------------------

def _docx_to_markdown(doc: DocxDocument, warnings: list[str]) -> str:
    """Render the body of *doc* as Markdown.

    Each paragraph or table becomes one chunk; chunks are separated by a
    blank line.  Paragraphs for which ``_paragraph_to_md`` returns ``None``
    are left out.  *warnings* is handed through to the paragraph converter.
    """

    def _chunks():
        for item in _iter_blocks(doc):
            if isinstance(item, Paragraph):
                rendered = _paragraph_to_md(item, warnings)
                if rendered is None:
                    continue
                yield rendered
            elif isinstance(item, Table):
                yield _table_to_md(item)

    return "\n\n".join(_chunks())


def _iter_blocks(doc: DocxDocument):
    """Yield Paragraph and Table objects from the document body in order.

    Only direct children of the body element are considered.  Children
    whose local tag name is neither ``p`` nor ``tbl`` are skipped, as are
    non-element nodes.
    """
    for child in doc.element.body:
        tag = child.tag
        if not isinstance(tag, str):
            # lxml represents XML comments and processing instructions as
            # children whose ``tag`` is a factory function, not a string;
            # they carry no document content, so ignore them.
            continue
        # Drop the "{namespace}" prefix, if any, to get the local name.
        local_name = tag.rpartition("}")[2]
        if local_name == "p":
            yield Paragraph(child, doc)
        elif local_name == "tbl":
            yield Table(child, doc)


def _paragraph_to_md(para: Paragraph, warnings: list[str]) -> str | None:
    """Convert a paragraph to a Markdown chunk.

    The paragraph's style name decides the shape of the output:

    * ``Heading N``  -> ATX heading with N ``#`` characters (clamped to 1-6);
    * ``List Bullet*`` -> ``- text``;
    * ``List Number*`` -> ``1. text``;
    * anything else  -> the runs rendered with inline formatting.

    Returns ``None`` for paragraphs whose text is empty or whitespace only,
    whatever their style, so that no bare ``#`` / ``-`` markers are emitted.
    A note is appended to *warnings* when a heading level has to be clamped.
    """
    text = para.text.strip()
    if not text:
        # Checked before the style dispatch: an empty heading would
        # otherwise yield "# ", which is later mistaken for an H1 boundary.
        return None

    style = para.style
    # A style may exist but carry no name; treat that like "Normal".
    style_name = (style.name if style else None) or "Normal"

    # Headings
    heading = HEADING_STYLE_RE.match(style_name)
    if heading:
        level = int(heading.group(1))
        if not 1 <= level <= 6:
            # Markdown ATX headings only exist for levels 1-6.
            clamped = min(max(level, 1), 6)
            warnings.append(
                f"Heading level {level} is outside Markdown's 1-6 range; "
                f"using level {clamped}: {text!r}"
            )
            level = clamped
        return f"{'#' * level} {text}"

    # Lists
    if LIST_BULLET_RE.match(style_name):
        return f"- {text}"
    if LIST_NUMBER_RE.match(style_name):
        return f"1. {text}"

    # Normal text — preserve inline markup
    return _runs_to_md(para)


def _runs_to_md(para: Paragraph) -> str:
    """Convert paragraph runs to Markdown with inline formatting.

    Bold and italic runs are wrapped in ``**`` / ``*`` (``***`` for both);
    otherwise a run whose font name contains "Courier" becomes a code span.
    Runs with empty text are ignored.

    Consecutive runs with the same formatting are merged before wrapping,
    so text that Word split into several runs does not produce
    back-to-back markers such as ``**a****b**``.  For emphasis, leading and
    trailing whitespace is moved outside the markers because Markdown does
    not treat ``** text **`` as emphasis.
    """
    # Pass 1: classify each run and merge neighbours sharing a marker.
    spans: list[list[str]] = []  # each entry is [marker, text]
    for run in para.runs:
        text = run.text
        if not text:
            continue
        if run.bold and run.italic:
            marker = "***"
        elif run.bold:
            marker = "**"
        elif run.italic:
            marker = "*"
        elif run.font.name and "Courier" in run.font.name:
            marker = "`"
        else:
            marker = ""
        if spans and spans[-1][0] == marker:
            spans[-1][1] += text
        else:
            spans.append([marker, text])

    # Pass 2: wrap each merged span.
    parts: list[str] = []
    for marker, text in spans:
        if not marker:
            parts.append(text)
            continue
        if marker == "`":
            # Whitespace inside a code span is content; keep it verbatim.
            parts.append(f"`{text}`")
            continue
        core = text.strip()
        if not core:
            # Whitespace-only emphasis has nothing to emphasise.
            parts.append(text)
            continue
        lead = text[: len(text) - len(text.lstrip())]
        trail = text[len(text.rstrip()):]
        parts.append(f"{lead}{marker}{core}{marker}{trail}")
    return "".join(parts)


def _table_to_md(table: Table) -> str:
    """Convert a DOCX table to a GFM Markdown table.

    The first row becomes the header.  Rows with fewer cells than the
    widest row are padded with empty cells.  Within a cell, ``|`` is
    escaped and line breaks are replaced by ``<br>`` because a GFM table
    row must stay on a single line.  Returns ``""`` for a table without
    rows.
    """
    rows = table.rows
    if not rows:
        return ""

    def _cell_md(raw: str) -> str:
        # Multi-paragraph cells contain newlines; collapse them so the row
        # is not broken across lines, dropping blank lines on the way.
        pieces = (piece.strip() for piece in raw.splitlines())
        joined = "<br>".join(piece for piece in pieces if piece)
        return joined.replace("|", "\\|")

    grid = [[_cell_md(cell.text) for cell in row.cells] for row in rows]
    width = max(len(cells) for cells in grid)

    def _format_row(cells: list[str]) -> str:
        padded = cells + [""] * (width - len(cells))
        return "| " + " | ".join(padded) + " |"

    separator = "| " + " | ".join(["---"] * width) + " |"
    body = [_format_row(cells) for cells in grid[1:]]
    return "\n".join([_format_row(grid[0]), separator, *body])


def _split_by_h1(md_text: str) -> list[str]:
    """Cut *md_text* into sections, starting a new one at every H1 chunk.

    The text is treated as blank-line-separated chunks.  A chunk that
    begins with ``"# "`` opens a new section; any chunks ahead of the first
    such heading form a section of their own.  The result always has at
    least one element (``[""]`` for empty input).
    """
    groups: list[list[str]] = []
    for block_text in md_text.split("\n\n"):
        if not groups or block_text.startswith("# "):
            # The very first chunk, or an H1: open a new section.
            groups.append([block_text])
        else:
            groups[-1].append(block_text)
    return ["\n\n".join(group) for group in groups]