# marki-docx/src/markidocx/figures.py

"""Numbered figure support for LEVEL3 markidocx (FR-532, FR-541).

Handles round-trip of captioned numbered figures between Markdown and DOCX.

Markdown syntax:
    ![Caption text](path/to/image.png){#fig:label}

DOCX representation (consecutive paragraphs, in this order):
    [image paragraph or placeholder]
    [marker paragraph: "figure-source:path/to/image.png#fig:label"]
    [caption paragraph: "Figure N — Caption text"]
"""

from __future__ import annotations

import re
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from docx.document import Document as DocxDocument

# A Markdown figure occupies a line of its own: ![Caption](path){#fig:label}
# Groups: 1 = caption, 2 = path, 3 = label (including the "fig:" prefix).
# MULTILINE makes ^ and $ anchor at every line, so the pattern can be used
# to scan a whole document as well as a single line.
FIGURE_RE = re.compile(
    r"^!\[([^\]]*)\]\(([^)]+)\)\{#(fig:[\w:-]+)\}$",
    flags=re.MULTILINE,
)

# Text of a caption paragraph in an imported DOCX: "Figure N <dash> Caption".
# Groups: 1 = figure number, 2 = caption text. The separator may be an em
# dash, a hyphen or an en dash.
CAPTION_RE = re.compile(r"^Figure\s+(\d+)\s+[—\-–]\s+(.+)$")

# Prefix of the marker text that carries a figure's source path and label
# through the DOCX, preserving source intent for the importer (FR-534).
ALT_TEXT_MARKER_PREFIX = "figure-source:"


def is_figure_paragraph(text: str) -> bool:
    """Return True if *text* is a standalone figure declaration.

    Leading and trailing whitespace is ignored; everything that remains must
    be the declaration ``![Caption](path){#fig:label}``.
    """
    # FIGURE_RE is compiled with re.MULTILINE, so a plain ``match`` would let
    # ``$`` succeed at the end of the first line and accept text that goes on
    # with further lines.  ``fullmatch`` requires the whole text to match.
    return FIGURE_RE.fullmatch(text.strip()) is not None


def parse_figure(text: str) -> tuple[str, str, str] | None:
    """Parse a standalone figure declaration.

    Leading and trailing whitespace is ignored.

    Returns:
        ``(caption, path, label)`` where *label* includes the ``fig:``
        prefix, or ``None`` if *text* is not exactly one figure declaration.
    """
    # ``fullmatch`` rather than ``match``: with re.MULTILINE the latter would
    # parse the first line and silently discard any lines that follow it.
    m = FIGURE_RE.fullmatch(text.strip())
    if m is None:
        return None
    return m.group(1), m.group(2), m.group(3)


def render_figure(
    doc: DocxDocument,
    caption: str,
    path: str,
    label: str,
    figure_number: int,
) -> None:
    """Render a figure declaration into *doc* (FR-532).

    Appends three ``Normal``-style paragraphs, in this order:

    1. An italic placeholder ``[Figure: <path>]``.  The image itself is not
       embedded: that requires the file to exist and is omitted here for
       portability.
    2. A marker paragraph with the text ``figure-source:<path>#<label>``, set
       at 1 pt, so the importer can reconstruct the figure (FR-534).
    3. The caption paragraph ``Figure N — Caption``.

    Args:
        doc: Document the paragraphs are appended to.
        caption: Caption text shown after the figure number.
        path: Image path as written in the Markdown source.
        label: Figure label as written in the Markdown source.
        figure_number: Number shown in the caption.
    """
    # Kept local because the module imports docx only under TYPE_CHECKING.
    # Done first so a missing dependency fails before *doc* is modified.
    from docx.shared import Pt

    # Image placeholder paragraph.
    placeholder = doc.add_paragraph(style="Normal")
    run = placeholder.add_run(f"[Figure: {path}]")
    run.italic = True

    # Marker paragraph holding the source-intent data for round-trip (FR-534).
    alt_marker = f"{ALT_TEXT_MARKER_PREFIX}{path}#{label}"
    marker_para = doc.add_paragraph(style="Normal")
    marker_run = marker_para.add_run(alt_marker)
    # Make the marker unobtrusive by rendering it very small; the text stays
    # in the document so the round-trip information is kept.
    marker_run.font.size = Pt(1)
    marker_run.font.color.rgb = None  # default color

    # Caption paragraph
    caption_para = doc.add_paragraph(style="Normal")
    caption_para.add_run(f"Figure {figure_number} — {caption}")


def extract_figures_from_md(text: str) -> list[tuple[str, str, str]]:
    """Collect every figure declaration found in Markdown *text*.

    Returns one ``(caption, path, label)`` tuple per declaration, in document
    order.
    """
    # With several capturing groups, findall yields one tuple of the groups
    # per match, which is exactly the (caption, path, label) triple.
    return FIGURE_RE.findall(text)


# ---------------------------------------------------------------------------
# Importer helpers
# ---------------------------------------------------------------------------


def is_caption_paragraph(text: str) -> bool:
    """Tell whether *text*, ignoring surrounding whitespace, matches CAPTION_RE."""
    candidate = text.strip()
    return CAPTION_RE.match(candidate) is not None


def is_alt_text_marker(text: str) -> bool:
    """Tell whether *text*, ignoring surrounding whitespace, starts with the
    figure-source marker prefix."""
    trimmed = text.strip()
    return trimmed.startswith(ALT_TEXT_MARKER_PREFIX)


def parse_alt_text_marker(text: str) -> tuple[str, str] | None:
    """Split a figure-source marker into its ``(path, label)`` parts.

    Surrounding whitespace is ignored.  Whatever follows the last ``#`` after
    the prefix is the label; when there is no ``#`` the label is ``""``.

    Returns ``None`` when *text* does not start with the marker prefix.
    """
    marker = text.strip()
    if not marker.startswith(ALT_TEXT_MARKER_PREFIX):
        return None

    payload = marker[len(ALT_TEXT_MARKER_PREFIX):]
    # rpartition splits on the last "#"; an empty separator means none found.
    path, separator, label = payload.rpartition("#")
    if not separator:
        return payload, ""
    return path, label


def reconstruct_figure_md(caption: str, path: str, label: str) -> str:
    """Reconstruct a Markdown figure declaration from its parts.

    Args:
        caption: Caption text placed between the square brackets.
        path: Image path placed between the parentheses.
        label: Figure label (e.g. ``fig:overview``).  May be empty, which is
            what ``parse_alt_text_marker`` returns for a marker without one.

    Returns:
        ``![caption](path){#label}``, or the plain image ``![caption](path)``
        when *label* is empty, so that no invalid ``{#}`` attribute is
        emitted.
    """
    if not label:
        return f"![{caption}]({path})"
    return f"![{caption}]({path}){{#{label}}}"


# ---------------------------------------------------------------------------
# Differ helpers
# ---------------------------------------------------------------------------

# Matches a {#fig:label} attribute anywhere in the text; group 1 is the label
# including its "fig:" prefix.
FIGURE_LABEL_RE = re.compile(r"\{#(fig:[\w:-]+)\}")
# Matches a complete ![Caption](path){#fig:label} declaration without line
# anchors, so it is found wherever it appears; group 1 is the caption.
FIGURE_CAPTION_MD_RE = re.compile(r"!\[([^\]]*)\]\([^)]+\)\{#fig:[\w:-]+\}")


def extract_figure_labels(text: str) -> set[str]:
    """Return the distinct ``fig:...`` labels declared as {#fig:label} in *text*."""
    return {found.group(1) for found in FIGURE_LABEL_RE.finditer(text)}


def extract_figure_captions(text: str) -> list[str]:
    """Return the caption of every figure declaration in *text*, in order."""
    # The pattern has a single capturing group, so findall yields the
    # caption strings directly.
    return FIGURE_CAPTION_MD_RE.findall(text)