"""Numbered figure support for LEVEL3 markidocx (FR-532, FR-541).

Handles round-trip of captioned numbered figures between Markdown and DOCX.

Markdown syntax:

    ![Caption text](path/to/image.png){#fig:label}

DOCX representation:

    [image paragraph or placeholder]
    [caption paragraph: "Figure N — Caption text"]
    (with alt-text marker: "figure-source:path/to/image.png#fig:label")
"""

from __future__ import annotations

import re
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from docx.document import Document as DocxDocument

# Markdown figure pattern: ![Caption](path){#fig:label}
# MULTILINE lets ``finditer`` pick up one declaration per line of a document.
FIGURE_RE = re.compile(
    r"^!\[([^\]]*)\]\(([^)]+)\)\{#(fig:[\w:-]+)\}$",
    re.MULTILINE,
)

# Caption paragraph pattern in imported DOCX
CAPTION_RE = re.compile(r"^Figure\s+(\d+)\s+[—\-–]\s+(.+)$")

# Alt-text marker embedded in images to preserve source intent (FR-534)
ALT_TEXT_MARKER_PREFIX = "figure-source:"


def is_figure_paragraph(text: str) -> bool:
    """Return True if *text* is a standalone figure declaration.

    Surrounding whitespace is ignored.  The whole (stripped) text must be
    the declaration: because ``FIGURE_RE`` is compiled with ``re.MULTILINE``
    a plain ``match`` would also accept a figure line that is followed by
    further lines, so ``fullmatch`` is used instead.
    """
    return FIGURE_RE.fullmatch(text.strip()) is not None


def parse_figure(text: str) -> tuple[str, str, str] | None:
    """Parse a standalone figure declaration.

    Returns ``(caption, path, label)``, or ``None`` when the stripped *text*
    is not exactly one figure declaration (same rule as
    ``is_figure_paragraph``).
    """
    found = FIGURE_RE.fullmatch(text.strip())
    if found is None:
        return None
    caption, path, label = found.groups()
    return caption, path, label


def render_figure(
    doc: DocxDocument,
    caption: str,
    path: str,
    label: str,
    figure_number: int,
) -> None:
    """Render a figure declaration into *doc* (FR-532).

    Appends three "Normal"-styled paragraphs, in this order:

    1. An italic placeholder ``[Figure: <path>]`` (actual image embedding
       requires the file to exist and is omitted here for portability).
    2. A 1pt marker paragraph ``figure-source:<path>#<label>`` so the
       importer can reconstruct the figure (FR-534).
    3. A caption paragraph ``Figure N — Caption``.
    """
    # Imported lazily so the pure-text helpers in this module stay usable
    # without python-docx installed.
    from docx.shared import Pt

    # Image placeholder paragraph.
    placeholder = doc.add_paragraph(style="Normal")
    placeholder_run = placeholder.add_run(f"[Figure: {path}]")
    placeholder_run.italic = True

    # Source-intent marker (alt-text equivalent for round-trip).  It is kept
    # as real text but shrunk to 1pt so it is visually unobtrusive.
    marker_para = doc.add_paragraph(style="Normal")
    marker_run = marker_para.add_run(f"{ALT_TEXT_MARKER_PREFIX}{path}#{label}")
    marker_run.font.size = Pt(1)
    marker_run.font.color.rgb = None  # default color

    # Caption paragraph.
    caption_para = doc.add_paragraph(style="Normal")
    caption_para.add_run(f"Figure {figure_number} — {caption}")


def extract_figures_from_md(text: str) -> list[tuple[str, str, str]]:
    """Extract all figure declarations from Markdown text.

    Each declaration must occupy a line of its own.  Returns a list of
    ``(caption, path, label)`` tuples in document order.
    """
    return [
        (found.group(1), found.group(2), found.group(3))
        for found in FIGURE_RE.finditer(text)
    ]


# ---------------------------------------------------------------------------
# Importer helpers
# ---------------------------------------------------------------------------


def is_caption_paragraph(text: str) -> bool:
    """Return True if *text* looks like a figure caption ("Figure N — ...")."""
    return CAPTION_RE.match(text.strip()) is not None


def is_alt_text_marker(text: str) -> bool:
    """Return True if *text* is a figure-source alt-text marker."""
    return text.strip().startswith(ALT_TEXT_MARKER_PREFIX)


def parse_alt_text_marker(text: str) -> tuple[str, str] | None:
    """Parse a figure-source marker into ``(path, label)``.

    The label is whatever follows the *last* ``#``; when the marker has no
    ``#`` the label is the empty string.  Returns ``None`` if the text is
    not a valid marker.
    """
    stripped = text.strip()
    if not stripped.startswith(ALT_TEXT_MARKER_PREFIX):
        return None
    payload = stripped[len(ALT_TEXT_MARKER_PREFIX):]
    path, separator, label = payload.rpartition("#")
    if not separator:
        # No label recorded: the whole payload is the path.
        return payload, ""
    return path, label


def reconstruct_figure_md(caption: str, path: str, label: str) -> str:
    """Reconstruct a Markdown figure declaration from its parts.

    When *label* is empty (as returned by ``parse_alt_text_marker`` for a
    marker without a label) the ``{#...}`` attribute block is omitted
    rather than emitting a malformed ``{#}``.
    """
    image = f"![{caption}]({path})"
    if not label:
        return image
    return f"{image}{{#{label}}}"


# ---------------------------------------------------------------------------
# Differ helpers
# ---------------------------------------------------------------------------

FIGURE_LABEL_RE = re.compile(r"\{#(fig:[\w:-]+)\}")
FIGURE_CAPTION_MD_RE = re.compile(r"!\[([^\]]*)\]\([^)]+\)\{#fig:[\w:-]+\}")


def extract_figure_labels(text: str) -> set[str]:
    """Extract ``{#fig:label}`` declarations from Markdown text.

    Returns the set of labels (including the ``fig:`` prefix).
    """
    return set(FIGURE_LABEL_RE.findall(text))


def extract_figure_captions(text: str) -> list[str]:
    """Extract captions from figure declarations in Markdown text.

    Unlike ``extract_figures_from_md`` the declarations may appear anywhere
    in a line.  Captions are returned in document order.
    """
    return [found.group(1) for found in FIGURE_CAPTION_MD_RE.finditer(text)]