Files
marki-docx/src/markidocx/figures.py
Bernd Worsch ac442ea41f feat: WP-0003 complete — LEVEL3 advanced features + error framework
Implements full LEVEL3 feature set: cross-references (xref.py), numbered
figures (figures.py), auto-diagrams (diagrams.py), bibliography/citations
(bibliography.py), LEVEL3 capability detection (level3.py), and structured
error/warning records (errors.py). Builder, importer, and differ updated for
LEVEL3 round-trip support. REST and MCP interfaces updated with structured
warning records. 259 tests passing.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-16 10:51:38 +00:00

148 lines
4.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Numbered figure support for LEVEL3 markidocx (FR-532, FR-541).
Handles round-trip of captioned numbered figures between Markdown and DOCX.
Markdown syntax:
![Caption text](path/to/image.png){#fig:label}
DOCX representation:
[image paragraph or placeholder]
[caption paragraph: "Figure N — Caption text"]
(with alt-text marker: "figure-source:path/to/image.png#fig:label")
"""
from __future__ import annotations
import re
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from docx.document import Document as DocxDocument
# Markdown figure declaration occupying a whole line:
#   ![Caption](path){#fig:label}
# Capture groups: 1 = caption (may be empty), 2 = image path,
# 3 = label including its "fig:" prefix.
# re.MULTILINE makes ^ and $ anchor at every line boundary, so finditer()
# can pick out declaration lines from a multi-line Markdown document.
FIGURE_RE = re.compile(
r"^!\[([^\]]*)\]\(([^)]+)\)\{#(fig:[\w:-]+)\}$",
re.MULTILINE,
)
# Caption paragraph pattern in imported DOCX: "Figure N <dash> Caption".
# Capture groups: 1 = figure number, 2 = caption text.
# The separator may be an em dash or an ASCII hyphen and must have
# whitespace on both sides.
CAPTION_RE = re.compile(r"^Figure\s+(\d+)\s+[—\-]\s+(.+)$")
# Prefix of the source-intent marker that preserves the original path and
# label for round-trip (FR-534). A full marker reads
# "figure-source:<path>#<label>".
ALT_TEXT_MARKER_PREFIX = "figure-source:"
def is_figure_paragraph(text: str) -> bool:
    """Tell whether *text* begins with a figure declaration line.

    Surrounding whitespace is ignored. The check is anchored at the start
    of the stripped text via ``FIGURE_RE.match``.

    NOTE(review): FIGURE_RE is compiled with re.MULTILINE, so for
    multi-line input a declaration on the first line is enough for a
    True result — confirm callers only pass single paragraphs.
    """
    candidate = text.strip()
    return FIGURE_RE.match(candidate) is not None
def parse_figure(text: str) -> tuple[str, str, str] | None:
    """Split a figure declaration into its three components.

    Leading and trailing whitespace in *text* is ignored.

    Returns:
        ``(caption, path, label)`` when *text* matches ``FIGURE_RE``,
        otherwise ``None``. The label keeps its ``fig:`` prefix.
    """
    found = FIGURE_RE.match(text.strip())
    if found is None:
        return None
    # FIGURE_RE defines exactly three groups: caption, path, label.
    caption, path, label = found.groups()
    return caption, path, label
def render_figure(
    doc: DocxDocument,
    caption: str,
    path: str,
    label: str,
    figure_number: int,
) -> None:
    """Render a figure declaration into *doc* (FR-532).

    Appends three "Normal"-styled paragraphs, in this order:

    1. An italic placeholder ``[Figure: <path>]``. The image itself is not
       embedded (that requires the file to exist and is omitted here for
       portability).
    2. A marker paragraph ``figure-source:<path>#<label>`` set in 1 pt
       type, carrying the source path and label so the importer can
       reconstruct the figure (FR-534).
    3. A caption paragraph ``Figure N — Caption``.

    Args:
        doc: Document the paragraphs are appended to.
        caption: Caption text shown after the figure number.
        path: Image path as written in the Markdown source.
        label: Figure label (e.g. ``fig:overview``).
        figure_number: Number displayed in the caption.
    """
    # python-docx is only needed when actually rendering; the module-level
    # import of the Document type is guarded by TYPE_CHECKING.
    from docx.shared import Pt

    # 1. Visible stand-in for the image.
    placeholder = doc.add_paragraph(style="Normal")
    placeholder_run = placeholder.add_run(f"[Figure: {path}]")
    placeholder_run.italic = True

    # 2. Source-intent marker so the importer can recover path and label.
    alt_marker = f"{ALT_TEXT_MARKER_PREFIX}{path}#{label}"
    marker_para = doc.add_paragraph(style="Normal")
    marker_run = marker_para.add_run(alt_marker)
    # Shrink the marker to 1 pt so it is practically invisible while the
    # text stays in the document for round-trip.
    marker_run.font.size = Pt(1)
    marker_run.font.color.rgb = None  # no explicit colour; inherit default

    # 3. Caption. The " — " separator is required: without it the text
    # reads "Figure 1Caption" and no longer matches the documented
    # "Figure N — Caption" form that the caption pattern expects on import.
    caption_para = doc.add_paragraph(style="Normal")
    caption_para.add_run(f"Figure {figure_number} — {caption}")
def extract_figures_from_md(text: str) -> list[tuple[str, str, str]]:
    """Collect every figure declaration found in a Markdown document.

    Scans *text* with ``FIGURE_RE`` and returns one ``(caption, path,
    label)`` tuple per match, in document order. The list is empty when
    nothing matches.
    """
    declarations = FIGURE_RE.finditer(text)
    return [hit.group(1, 2, 3) for hit in declarations]
# ---------------------------------------------------------------------------
# Importer helpers
# ---------------------------------------------------------------------------
def is_caption_paragraph(text: str) -> bool:
    """Tell whether *text* has the shape of a figure caption.

    Surrounding whitespace is ignored; the remainder is tested against
    ``CAPTION_RE`` ("Figure N <dash> caption").
    """
    candidate = text.strip()
    return CAPTION_RE.match(candidate) is not None
def is_alt_text_marker(text: str) -> bool:
    """Tell whether *text* is a figure-source marker.

    True when *text*, with surrounding whitespace removed, starts with
    ``ALT_TEXT_MARKER_PREFIX``.
    """
    stripped = text.strip()
    return stripped.startswith(ALT_TEXT_MARKER_PREFIX)
def parse_alt_text_marker(text: str) -> tuple[str, str] | None:
    """Split a figure-source marker into ``(path, label)``.

    The marker has the form ``<prefix><path>#<label>``; the split happens
    at the last ``#`` so a path may itself contain ``#``. When the marker
    carries no ``#`` at all, the whole remainder is returned as the path
    with an empty label.

    Returns:
        ``(path, label)``, or ``None`` when *text* (ignoring surrounding
        whitespace) does not start with ``ALT_TEXT_MARKER_PREFIX``.
    """
    marker = text.strip()
    if not marker.startswith(ALT_TEXT_MARKER_PREFIX):
        return None

    payload = marker[len(ALT_TEXT_MARKER_PREFIX):]
    path, separator, label = payload.rpartition("#")
    if not separator:
        # No "#" present: everything after the prefix is the path.
        return payload, ""
    return path, label
def reconstruct_figure_md(caption: str, path: str, label: str) -> str:
    """Build the Markdown figure declaration for the given parts.

    The result has the form ``![caption](path){#label}``. The arguments
    are inserted verbatim; no escaping or validation is performed.
    """
    image_part = f"![{caption}]({path})"
    anchor_part = f"{{#{label}}}"
    return image_part + anchor_part
# ---------------------------------------------------------------------------
# Differ helpers
# ---------------------------------------------------------------------------
# Matches a "{#fig:label}" anchor anywhere in the text; group 1 is the label
# including its "fig:" prefix.
FIGURE_LABEL_RE = re.compile(r"\{#(fig:[\w:-]+)\}")
# Matches a complete figure declaration "![caption](path){#fig:label}"
# anywhere in the text (not anchored to line boundaries); group 1 is the
# caption.
FIGURE_CAPTION_MD_RE = re.compile(r"!\[([^\]]*)\]\([^)]+\)\{#fig:[\w:-]+\}")
def extract_figure_labels(text: str) -> set[str]:
    """Return the distinct ``fig:`` labels declared in Markdown *text*.

    Every ``{#fig:label}`` anchor matched by ``FIGURE_LABEL_RE``
    contributes its label (including the ``fig:`` prefix); duplicates
    collapse because the result is a set.
    """
    return {anchor.group(1) for anchor in FIGURE_LABEL_RE.finditer(text)}
def extract_figure_captions(text: str) -> list[str]:
    """Return the captions of all figure declarations in Markdown *text*.

    Captions are listed in document order, one per match of
    ``FIGURE_CAPTION_MD_RE``; an empty caption yields an empty string.
    """
    # The pattern has a single capture group (the caption), so findall()
    # returns exactly the list of captions.
    captions = FIGURE_CAPTION_MD_RE.findall(text)
    return captions