Parse Markdown files into a structured Python object

2026-05-03 21:37:00 +02:00
parent 2676994b11
commit 705f2c6178
15 changed files with 571 additions and 8 deletions
--- a/src/markitect_tool/__init__.py
+++ b/src/markitect_tool/__init__.py
@@ -0,0 +1,21 @@
+"""Structured markdown primitives for markitect-tool."""
+
+from markitect_tool.core import (
+    ContentBlock,
+    Document,
+    Heading,
+    MarkdownParseError,
+    Section,
+    parse_markdown,
+    parse_markdown_file,
+)
+
+# Public API of the package: exactly the names imported from
+# ``markitect_tool.core`` at the top of this module.
+__all__ = [
+    "ContentBlock",
+    "Document",
+    "Heading",
+    "MarkdownParseError",
+    "Section",
+    "parse_markdown",
+    "parse_markdown_file",
+]
--- a/src/markitect_tool/__main__.py
+++ b/src/markitect_tool/__main__.py
@@ -0,0 +1,6 @@
+"""Run the `mkt` CLI with `python -m markitect_tool`."""
+
+from markitect_tool.cli import main
+
+
+# Called unconditionally (there is no ``__name__`` guard), so merely importing
+# this module also starts the CLI.
+main()
--- a/src/markitect_tool/cli/__init__.py
+++ b/src/markitect_tool/cli/__init__.py
@@ -0,0 +1,5 @@
+"""Command-line interface for markitect-tool."""
+
+from markitect_tool.cli.main import main
+
+# Only the ``main`` entry point imported above is re-exported from this package.
+__all__ = ["main"]
--- a/src/markitect_tool/cli/main.py
+++ b/src/markitect_tool/cli/main.py
@@ -0,0 +1,44 @@
+"""`mkt` command entry point."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+import click
+import yaml
+
+from markitect_tool.core import parse_markdown_file
+
+
+# Root command group of the CLI. It has no body of its own; subcommands such
+# as ``parse`` register themselves on it through ``@main.command()``.
+# NOTE(review): the docstring is presumably shown by click as the group's help
+# text, so treat it as user-facing output rather than as a code comment.
+@click.group()
+@click.version_option()
+def main() -> None:
+    """Markdown-native toolkit for structured knowledge artifacts."""
+
+
+def _json_default(value: object) -> str:
+    """Encode frontmatter values that ``json`` cannot serialize natively.
+
+    ``yaml.safe_load`` turns unquoted dates and timestamps (for example
+    ``date: 2024-01-01``) into ``datetime.date`` / ``datetime.datetime``
+    objects, which ``json.dumps`` rejects. They are emitted as ISO 8601
+    strings; any other unsupported type still raises ``TypeError`` so that
+    unexpected values are not silently stringified.
+    """
+    import datetime
+
+    # ``datetime.datetime`` is a subclass of ``datetime.date``.
+    if isinstance(value, datetime.date):
+        return value.isoformat()
+    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")
+
+
+@main.command()
+@click.argument("file", type=click.Path(exists=True, dir_okay=False, path_type=Path))
+@click.option(
+    "--format",
+    "output_format",
+    type=click.Choice(["json", "yaml", "tree"], case_sensitive=False),
+    default="json",
+    show_default=True,
+)
+def parse(file: Path, output_format: str) -> None:
+    """Parse a Markdown file into a structured representation."""
+
+    # The docstring above is kept verbatim because click uses it as help text.
+    #
+    # Output formats:
+    #   json - the full document dictionary, indented, non-ASCII kept as-is
+    #   yaml - the same dictionary as YAML, keys in insertion order
+    #   tree - one line per heading, prefixed with ``#`` repeated ``level`` times
+    try:
+        document = parse_markdown_file(file)
+    except (OSError, ValueError) as exc:
+        # ``MarkdownParseError`` and ``UnicodeDecodeError`` are both
+        # ``ValueError`` subclasses; report them as a regular CLI error
+        # instead of letting a traceback reach the user.
+        raise click.ClickException(str(exc)) from exc
+
+    data = document.to_dict()
+    if output_format == "yaml":
+        # ``allow_unicode`` keeps non-ASCII text readable, matching the
+        # ``ensure_ascii=False`` behaviour of the JSON branch.
+        click.echo(yaml.safe_dump(data, sort_keys=False, allow_unicode=True))
+    elif output_format == "tree":
+        for heading in document.headings:
+            click.echo(f"{'#' * heading.level} {heading.text}")
+    else:
+        click.echo(
+            json.dumps(data, indent=2, ensure_ascii=False, default=_json_default)
+        )
+
+
+# Run the command group only when this module is executed as a script.
+if __name__ == "__main__":
+    main()
--- a/src/markitect_tool/core/__init__.py
+++ b/src/markitect_tool/core/__init__.py
@@ -0,0 +1,14 @@
+"""Core markdown parsing and document model."""
+
+from markitect_tool.core.document import ContentBlock, Document, Heading, Section
+from markitect_tool.core.parser import MarkdownParseError, parse_markdown, parse_markdown_file
+
+# Public API of the core package: the document model classes imported from
+# ``core.document`` plus the parser entry points and error type imported from
+# ``core.parser`` above.
+__all__ = [
+    "ContentBlock",
+    "Document",
+    "Heading",
+    "MarkdownParseError",
+    "Section",
+    "parse_markdown",
+    "parse_markdown_file",
+]
--- a/src/markitect_tool/core/document.py
+++ b/src/markitect_tool/core/document.py
@@ -0,0 +1,72 @@
+"""Structured document model for parsed Markdown."""
+
+from __future__ import annotations
+
+from dataclasses import asdict, dataclass, field
+from typing import Any
+
+
+@dataclass(frozen=True)
+class Heading:
+    """Immutable record of a single Markdown heading.
+
+    ``level`` is the heading depth, ``text`` the heading's text and ``line``
+    the source line on which the heading was found.
+    """
+
+    level: int
+    text: str
+    line: int
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return the heading as a plain dictionary keyed by field name."""
+        return {"level": self.level, "text": self.text, "line": self.line}
+
+
+@dataclass(frozen=True)
+class ContentBlock:
+    """One Markdown content block (paragraph, list, code, heading, ...).
+
+    ``type`` names the kind of block and ``text`` holds its textual content.
+    The line range and ``heading_level`` are optional and default to ``None``.
+    """
+
+    type: str
+    text: str
+    line_start: int | None = None
+    line_end: int | None = None
+    heading_level: int | None = None
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return the block as a dictionary, leaving out fields that are ``None``."""
+        pairs = (
+            ("type", self.type),
+            ("text", self.text),
+            ("line_start", self.line_start),
+            ("line_end", self.line_end),
+            ("heading_level", self.heading_level),
+        )
+        return {name: value for name, value in pairs if value is not None}
+
+
+@dataclass(frozen=True)
+class Section:
+    """A heading together with the content blocks collected under it."""
+
+    heading: Heading
+    blocks: list[ContentBlock] = field(default_factory=list)
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return ``{"heading": ..., "blocks": [...]}`` built from plain dictionaries."""
+        heading_data = self.heading.to_dict()
+        block_data = [member.to_dict() for member in self.blocks]
+        return {"heading": heading_data, "blocks": block_data}
+
+
+@dataclass(frozen=True)
+class Document:
+    """Everything extracted from one Markdown document.
+
+    Holds the optional source path, the frontmatter mapping, the body text,
+    and the blocks, headings, sections and serialized tokens derived from it.
+    """
+
+    source_path: str | None
+    frontmatter: dict[str, Any]
+    body: str
+    blocks: list[ContentBlock]
+    headings: list[Heading]
+    sections: list[Section]
+    tokens: list[dict[str, Any]]
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return the document as nested plain dictionaries and lists.
+
+        Entries whose value is ``None`` (for example a missing
+        ``source_path``) are omitted from the result.
+        """
+        entries = [
+            ("source_path", self.source_path),
+            ("frontmatter", self.frontmatter),
+            ("body", self.body),
+            ("blocks", [item.to_dict() for item in self.blocks]),
+            ("headings", [item.to_dict() for item in self.headings]),
+            ("sections", [item.to_dict() for item in self.sections]),
+            ("tokens", self.tokens),
+        ]
+        return dict(entry for entry in entries if entry[1] is not None)
--- a/src/markitect_tool/core/parser.py
+++ b/src/markitect_tool/core/parser.py
@@ -0,0 +1,182 @@
+"""Markdown parsing into a stable structured representation."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+import yaml
+from markdown_it import MarkdownIt
+from markdown_it.token import Token
+
+from markitect_tool.core.document import ContentBlock, Document, Heading, Section
+
+
+class MarkdownParseError(ValueError):
+    """Raised when Markdown metadata cannot be parsed safely.
+
+    Subclasses ``ValueError``, so ``except ValueError`` handlers catch it too.
+    In this module it is raised for frontmatter that is not valid YAML or that
+    does not parse to a mapping.
+    """
+
+
+def parse_markdown_file(path: str | Path) -> Document:
+    """Read a Markdown file from disk and parse it into a ``Document``.
+
+    Args:
+        path: Location of the file; stored on the result as ``source_path``.
+
+    Returns:
+        The document produced by ``parse_markdown`` for the file's text.
+
+    Raises:
+        OSError: If the file cannot be read.
+        UnicodeDecodeError: If the file is not valid UTF-8.
+        MarkdownParseError: If the frontmatter is invalid.
+    """
+
+    file_path = Path(path)
+    # ``utf-8-sig`` decodes plain UTF-8 unchanged but drops a leading byte
+    # order mark. With plain ``utf-8`` the BOM stays in the text, so a file
+    # starting with "\ufeff---" is never recognised as having frontmatter.
+    text = file_path.read_text(encoding="utf-8-sig")
+    return parse_markdown(text, source_path=str(file_path))
+
+
+def parse_markdown(markdown: str, source_path: str | None = None) -> Document:
+    """Turn Markdown text into a ``Document``.
+
+    The frontmatter is split off first; the remaining body is tokenized, and
+    blocks, headings and sections are derived from those tokens. Line numbers
+    are shifted by the number of lines the frontmatter occupied.
+    """
+
+    metadata, content, lines_before_body = _split_frontmatter(markdown)
+    token_dicts = _parse_tokens(content)
+    block_list, heading_list = _blocks_and_headings(token_dicts, lines_before_body)
+    section_list = _sections_from_blocks(block_list, heading_list)
+
+    return Document(
+        source_path=source_path,
+        frontmatter=metadata,
+        body=content,
+        blocks=block_list,
+        headings=heading_list,
+        sections=section_list,
+        tokens=token_dicts,
+    )
+
+
+def _split_frontmatter(markdown: str) -> tuple[dict[str, Any], str, int]:
+    """Separate a leading YAML frontmatter block from the Markdown body.
+
+    Frontmatter is recognised only when the very first line is exactly
+    ``---`` and a later line consisting of ``---`` (trailing whitespace
+    allowed) closes it. Both LF and CRLF line endings are accepted, and an
+    empty block (``---`` immediately followed by ``---``) is valid.
+
+    Args:
+        markdown: Full document text.
+
+    Returns:
+        ``(frontmatter, body, body_line_offset)`` where ``body_line_offset``
+        is the number of source lines before the body. Without a complete
+        frontmatter block the result is ``({}, markdown, 0)``.
+
+    Raises:
+        MarkdownParseError: If the frontmatter is not valid YAML or does not
+            parse to a mapping.
+    """
+    opening_end = markdown.find("\n")
+    if opening_end == -1 or markdown[:opening_end].rstrip("\r") != "---":
+        return {}, markdown, 0
+
+    content_start = opening_end + 1
+
+    # Scan whole lines for the closing delimiter. A substring search for
+    # "\n---" would accept lines such as "----" or "---foo" as the closer,
+    # and would miss a closer that directly follows the opening line.
+    cursor = content_start
+    while True:
+        newline = markdown.find("\n", cursor)
+        line_end = len(markdown) if newline == -1 else newline
+        if markdown[cursor:line_end].rstrip() == "---":
+            break
+        if newline == -1:
+            # Reached the end of the text without a closing delimiter.
+            return {}, markdown, 0
+        cursor = newline + 1
+
+    # The body starts after the closing line's newline, if there is one.
+    closing_end = len(markdown) if newline == -1 else newline + 1
+
+    raw_frontmatter = markdown[content_start:cursor]
+    body = markdown[closing_end:]
+    try:
+        data = yaml.safe_load(raw_frontmatter) if raw_frontmatter.strip() else {}
+    except yaml.YAMLError as exc:
+        raise MarkdownParseError(f"Invalid YAML frontmatter: {exc}") from exc
+    if data is None:
+        data = {}
+    if not isinstance(data, dict):
+        raise MarkdownParseError("Frontmatter must be a mapping")
+
+    body_line_offset = markdown.count("\n", 0, closing_end)
+    return data, body, body_line_offset
+
+
+def _parse_tokens(markdown: str) -> list[dict[str, Any]]:
+    """Tokenize Markdown with markdown-it and return the tokens as dictionaries.
+
+    Uses the ``commonmark`` preset with the ``table`` rule enabled.
+    """
+    engine = MarkdownIt("commonmark", {"tables": True}).enable("table")
+    serialized: list[dict[str, Any]] = []
+    for token in engine.parse(markdown):
+        serialized.append(_token_to_dict(token))
+    return serialized
+
+
+def _token_to_dict(token: Token) -> dict[str, Any]:
+    """Convert a token, and recursively its children, into a plain dictionary.
+
+    Attributes whose value is ``None`` are left out; a missing or empty
+    ``children`` list is treated as ``None`` and therefore omitted as well.
+    """
+    nested = token.children
+    children = [_token_to_dict(child) for child in nested] if nested else None
+
+    # Order matters: it determines the key order of the resulting dictionary.
+    attributes = (
+        ("type", token.type),
+        ("tag", token.tag),
+        ("attrs", token.attrs),
+        ("map", token.map),
+        ("nesting", token.nesting),
+        ("level", token.level),
+        ("children", children),
+        ("content", token.content),
+        ("markup", token.markup),
+        ("info", token.info),
+        ("meta", token.meta),
+        ("block", token.block),
+        ("hidden", token.hidden),
+    )
+    return {name: value for name, value in attributes if value is not None}
+
+
+def _blocks_and_headings(
+    tokens: list[dict[str, Any]], line_offset: int
+) -> tuple[list[ContentBlock], list[Heading]]:
+    """Collect content blocks and headings from the serialized token list.
+
+    Every ``heading_open`` token yields both a ``Heading`` and a ``heading``
+    block; paragraph, list, blockquote, fence, code-block and table tokens
+    yield one block each. Tokens are not filtered by nesting ``level``, so
+    matching tokens inside lists or blockquotes are recorded too.
+
+    Args:
+        tokens: Token dictionaries as produced by ``_token_to_dict``.
+        line_offset: Number of source lines preceding the parsed body.
+
+    Returns:
+        ``(blocks, headings)`` in token order.
+    """
+    block_token_types = {
+        "paragraph_open",
+        "bullet_list_open",
+        "ordered_list_open",
+        "blockquote_open",
+        "fence",
+        "code_block",
+        "table_open",
+    }
+    collected_blocks: list[ContentBlock] = []
+    collected_headings: list[Heading] = []
+
+    for position, token in enumerate(tokens):
+        kind = token["type"]
+        is_heading = kind == "heading_open"
+        if not is_heading and kind not in block_token_types:
+            continue
+
+        first_line, last_line = _line_range(token, line_offset)
+
+        if is_heading:
+            title_token = _next_inline(tokens, position)
+            # The tag looks like "h2"; fall back to level 1 if no digits remain.
+            depth = int(token.get("tag", "h1").lstrip("h") or "1")
+            title = title_token.get("content", "") if title_token else ""
+            collected_headings.append(
+                Heading(level=depth, text=title, line=first_line or 1)
+            )
+            collected_blocks.append(
+                ContentBlock(
+                    type="heading",
+                    text=title,
+                    line_start=first_line,
+                    line_end=last_line,
+                    heading_level=depth,
+                )
+            )
+            continue
+
+        content = token.get("content", "")
+        if not content and kind.endswith("_open"):
+            # Opening tokens carry no text themselves; use the inline token
+            # that immediately follows, if there is one.
+            following = _next_inline(tokens, position)
+            content = following.get("content", "") if following else ""
+        collected_blocks.append(
+            ContentBlock(
+                type=_block_type(kind),
+                text=content,
+                line_start=first_line,
+                line_end=last_line,
+            )
+        )
+
+    return collected_blocks, collected_headings
+
+
+def _next_inline(tokens: list[dict[str, Any]], index: int) -> dict[str, Any] | None:
+    """Return the token right after ``index`` if it is an ``inline`` token, else ``None``."""
+    following = index + 1
+    if following >= len(tokens):
+        return None
+    candidate = tokens[following]
+    return candidate if candidate["type"] == "inline" else None
+
+
+def _line_range(token: dict[str, Any], line_offset: int) -> tuple[int | None, int | None]:
+    """Translate a token's ``map`` into a 1-based ``(first, last)`` source line range.
+
+    ``line_offset`` is added to both ends. Returns ``(None, None)`` when the
+    token has no ``map`` or the map is empty.
+    """
+    span = token.get("map")
+    if span:
+        first = span[0] + line_offset + 1
+        last = span[1] + line_offset
+        return first, last
+    return None, None
+
+
+def _block_type(token_type: str) -> str:
+    """Map a markdown-it token type to the block type name used in ``ContentBlock``.
+
+    ``fence`` and ``code_block`` both become ``code``; the known ``*_open``
+    container tokens lose their ``_open`` suffix; anything else is returned
+    unchanged.
+    """
+    if token_type in ("fence", "code_block"):
+        return "code"
+    known_openers = (
+        "paragraph_open",
+        "bullet_list_open",
+        "ordered_list_open",
+        "blockquote_open",
+        "table_open",
+    )
+    if token_type in known_openers:
+        return token_type[: -len("_open")]
+    return token_type
+
+
+def _sections_from_blocks(
+    blocks: list[ContentBlock], headings: list[Heading]
+) -> list[Section]:
+    """Group blocks into sections, opening a new section at every heading block.
+
+    The n-th ``heading`` block is paired with the n-th entry of ``headings``.
+    Blocks that appear before the first heading belong to no section and are
+    left out.
+    """
+    sections: list[Section] = []
+    headings_used = 0
+
+    for block in blocks:
+        if block.type != "heading":
+            # Attach to the most recently opened section, if any exists yet.
+            if sections:
+                sections[-1].blocks.append(block)
+            continue
+        sections.append(Section(heading=headings[headings_used], blocks=[]))
+        headings_used += 1
+
+    return sections