Parse Markdown files into a structured Python object

This commit is contained in:
2026-05-03 21:37:00 +02:00
parent 2676994b11
commit 705f2c6178
15 changed files with 571 additions and 8 deletions

View File

@@ -0,0 +1,21 @@
"""Structured markdown primitives for markitect-tool."""
from markitect_tool.core import (
ContentBlock,
Document,
Heading,
MarkdownParseError,
Section,
parse_markdown,
parse_markdown_file,
)
# Public API of the top-level package: exactly the names imported from
# markitect_tool.core above, so `from markitect_tool import *` exposes
# the document model, the parse error, and the two parse functions.
__all__ = [
    "ContentBlock",
    "Document",
    "Heading",
    "MarkdownParseError",
    "Section",
    "parse_markdown",
    "parse_markdown_file",
]

View File

@@ -0,0 +1,6 @@
"""Run the `mkt` CLI with `python -m markitect_tool`."""
from markitect_tool.cli import main

# Deliberately unguarded: this module's only job is to start the CLI when the
# package is executed with `python -m`, so the entry point is called directly.
main()

View File

@@ -0,0 +1,5 @@
"""Command-line interface for markitect-tool."""
from markitect_tool.cli.main import main
# The CLI package exposes only the `main` entry point imported above.
__all__ = ["main"]

View File

@@ -0,0 +1,44 @@
"""`mkt` command entry point."""
from __future__ import annotations
import json
from pathlib import Path
import click
import yaml
from markitect_tool.core import parse_markdown_file
# Root command group of the CLI; subcommands register themselves on it with
# `@main.command()`. `version_option()` adds a `--version` flag.
# The docstring doubles as the help text click prints for the group, so it is
# user-facing output rather than an internal comment.
@click.group()
@click.version_option()
def main() -> None:
    """Markdown-native toolkit for structured knowledge artifacts."""
def _json_default(value: object) -> str:
    """Encode frontmatter values that ``json`` cannot serialize natively.

    ``yaml.safe_load`` turns YAML dates and timestamps into ``datetime.date``
    and ``datetime.datetime`` objects; they are emitted as ISO-8601 strings.
    Anything else is rejected with the same ``TypeError`` ``json`` would
    raise, so unexpected types are never stringified silently.
    """
    from datetime import date  # datetime.datetime is a subclass of date

    if isinstance(value, date):
        return value.isoformat()
    raise TypeError(
        f"Object of type {type(value).__name__} is not JSON serializable"
    )


@main.command()
@click.argument("file", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["json", "yaml", "tree"], case_sensitive=False),
    default="json",
    show_default=True,
)
def parse(file: Path, output_format: str) -> None:
    """Parse a Markdown file into a structured representation."""
    # NOTE: the docstring above is the command's help text; keep it stable.
    from markitect_tool.core import MarkdownParseError

    try:
        document = parse_markdown_file(file)
    except (MarkdownParseError, UnicodeDecodeError) as exc:
        # Report bad frontmatter / non-UTF-8 input as a regular CLI error
        # ("Error: ..." with a non-zero exit code) instead of a traceback.
        raise click.ClickException(f"{file}: {exc}") from exc

    if output_format == "tree":
        # Outline view: one line per heading, prefixed with its '#' markers.
        for heading in document.headings:
            click.echo(f"{'#' * heading.level} {heading.text}")
        return

    data = document.to_dict()
    if output_format == "yaml":
        # allow_unicode keeps non-ASCII text readable, matching the JSON
        # branch's ensure_ascii=False.
        click.echo(yaml.safe_dump(data, sort_keys=False, allow_unicode=True))
    else:
        # default= handles date/datetime values produced by YAML frontmatter,
        # which would otherwise make json.dumps raise TypeError.
        click.echo(
            json.dumps(data, indent=2, ensure_ascii=False, default=_json_default)
        )
# Allow running this module directly as a script; importing it has no effect.
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,14 @@
"""Core markdown parsing and document model."""
from markitect_tool.core.document import ContentBlock, Document, Heading, Section
from markitect_tool.core.parser import MarkdownParseError, parse_markdown, parse_markdown_file
# Public API of the core package: the document model classes imported from
# markitect_tool.core.document and the parser entry points / error type
# imported from markitect_tool.core.parser above.
__all__ = [
    "ContentBlock",
    "Document",
    "Heading",
    "MarkdownParseError",
    "Section",
    "parse_markdown",
    "parse_markdown_file",
]

View File

@@ -0,0 +1,72 @@
"""Structured document model for parsed Markdown."""
from __future__ import annotations
from dataclasses import asdict, dataclass, field
from typing import Any
@dataclass(frozen=True)
class Heading:
    """Immutable record of one Markdown heading and where it was found."""

    level: int  # heading depth
    text: str  # heading text
    line: int  # source line the heading was found on

    def to_dict(self) -> dict[str, Any]:
        """Return the heading as a plain ``level`` / ``text`` / ``line`` dict."""
        return {"level": self.level, "text": self.text, "line": self.line}
@dataclass(frozen=True)
class ContentBlock:
    """Immutable description of one Markdown content block."""

    type: str  # block kind, e.g. "paragraph" or "heading"
    text: str  # textual content of the block (may be empty)
    line_start: int | None = None  # first source line, when known
    line_end: int | None = None  # last source line, when known
    heading_level: int | None = None  # only set for heading blocks

    def to_dict(self) -> dict[str, Any]:
        """Return the block as a dict, leaving out every field that is None."""
        serialized: dict[str, Any] = {}
        for name, value in asdict(self).items():
            if value is None:
                continue
            serialized[name] = value
        return serialized
@dataclass(frozen=True)
class Section:
    """A heading plus the content blocks grouped under it."""

    heading: Heading
    # The attribute cannot be rebound (frozen), but the list itself stays
    # mutable, so blocks can still be appended after construction.
    blocks: list[ContentBlock] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return ``{"heading": ..., "blocks": [...]}`` built from plain dicts."""
        heading_data = self.heading.to_dict()
        block_data = []
        for member in self.blocks:
            block_data.append(member.to_dict())
        return {"heading": heading_data, "blocks": block_data}
@dataclass(frozen=True)
class Document:
    """Everything extracted from one Markdown source, in structured form."""

    source_path: str | None  # origin of the text, or None when not from a file
    frontmatter: dict[str, Any]
    body: str
    blocks: list[ContentBlock]
    headings: list[Heading]
    sections: list[Section]
    tokens: list[dict[str, Any]]

    def to_dict(self) -> dict[str, Any]:
        """Return the document as a plain dict, omitting top-level None values.

        Blocks, headings and sections are converted through their own
        ``to_dict``; frontmatter, body and tokens are passed through as-is.
        """
        entries = (
            ("source_path", self.source_path),
            ("frontmatter", self.frontmatter),
            ("body", self.body),
            ("blocks", [item.to_dict() for item in self.blocks]),
            ("headings", [item.to_dict() for item in self.headings]),
            ("sections", [item.to_dict() for item in self.sections]),
            ("tokens", self.tokens),
        )
        result: dict[str, Any] = {}
        for name, value in entries:
            if value is not None:
                result[name] = value
        return result

View File

@@ -0,0 +1,182 @@
"""Markdown parsing into a stable structured representation."""
from __future__ import annotations
from pathlib import Path
from typing import Any
import yaml
from markdown_it import MarkdownIt
from markdown_it.token import Token
from markitect_tool.core.document import ContentBlock, Document, Heading, Section
class MarkdownParseError(ValueError):
    """Signal that a Markdown document's metadata could not be parsed safely.

    Subclasses ``ValueError`` so callers catching ``ValueError`` also catch it.
    """
def parse_markdown_file(path: str | Path) -> Document:
    """Read a Markdown file from disk and parse it into a structured document.

    Args:
        path: Location of the Markdown file, as a string or ``Path``.

    Returns:
        The parsed ``Document``; its ``source_path`` is ``str(Path(path))``.

    Raises:
        OSError: If the file cannot be read.
        UnicodeDecodeError: If the file is not valid UTF-8.
        MarkdownParseError: If the frontmatter is rejected by the parser.
    """
    file_path = Path(path)
    # "utf-8-sig" decodes ordinary UTF-8 exactly like "utf-8" but also drops a
    # leading byte-order mark. Without that, a BOM written by some editors ends
    # up as the first character of the text, so the document no longer starts
    # with "---" (frontmatter is missed) and a first-line heading is not seen.
    text = file_path.read_text(encoding="utf-8-sig")
    return parse_markdown(text, source_path=str(file_path))
def parse_markdown(markdown: str, source_path: str | None = None) -> Document:
    """Turn Markdown text into a ``Document``.

    The text is split into frontmatter and body, the body is tokenized, and
    blocks, headings and sections are derived from those tokens.

    Args:
        markdown: Full Markdown source, optionally starting with frontmatter.
        source_path: Value recorded on the resulting document as its origin.

    Returns:
        A ``Document`` holding frontmatter, body, blocks, headings, sections
        and the serialized tokens.
    """
    metadata, content, first_body_line = _split_frontmatter(markdown)
    token_dicts = _parse_tokens(content)
    # Line numbers are shifted by the number of lines the frontmatter used.
    block_list, heading_list = _blocks_and_headings(token_dicts, first_body_line)
    section_list = _sections_from_blocks(block_list, heading_list)
    fields = {
        "source_path": source_path,
        "frontmatter": metadata,
        "body": content,
        "blocks": block_list,
        "headings": heading_list,
        "sections": section_list,
        "tokens": token_dicts,
    }
    return Document(**fields)
def _split_frontmatter(markdown: str) -> tuple[dict[str, Any], str, int]:
if not markdown.startswith("---\n"):
return {}, markdown, 0
end = markdown.find("\n---", 4)
if end == -1:
return {}, markdown, 0
closing_end = markdown.find("\n", end + 4)
if closing_end == -1:
closing_end = len(markdown)
else:
closing_end += 1
raw_frontmatter = markdown[4:end]
body = markdown[closing_end:]
try:
data = yaml.safe_load(raw_frontmatter) if raw_frontmatter.strip() else {}
except yaml.YAMLError as exc:
raise MarkdownParseError(f"Invalid YAML frontmatter: {exc}") from exc
if data is None:
data = {}
if not isinstance(data, dict):
raise MarkdownParseError("Frontmatter must be a mapping")
body_line_offset = markdown[:closing_end].count("\n")
return data, body, body_line_offset
def _parse_tokens(markdown: str) -> list[dict[str, Any]]:
    """Tokenize Markdown with markdown-it and return the tokens as dicts.

    Uses the "commonmark" preset with the "table" rule switched on.
    """
    engine = MarkdownIt("commonmark", {"tables": True})
    engine.enable("table")
    serialized: list[dict[str, Any]] = []
    for parsed_token in engine.parse(markdown):
        serialized.append(_token_to_dict(parsed_token))
    return serialized
def _token_to_dict(token: Token) -> dict[str, Any]:
data = {
"type": token.type,
"tag": token.tag,
"attrs": token.attrs,
"map": token.map,
"nesting": token.nesting,
"level": token.level,
"children": [_token_to_dict(child) for child in token.children]
if token.children
else None,
"content": token.content,
"markup": token.markup,
"info": token.info,
"meta": token.meta,
"block": token.block,
"hidden": token.hidden,
}
return {key: value for key, value in data.items() if value is not None}
def _blocks_and_headings(
tokens: list[dict[str, Any]], line_offset: int
) -> tuple[list[ContentBlock], list[Heading]]:
blocks: list[ContentBlock] = []
headings: list[Heading] = []
for index, token in enumerate(tokens):
token_type = token["type"]
if token_type == "heading_open":
inline = _next_inline(tokens, index)
line_start, line_end = _line_range(token, line_offset)
level = int(token.get("tag", "h1").lstrip("h") or "1")
text = inline.get("content", "") if inline else ""
heading = Heading(level=level, text=text, line=line_start or 1)
headings.append(heading)
blocks.append(
ContentBlock(
type="heading",
text=text,
line_start=line_start,
line_end=line_end,
heading_level=level,
)
)
elif token_type in {"paragraph_open", "bullet_list_open", "ordered_list_open", "blockquote_open", "fence", "code_block", "table_open"}:
line_start, line_end = _line_range(token, line_offset)
text = token.get("content", "")
if not text and token_type.endswith("_open"):
inline = _next_inline(tokens, index)
text = inline.get("content", "") if inline else ""
blocks.append(
ContentBlock(
type=_block_type(token_type),
text=text,
line_start=line_start,
line_end=line_end,
)
)
return blocks, headings
def _next_inline(tokens: list[dict[str, Any]], index: int) -> dict[str, Any] | None:
if index + 1 < len(tokens) and tokens[index + 1]["type"] == "inline":
return tokens[index + 1]
return None
def _line_range(token: dict[str, Any], line_offset: int) -> tuple[int | None, int | None]:
line_map = token.get("map")
if not line_map:
return None, None
return line_map[0] + line_offset + 1, line_map[1] + line_offset
def _block_type(token_type: str) -> str:
return {
"paragraph_open": "paragraph",
"bullet_list_open": "bullet_list",
"ordered_list_open": "ordered_list",
"blockquote_open": "blockquote",
"fence": "code",
"code_block": "code",
"table_open": "table",
}.get(token_type, token_type)
def _sections_from_blocks(
blocks: list[ContentBlock], headings: list[Heading]
) -> list[Section]:
sections: list[Section] = []
current: Section | None = None
heading_index = 0
for block in blocks:
if block.type == "heading":
heading = headings[heading_index]
heading_index += 1
current = Section(heading=heading, blocks=[])
sections.append(current)
elif current is not None:
current.blocks.append(block)
return sections