generated from coulomb/repo-seed
Parse Markdown files into a structured Python object
This commit is contained in:
21
src/markitect_tool/__init__.py
Normal file
21
src/markitect_tool/__init__.py
Normal file
@@ -0,0 +1,21 @@
|
||||
"""Structured markdown primitives for markitect-tool."""
|
||||
|
||||
from markitect_tool.core import (
|
||||
ContentBlock,
|
||||
Document,
|
||||
Heading,
|
||||
MarkdownParseError,
|
||||
Section,
|
||||
parse_markdown,
|
||||
parse_markdown_file,
|
||||
)
|
||||
|
||||
# Names that make up the package's public API; each one is imported from
# markitect_tool.core by the import statement earlier in this module.
__all__ = [
    "ContentBlock",
    "Document",
    "Heading",
    "MarkdownParseError",
    "Section",
    "parse_markdown",
    "parse_markdown_file",
]
|
||||
6
src/markitect_tool/__main__.py
Normal file
6
src/markitect_tool/__main__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
"""Run the `mkt` CLI with `python -m markitect_tool`."""
|
||||
|
||||
from markitect_tool.cli import main
|
||||
|
||||
|
||||
# Hand control to the CLI entry point imported from markitect_tool.cli.
main()
|
||||
5
src/markitect_tool/cli/__init__.py
Normal file
5
src/markitect_tool/cli/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
"""Command-line interface for markitect-tool."""
|
||||
|
||||
from markitect_tool.cli.main import main
|
||||
|
||||
# Only the CLI entry point is exported from this subpackage.
__all__ = ["main"]
|
||||
44
src/markitect_tool/cli/main.py
Normal file
44
src/markitect_tool/cli/main.py
Normal file
@@ -0,0 +1,44 @@
|
||||
"""`mkt` command entry point."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import click
|
||||
import yaml
|
||||
|
||||
from markitect_tool.core import parse_markdown_file
|
||||
|
||||
|
||||
# Root command group of the `mkt` CLI. Subcommands register themselves on it
# through `@main.command()`. The function body is intentionally just the
# docstring: click displays that text as the group's help, so keep it
# user-facing.
@click.group()
@click.version_option()
def main() -> None:
    """Markdown-native toolkit for structured knowledge artifacts."""
|
||||
|
||||
|
||||
def _json_default(value: object) -> str:
    """Encode values that the ``json`` module cannot serialize natively.

    YAML frontmatter is loaded with ``yaml.safe_load``, which turns values such
    as ``2024-01-01`` into ``datetime.date``/``datetime.datetime`` objects.
    Those expose ``isoformat()``, so they are emitted as ISO 8601 strings.

    Raises:
        TypeError: If ``value`` has no ``isoformat()`` method, mirroring what
            ``json.dumps`` itself raises for unsupported types.
    """
    isoformat = getattr(value, "isoformat", None)
    if callable(isoformat):
        return isoformat()
    raise TypeError(
        f"Object of type {type(value).__name__} is not JSON serializable"
    )


# NOTE: the docstring of `parse` is shown by click as the command's help text,
# so it is kept short and user-facing; implementation notes live in comments.
@main.command()
@click.argument("file", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["json", "yaml", "tree"], case_sensitive=False),
    default="json",
    show_default=True,
)
def parse(file: Path, output_format: str) -> None:
    """Parse a Markdown file into a structured representation."""

    # Imported here because this module only imports `parse_markdown_file`
    # from the core package at the top of the file.
    from markitect_tool.core import MarkdownParseError

    try:
        document = parse_markdown_file(file)
    except MarkdownParseError as exc:
        # Report malformed frontmatter as a normal CLI error instead of
        # letting the exception surface as a traceback.
        raise click.ClickException(f"{file}: {exc}") from exc

    if output_format == "tree":
        # One line per heading, prefixed with as many '#' as its level.
        for heading in document.headings:
            click.echo(f"{'#' * heading.level} {heading.text}")
        return

    data = document.to_dict()
    if output_format == "yaml":
        click.echo(yaml.safe_dump(data, sort_keys=False))
    else:
        # `default` keeps frontmatter dates/timestamps from crashing the dump.
        click.echo(
            json.dumps(data, indent=2, ensure_ascii=False, default=_json_default)
        )
|
||||
|
||||
|
||||
# Allow the module to be executed directly as a script in addition to being
# imported for its `main` group.
if __name__ == "__main__":
    main()
|
||||
14
src/markitect_tool/core/__init__.py
Normal file
14
src/markitect_tool/core/__init__.py
Normal file
@@ -0,0 +1,14 @@
|
||||
"""Core markdown parsing and document model."""
|
||||
|
||||
from markitect_tool.core.document import ContentBlock, Document, Heading, Section
|
||||
from markitect_tool.core.parser import MarkdownParseError, parse_markdown, parse_markdown_file
|
||||
|
||||
# Public names of the core package: the document model classes come from
# markitect_tool.core.document, the parsing helpers and the error type from
# markitect_tool.core.parser.
__all__ = [
    "ContentBlock",
    "Document",
    "Heading",
    "MarkdownParseError",
    "Section",
    "parse_markdown",
    "parse_markdown_file",
]
|
||||
72
src/markitect_tool/core/document.py
Normal file
72
src/markitect_tool/core/document.py
Normal file
@@ -0,0 +1,72 @@
|
||||
"""Structured document model for parsed Markdown."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import asdict, dataclass, field
|
||||
from typing import Any
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Heading:
    """Immutable record of a single heading and where it was found."""

    level: int  # heading depth (1 for an ``h1``, 2 for an ``h2``, ...)
    text: str  # heading text
    line: int  # source line the heading starts on

    def to_dict(self) -> dict[str, Any]:
        """Return the heading's fields as a plain dictionary."""
        return {"level": self.level, "text": self.text, "line": self.line}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ContentBlock:
    """Immutable description of one block-level element of a document."""

    type: str  # block kind, e.g. "heading", "paragraph", "code"
    text: str  # textual content associated with the block
    line_start: int | None = None  # first source line, when known
    line_end: int | None = None  # last source line, when known
    heading_level: int | None = None  # only set for heading blocks

    def to_dict(self) -> dict[str, Any]:
        """Return the block as a dictionary, leaving out fields that are ``None``."""
        payload: dict[str, Any] = {}
        for name, value in asdict(self).items():
            if value is None:
                continue
            payload[name] = value
        return payload
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Section:
    """A heading together with the content blocks grouped under it."""

    heading: Heading  # the heading that opens the section
    blocks: list[ContentBlock] = field(default_factory=list)  # blocks under it

    def to_dict(self) -> dict[str, Any]:
        """Return the section as a dictionary of its serialized heading and blocks."""
        heading_data = self.heading.to_dict()
        block_data = []
        for block in self.blocks:
            block_data.append(block.to_dict())
        return {"heading": heading_data, "blocks": block_data}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Document:
    """Everything extracted from one Markdown source, bundled together."""

    source_path: str | None  # origin of the text, or None when not given
    frontmatter: dict[str, Any]  # parsed frontmatter mapping
    body: str  # Markdown text without the frontmatter
    blocks: list[ContentBlock]  # content blocks found in the body
    headings: list[Heading]  # headings found in the body
    sections: list[Section]  # heading-led groupings of the blocks
    tokens: list[dict[str, Any]]  # serialized parser tokens

    def to_dict(self) -> dict[str, Any]:
        """Return the document as a dictionary, omitting entries that are ``None``."""
        candidates = (
            ("source_path", self.source_path),
            ("frontmatter", self.frontmatter),
            ("body", self.body),
            ("blocks", [item.to_dict() for item in self.blocks]),
            ("headings", [item.to_dict() for item in self.headings]),
            ("sections", [item.to_dict() for item in self.sections]),
            ("tokens", self.tokens),
        )
        return {name: value for name, value in candidates if value is not None}
|
||||
182
src/markitect_tool/core/parser.py
Normal file
182
src/markitect_tool/core/parser.py
Normal file
@@ -0,0 +1,182 @@
|
||||
"""Markdown parsing into a stable structured representation."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import yaml
|
||||
from markdown_it import MarkdownIt
|
||||
from markdown_it.token import Token
|
||||
|
||||
from markitect_tool.core.document import ContentBlock, Document, Heading, Section
|
||||
|
||||
|
||||
class MarkdownParseError(ValueError):
    """Signal that a document's metadata could not be parsed safely.

    Subclasses ``ValueError`` so callers may catch either type.
    """
|
||||
|
||||
|
||||
def parse_markdown_file(path: str | Path) -> Document:
    """Read a Markdown file from disk and parse it into a ``Document``.

    Args:
        path: Location of the Markdown file.

    Returns:
        The parsed document; its ``source_path`` is ``str(Path(path))``.

    Raises:
        OSError: If the file cannot be read.
        UnicodeDecodeError: If the file is not valid UTF-8.
        MarkdownParseError: If the frontmatter is invalid.
    """
    file_path = Path(path)
    # "utf-8-sig" decodes plain UTF-8 unchanged but drops a leading byte-order
    # mark. With plain "utf-8" the BOM stays in the text, which hides a
    # frontmatter block (the text no longer starts with "---") and corrupts
    # the first line of the body.
    text = file_path.read_text(encoding="utf-8-sig")
    return parse_markdown(text, source_path=str(file_path))
|
||||
|
||||
|
||||
def parse_markdown(markdown: str, source_path: str | None = None) -> Document:
    """Turn Markdown text into a ``Document``.

    The text is split into frontmatter and body, the body is tokenized, and
    blocks, headings and sections are derived from the tokens.
    """
    metadata, content, first_body_line = _split_frontmatter(markdown)
    token_dicts = _parse_tokens(content)
    block_list, heading_list = _blocks_and_headings(token_dicts, first_body_line)
    section_list = _sections_from_blocks(block_list, heading_list)
    return Document(
        source_path=source_path,
        frontmatter=metadata,
        body=content,
        blocks=block_list,
        headings=heading_list,
        sections=section_list,
        tokens=token_dicts,
    )
|
||||
|
||||
|
||||
def _split_frontmatter(markdown: str) -> tuple[dict[str, Any], str, int]:
|
||||
if not markdown.startswith("---\n"):
|
||||
return {}, markdown, 0
|
||||
|
||||
end = markdown.find("\n---", 4)
|
||||
if end == -1:
|
||||
return {}, markdown, 0
|
||||
|
||||
closing_end = markdown.find("\n", end + 4)
|
||||
if closing_end == -1:
|
||||
closing_end = len(markdown)
|
||||
else:
|
||||
closing_end += 1
|
||||
|
||||
raw_frontmatter = markdown[4:end]
|
||||
body = markdown[closing_end:]
|
||||
try:
|
||||
data = yaml.safe_load(raw_frontmatter) if raw_frontmatter.strip() else {}
|
||||
except yaml.YAMLError as exc:
|
||||
raise MarkdownParseError(f"Invalid YAML frontmatter: {exc}") from exc
|
||||
if data is None:
|
||||
data = {}
|
||||
if not isinstance(data, dict):
|
||||
raise MarkdownParseError("Frontmatter must be a mapping")
|
||||
body_line_offset = markdown[:closing_end].count("\n")
|
||||
return data, body, body_line_offset
|
||||
|
||||
|
||||
def _parse_tokens(markdown: str) -> list[dict[str, Any]]:
    """Tokenize Markdown text and return the tokens as plain dictionaries.

    Uses the "commonmark" preset with the "table" rule switched on.
    """
    engine = MarkdownIt("commonmark", {"tables": True})
    engine = engine.enable("table")
    serialized: list[dict[str, Any]] = []
    for token in engine.parse(markdown):
        serialized.append(_token_to_dict(token))
    return serialized
|
||||
|
||||
|
||||
def _token_to_dict(token: Token) -> dict[str, Any]:
|
||||
data = {
|
||||
"type": token.type,
|
||||
"tag": token.tag,
|
||||
"attrs": token.attrs,
|
||||
"map": token.map,
|
||||
"nesting": token.nesting,
|
||||
"level": token.level,
|
||||
"children": [_token_to_dict(child) for child in token.children]
|
||||
if token.children
|
||||
else None,
|
||||
"content": token.content,
|
||||
"markup": token.markup,
|
||||
"info": token.info,
|
||||
"meta": token.meta,
|
||||
"block": token.block,
|
||||
"hidden": token.hidden,
|
||||
}
|
||||
return {key: value for key, value in data.items() if value is not None}
|
||||
|
||||
|
||||
def _blocks_and_headings(
|
||||
tokens: list[dict[str, Any]], line_offset: int
|
||||
) -> tuple[list[ContentBlock], list[Heading]]:
|
||||
blocks: list[ContentBlock] = []
|
||||
headings: list[Heading] = []
|
||||
|
||||
for index, token in enumerate(tokens):
|
||||
token_type = token["type"]
|
||||
if token_type == "heading_open":
|
||||
inline = _next_inline(tokens, index)
|
||||
line_start, line_end = _line_range(token, line_offset)
|
||||
level = int(token.get("tag", "h1").lstrip("h") or "1")
|
||||
text = inline.get("content", "") if inline else ""
|
||||
heading = Heading(level=level, text=text, line=line_start or 1)
|
||||
headings.append(heading)
|
||||
blocks.append(
|
||||
ContentBlock(
|
||||
type="heading",
|
||||
text=text,
|
||||
line_start=line_start,
|
||||
line_end=line_end,
|
||||
heading_level=level,
|
||||
)
|
||||
)
|
||||
elif token_type in {"paragraph_open", "bullet_list_open", "ordered_list_open", "blockquote_open", "fence", "code_block", "table_open"}:
|
||||
line_start, line_end = _line_range(token, line_offset)
|
||||
text = token.get("content", "")
|
||||
if not text and token_type.endswith("_open"):
|
||||
inline = _next_inline(tokens, index)
|
||||
text = inline.get("content", "") if inline else ""
|
||||
blocks.append(
|
||||
ContentBlock(
|
||||
type=_block_type(token_type),
|
||||
text=text,
|
||||
line_start=line_start,
|
||||
line_end=line_end,
|
||||
)
|
||||
)
|
||||
|
||||
return blocks, headings
|
||||
|
||||
|
||||
def _next_inline(tokens: list[dict[str, Any]], index: int) -> dict[str, Any] | None:
|
||||
if index + 1 < len(tokens) and tokens[index + 1]["type"] == "inline":
|
||||
return tokens[index + 1]
|
||||
return None
|
||||
|
||||
|
||||
def _line_range(token: dict[str, Any], line_offset: int) -> tuple[int | None, int | None]:
|
||||
line_map = token.get("map")
|
||||
if not line_map:
|
||||
return None, None
|
||||
return line_map[0] + line_offset + 1, line_map[1] + line_offset
|
||||
|
||||
|
||||
def _block_type(token_type: str) -> str:
|
||||
return {
|
||||
"paragraph_open": "paragraph",
|
||||
"bullet_list_open": "bullet_list",
|
||||
"ordered_list_open": "ordered_list",
|
||||
"blockquote_open": "blockquote",
|
||||
"fence": "code",
|
||||
"code_block": "code",
|
||||
"table_open": "table",
|
||||
}.get(token_type, token_type)
|
||||
|
||||
|
||||
def _sections_from_blocks(
|
||||
blocks: list[ContentBlock], headings: list[Heading]
|
||||
) -> list[Section]:
|
||||
sections: list[Section] = []
|
||||
current: Section | None = None
|
||||
heading_index = 0
|
||||
|
||||
for block in blocks:
|
||||
if block.type == "heading":
|
||||
heading = headings[heading_index]
|
||||
heading_index += 1
|
||||
current = Section(heading=heading, blocks=[])
|
||||
sections.append(current)
|
||||
elif current is not None:
|
||||
current.blocks.append(block)
|
||||
|
||||
return sections
|
||||
Reference in New Issue
Block a user