diff --git a/README.md b/README.md index 43f8849..e243007 100644 --- a/README.md +++ b/README.md @@ -20,3 +20,17 @@ requirements documents in `wiki/`. The repo is registered with the Custodian State Hub as `markitect-tool` under the `markitect` domain. See `docs/state-hub-integration.md`. + +## Development + +Run the tests: + +```bash +python3 -m pytest +``` + +Try the parser CLI from a checkout: + +```bash +PYTHONPATH=src python3 -m markitect_tool parse README.md --format tree +``` diff --git a/docs/packaging-decision.md b/docs/packaging-decision.md new file mode 100644 index 0000000..8bce226 --- /dev/null +++ b/docs/packaging-decision.md @@ -0,0 +1,36 @@ +# Packaging Decision + +Date: 2026-05-03 + +## Decision + +`markitect-tool` starts as a Python 3.12+ package with: + +- Distribution name: `markitect-tool` +- Import package: `markitect_tool` +- CLI entry point: `mkt` +- Build backend: `setuptools` +- Test runner: `pytest` +- Source layout: `src/markitect_tool` + +## Initial Dependencies + +Core dependencies: + +- `markdown-it-py` +- `PyYAML` +- `click>=8.0` + +Optional extras: + +- `query`: `jsonpath-ng` +- `tables`: `tabulate` +- `llm`: `llm-connect` +- `dev`: `pytest` + +## Rationale + +This follows the WP-0002 dependency classification and keeps the first +implementation focused on deterministic markdown parsing and CLI access. The +package name avoids legacy `markitect.*` imports while the `mkt` entry point +matches the PRD. diff --git a/docs/state-hub-integration.md b/docs/state-hub-integration.md index 3009493..c6bc8ce 100644 --- a/docs/state-hub-integration.md +++ b/docs/state-hub-integration.md @@ -32,8 +32,10 @@ workplans/ ## Follow-Up -Once implementation dependencies exist, add an SBOM source and update State Hub -with the SBOM ingestion result. This seed repo currently has no package manifest. +SBOM source: `sbom-tools.yaml`. + +Initial SBOM ingest succeeded on 2026-05-03 with seven declared entries for the +core and optional dependencies. 
[project.optional-dependencies]
dev = [
    "pytest>=8",
]
query = [
    "jsonpath-ng>=1.5",
]
tables = [
    "tabulate>=0.9",
]
# Declared by name only, matching docs/packaging-decision.md. A direct
# `file:///home/<user>/...` reference resolves on a single machine and makes
# the extra uninstallable everywhere else. When working against a local
# checkout of llm-connect, install that checkout first (for example
# `pip install -e ../llm-connect`) and this requirement is then satisfied.
llm = [
    "llm-connect",
]
def _json_default(value: object) -> str:
    """Serialize frontmatter values the JSON encoder cannot handle natively.

    YAML frontmatter such as ``date: 2026-05-03`` is loaded as a
    ``datetime.date`` (or ``datetime.datetime``) object, which ``json.dumps``
    rejects. Those values are rendered as ISO 8601 strings instead.

    Raises:
        TypeError: For any other unsupported type, mirroring the encoder's
            own behavior so unexpected values are not silently stringified.
    """
    # Local import keeps the module's import block untouched.
    from datetime import date

    if isinstance(value, date):  # ``datetime`` is a subclass of ``date``
        return value.isoformat()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


@main.command()
@click.argument("file", type=click.Path(exists=True, dir_okay=False, path_type=Path))
@click.option(
    "--format",
    "output_format",
    type=click.Choice(["json", "yaml", "tree"], case_sensitive=False),
    default="json",
    show_default=True,
)
def parse(file: Path, output_format: str) -> None:
    """Parse a Markdown file into a structured representation."""
    # NOTE: the docstring above is what click shows in ``--help``; keep
    # implementation notes in comments so the help text stays short.
    from markitect_tool.core import MarkdownParseError

    try:
        document = parse_markdown_file(file)
    except (MarkdownParseError, UnicodeDecodeError) as exc:
        # Report bad frontmatter / undecodable files as a normal CLI error
        # ("Error: ..." with exit code 1) instead of a Python traceback.
        raise click.ClickException(f"{file}: {exc}") from exc

    if output_format == "tree":
        # The tree view needs only the headings, so skip building the dict.
        for heading in document.headings:
            click.echo(f"{'#' * heading.level} {heading.text}")
        return

    data = document.to_dict()
    if output_format == "yaml":
        # allow_unicode matches the JSON branch (ensure_ascii=False); the dump
        # already ends with a newline, so do not let echo append another one.
        click.echo(yaml.safe_dump(data, sort_keys=False, allow_unicode=True), nl=False)
    else:
        click.echo(
            json.dumps(data, indent=2, ensure_ascii=False, default=_json_default)
        )
def parse_markdown_file(path: str | Path) -> Document:
    """Parse a Markdown file into a structured document.

    Args:
        path: Location of the Markdown file, as a string or ``Path``.

    Returns:
        The parsed ``Document``; its ``source_path`` is ``str(Path(path))``.

    Raises:
        OSError: If the file cannot be read.
        UnicodeDecodeError: If the file is not valid UTF-8.
        MarkdownParseError: If the frontmatter is invalid (raised by
            ``parse_markdown``).
    """
    file_path = Path(path)
    # "utf-8-sig" reads plain UTF-8 and also drops a leading byte-order mark.
    # With plain "utf-8" the BOM stays in the text, so a file starting with
    # "\ufeff---\n" fails the frontmatter prefix check and its metadata is
    # silently parsed as body content.
    text = file_path.read_text(encoding="utf-8-sig")
    return parse_markdown(text, source_path=str(file_path))
def _split_frontmatter(markdown: str) -> tuple[dict[str, Any], str, int]:
    """Split a leading YAML frontmatter block from the Markdown body.

    Frontmatter is recognised only when the very first line is exactly
    ``---`` and a later line consists of ``---`` (trailing whitespace
    allowed). Both LF and CRLF line endings are accepted.

    Args:
        markdown: Full document text.

    Returns:
        A ``(frontmatter, body, body_line_offset)`` tuple. ``body`` is the
        text after the closing delimiter line and ``body_line_offset`` is the
        number of newline characters that precede it, so body-relative line
        numbers can be mapped back to the original text. When no complete
        frontmatter block is present the result is ``({}, markdown, 0)``.

    Raises:
        MarkdownParseError: If the frontmatter is not valid YAML or does not
            evaluate to a mapping.
    """
    first_break = markdown.find("\n")
    if first_break == -1 or markdown[:first_break].rstrip("\r") != "---":
        return {}, markdown, 0

    content_start = first_break + 1
    cursor = content_start
    # Scan line by line for the closing delimiter. Matching whole lines (not
    # the substring "\n---") means an empty block ("---\n---\n") is detected
    # and lines such as "---x" or "----------" are not mistaken for the end.
    while True:
        line_end = markdown.find("\n", cursor)
        line = markdown[cursor:] if line_end == -1 else markdown[cursor:line_end]
        if line.rstrip() == "---":
            break
        if line_end == -1:
            # Reached the end of the text without a closing delimiter: treat
            # the whole input as body, as before.
            return {}, markdown, 0
        cursor = line_end + 1

    raw_frontmatter = markdown[content_start:cursor]
    body_start = len(markdown) if line_end == -1 else line_end + 1
    body = markdown[body_start:]

    if raw_frontmatter.strip():
        try:
            data = yaml.safe_load(raw_frontmatter)
        except yaml.YAMLError as exc:
            raise MarkdownParseError(f"Invalid YAML frontmatter: {exc}") from exc
    else:
        data = {}
    if data is None:
        # e.g. a block that contains only YAML comments
        data = {}
    if not isinstance(data, dict):
        raise MarkdownParseError("Frontmatter must be a mapping")

    body_line_offset = markdown[:body_start].count("\n")
    return data, body, body_line_offset
def _line_range(token: dict[str, Any], line_offset: int) -> tuple[int | None, int | None]:
    """Translate a token's ``map`` entry into document line numbers.

    Args:
        token: Token dictionary; only its optional ``"map"`` entry is read.
        line_offset: Number of lines that precede the parsed body in the
            original text (the stripped frontmatter).

    Returns:
        ``(line_start, line_end)`` shifted by ``line_offset``, or
        ``(None, None)`` when the token has no (or an empty) ``map``.
    """
    span = token.get("map")
    if span:
        # The start gets +1 and the end does not; presumably the parser's map
        # is a zero-based [start, end) pair, giving a one-based inclusive range.
        first_line = span[0] + line_offset + 1
        last_line = span[1] + line_offset
        return first_line, last_line
    return None, None
None: + current.blocks.append(block) + + return sections diff --git a/tests/test_parse_contract.py b/tests/test_parse_contract.py new file mode 100644 index 0000000..5a20ae8 --- /dev/null +++ b/tests/test_parse_contract.py @@ -0,0 +1,89 @@ +from pathlib import Path + +import pytest +from click.testing import CliRunner + +from markitect_tool import MarkdownParseError, parse_markdown, parse_markdown_file +from markitect_tool.cli import main + + +def test_parse_markdown_preserves_headings_and_paragraphs(): + document = parse_markdown("# Heading\n\nThis is a paragraph.") + + assert document.frontmatter == {} + assert document.headings[0].level == 1 + assert document.headings[0].text == "Heading" + assert [block.type for block in document.blocks] == ["heading", "paragraph"] + assert document.sections[0].heading.text == "Heading" + assert document.sections[0].blocks[0].text == "This is a paragraph." + assert document.tokens[0]["type"] == "heading_open" + + +def test_parse_markdown_extracts_yaml_frontmatter(): + markdown = """--- +title: YAML Frontmatter Test Document +tags: + - yaml + - frontmatter +published: true +nested: + priority: high +--- + +# YAML Frontmatter Test Document + +Body text. 
+""" + + document = parse_markdown(markdown) + + assert document.frontmatter["title"] == "YAML Frontmatter Test Document" + assert document.frontmatter["tags"] == ["yaml", "frontmatter"] + assert document.frontmatter["published"] is True + assert document.frontmatter["nested"]["priority"] == "high" + assert document.headings[0].line == 11 + assert document.body.lstrip().startswith("# YAML Frontmatter Test Document") + + +def test_parse_markdown_without_frontmatter_is_graceful(): + document = parse_markdown("# Document Without Frontmatter\n\nText.") + + assert document.frontmatter == {} + assert document.headings[0].text == "Document Without Frontmatter" + + +def test_parse_markdown_rejects_non_mapping_frontmatter(): + with pytest.raises(MarkdownParseError, match="Frontmatter must be a mapping"): + parse_markdown("---\n- nope\n---\n\n# Bad") + + +def test_parse_markdown_file_records_source_path(tmp_path: Path): + source = tmp_path / "doc.md" + source.write_text("# Test Document\n\nBody", encoding="utf-8") + + document = parse_markdown_file(source) + + assert document.source_path == str(source) + assert document.headings[0].text == "Test Document" + + +def test_mkt_parse_outputs_json(tmp_path: Path): + source = tmp_path / "doc.md" + source.write_text("# Test Document\n\nBody", encoding="utf-8") + + result = CliRunner().invoke(main, ["parse", str(source)]) + + assert result.exit_code == 0 + assert '"headings"' in result.output + assert "Test Document" in result.output + + +def test_mkt_parse_outputs_tree(tmp_path: Path): + source = tmp_path / "doc.md" + source.write_text("# One\n\n## Two\n", encoding="utf-8") + + result = CliRunner().invoke(main, ["parse", str(source), "--format", "tree"]) + + assert result.exit_code == 0 + assert "# One" in result.output + assert "## Two" in result.output diff --git a/workplans/MKTT-WP-0001-repo-foundation.md b/workplans/MKTT-WP-0001-repo-foundation.md index d64ac07..ea26aa8 100644 --- a/workplans/MKTT-WP-0001-repo-foundation.md +++ 
b/workplans/MKTT-WP-0001-repo-foundation.md @@ -58,7 +58,7 @@ migration assessment, and implementation. ```task id: MKTT-WP-0001-T004 -status: todo +status: done priority: medium state_hub_task_id: "c15f8492-93d0-43aa-ba12-0d4aaff97c03" ``` @@ -67,11 +67,13 @@ Choose package/module names, Python version target, dependency manager, and test runner. Keep the decision lightweight and aligned with the future `mkt` CLI entry point. +Output: `docs/packaging-decision.md`. + ## P1.5 - Add SBOM source once manifests exist ```task id: MKTT-WP-0001-T005 -status: blocked +status: done priority: medium state_hub_task_id: "e77a5e46-aaa2-4717-922f-a871fa2fd1cc" ``` @@ -79,4 +81,4 @@ state_hub_task_id: "e77a5e46-aaa2-4717-922f-a871fa2fd1cc" After packaging files are introduced, generate or identify the SBOM source and update State Hub registration metadata. -Blocked because the repository has no implementation package manifest yet. +Output: `sbom-tools.yaml`; initial State Hub ingest succeeded on 2026-05-03. diff --git a/workplans/MKTT-WP-0003-core-toolkit-implementation.md b/workplans/MKTT-WP-0003-core-toolkit-implementation.md index c771be6..da3a0f6 100644 --- a/workplans/MKTT-WP-0003-core-toolkit-implementation.md +++ b/workplans/MKTT-WP-0003-core-toolkit-implementation.md @@ -22,7 +22,7 @@ contract and the `mkt` CLI. ```task id: MKTT-WP-0003-T001 -status: todo +status: done priority: high state_hub_task_id: "9d9501fe-6809-4b4f-bda6-ec5e5952ddc7" ``` @@ -30,11 +30,13 @@ state_hub_task_id: "9d9501fe-6809-4b4f-bda6-ec5e5952ddc7" Create project metadata, package layout, test structure, and a minimal CLI entry point that can be installed or run locally. +Output: `pyproject.toml`, `src/markitect_tool/`, `tests/`. 
+ ## P3.2 - Implement structured markdown parse contract ```task id: MKTT-WP-0003-T002 -status: todo +status: done priority: high state_hub_task_id: "7dead033-e249-46b0-9eb3-908ae231a987" ``` @@ -43,6 +45,9 @@ Implement FR-001 and FR-002: parse markdown files, preserve headings, frontmatter, sections, and content blocks, and expose structured output via API and CLI. +Initial implementation complete for Markdown files, YAML frontmatter, headings, +sections, content blocks, parser tokens, API access, and `mkt parse`. + ## P3.3 - Implement schema load and validation ```task @@ -108,10 +113,12 @@ Implement FR-070 and FR-071 after the parse/schema contracts are stable. ```task id: MKTT-WP-0003-T008 -status: todo +status: done priority: medium state_hub_task_id: "94067c7e-e68b-45be-a1d6-90547eb15422" ``` Resolve `TD-MKTT-001` by adding the implementation scaffold: package metadata, module layout, test runner, and `mkt` CLI entry point. + +Resolved by the initial package scaffold.