diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..77fcc507
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,73 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Virtual environments
+venv/
+.env/
+.venv/
+env/
+ENV/
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+*.manifest
+*.spec
+
+# Unit test / coverage reports
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+*.hypothesis/
+.pytest_cache/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# PEP 582 (used by e.g. PDM)
+__pypackages__/
+
+# PyCharm / VS Code settings
+.idea/
+.vscode/
+
+# Local config
+*.env
+*.ini
+*.toml
+*.cfg
+
+# Logs
+*.log
+
+# Mac/Linux/Windows system files
+.DS_Store
+Thumbs.db
+
diff --git a/markitect/__init__.py b/markitect/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/markitect/parser.py b/markitect/parser.py
new file mode 100644
index 00000000..116720db
--- /dev/null
+++ b/markitect/parser.py
@@ -0,0 +1,43 @@
+from markdown_it import MarkdownIt
+
+
+def _token_to_dict(token) -> dict:
+    """Convert one markdown-it token into a plain dict, recursing into children.
+
+    Attributes whose value is None are omitted; ``children`` is also omitted
+    when the token has no child tokens.
+    """
+    children = None
+    if token.children:
+        children = [_token_to_dict(child) for child in token.children]
+    fields = {
+        'type': token.type,
+        'tag': token.tag,
+        'attrs': token.attrs,
+        'map': token.map,
+        'nesting': token.nesting,
+        'level': token.level,
+        'children': children,
+        'content': token.content,
+        'markup': token.markup,
+        'info': token.info,
+        'meta': token.meta,
+        'block': token.block,
+        'hidden': token.hidden,
+    }
+    # Drop None-valued entries so the output only carries populated attributes.
+    return {key: value for key, value in fields.items() if value is not None}
+
+
+def parse_markdown_to_ast(md_content: str) -> list:
+    """Parse Markdown text into a list of plain token dicts.
+
+    Args:
+        md_content: Markdown source text.
+
+    Returns:
+        One dict per token returned by ``MarkdownIt().parse``, in order, with
+        child tokens nested under the ``'children'`` key.
+    """
+    tokens = MarkdownIt().parse(md_content)
+    return [_token_to_dict(token) for token in tokens]
diff --git a/tests/test_parser.py b/tests/test_parser.py
new file mode 100644
index 00000000..18ab44cc
--- /dev/null
+++ b/tests/test_parser.py
@@ -0,0 +1,121 @@
+import pytest
+from markitect.parser import parse_markdown_to_ast
+
+def test_parse_basic_markdown():
+    """A heading followed by a paragraph parses into the expected token dicts."""
+    md_content = "# Heading\nThis is a paragraph."
+    expected_ast = [
+        {
+            'type': 'heading_open',
+            'tag': 'h1',
+            'attrs': {},
+            'map': [0, 1],
+            'nesting': 1,
+            'level': 0,
+            'content': '',
+            'markup': '#',
+            'info': '',
+            'meta': {},
+            'block': True,
+            'hidden': False
+        },
+        {
+            'type': 'inline',
+            'tag': '',
+            'attrs': {},
+            'map': [0, 1],
+            'nesting': 0,
+            'level': 1,
+            'children': [
+                {
+                    'type': 'text',
+                    'tag': '',
+                    'attrs': {},
+                    'nesting': 0,
+                    'level': 0,
+                    'content': 'Heading',
+                    'markup': '',
+                    'info': '',
+                    'meta': {},
+                    'block': False,
+                    'hidden': False
+                }
+            ],
+            'content': 'Heading',
+            'markup': '',
+            'info': '',
+            'meta': {},
+            'block': True,
+            'hidden': False
+        },
+        {
+            'type': 'heading_close',
+            'tag': 'h1',
+            'attrs': {},
+            'nesting': -1,
+            'level': 0,
+            'content': '',
+            'markup': '#',
+            'info': '',
+            'meta': {},
+            'block': True,
+            'hidden': False
+        },
+        {
+            'type': 'paragraph_open',
+            'tag': 'p',
+            'attrs': {},
+            'map': [1, 2],
+            'nesting': 1,
+            'level': 0,
+            'content': '',
+            'markup': '',
+            'info': '',
+            'meta': {},
+            'block': True,
+            'hidden': False
+        },
+        {
+            'type': 'inline',
+            'tag': '',
+            'attrs': {},
+            'map': [1, 2],
+            'nesting': 0,
+            'level': 1,
+            'children': [
+                {
+                    'type': 'text',
+                    'tag': '',
+                    'attrs': {},
+                    'nesting': 0,
+                    'level': 0,
+                    'content': 'This is a paragraph.',
+                    'markup': '',
+                    'info': '',
+                    'meta': {},
+                    'block': False,
+                    'hidden': False
+                }
+            ],
+            'content': 'This is a paragraph.',
+            'markup': '',
+            'info': '',
+            'meta': {},
+            'block': True,
+            'hidden': False
+        },
+        {
+            'type': 'paragraph_close',
+            'tag': 'p',
+            'attrs': {},
+            'nesting': -1,
+            'level': 0,
+            'content': '',
+            'markup': '',
+            'info': '',
+            'meta': {},
+            'block': True,
+            'hidden': False
+        }
+    ]
+    assert parse_markdown_to_ast(md_content) == expected_ast