Extract section-tree algorithm from SchemaGenerator into standalone core/section_tree.py and build markitect/infospace/ package with EntityMeta dataclass and parse_entity_file/parse_entity_directory. Foundation for schema compliance, coverage, and granularity metrics. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
138 lines
4.3 KiB
Python
138 lines
4.3 KiB
Python
"""Tests for markitect.core.section_tree."""
|
|
|
|
from markitect.core.parser import parse_markdown_to_ast
|
|
from markitect.core.section_tree import (
|
|
build_section_tree,
|
|
extract_heading_content,
|
|
extract_heading_level,
|
|
extract_section_text,
|
|
slugify,
|
|
)
|
|
|
|
|
|
class TestSlugify:
    """Expectations for ``slugify``: casing, transliteration and separators."""

    def test_simple_text(self):
        """Words are lower-cased and the space becomes an underscore."""
        assert slugify("Hello World") == "hello_world"

    def test_german_umlauts(self):
        """Umlauts are expected to be transliterated (Ä -> ae, Ü -> ue)."""
        assert slugify("Ärger mit Über") == "aerger_mit_ueber"

    def test_special_characters(self):
        """An apostrophe is expected to turn into an underscore."""
        assert slugify("Smith's Original Wording") == "smith_s_original_wording"

    def test_empty_string(self):
        """Empty input is expected to yield the fallback slug ``"feld"``."""
        assert slugify("") == "feld"

    def test_trailing_underscores_stripped(self):
        """Separators at either edge must not leave underscores behind."""
        assert slugify("--hello--") == "hello"

    def test_multiple_spaces(self):
        """A run of several spaces is expected to collapse to one underscore."""
        # Feed an actual run of spaces: with a single space this case would
        # merely repeat test_simple_text and never exercise the collapsing.
        assert slugify("a   b") == "a_b"
|
|
|
|
|
|
class TestExtractHeadingLevel:
    """Expectations for ``extract_heading_level`` on heading and non-heading tags."""

    def test_h1(self):
        """The tag ``h1`` maps to level 1."""
        level = extract_heading_level("h1")
        assert level == 1

    def test_h6(self):
        """The tag ``h6`` maps to level 6."""
        level = extract_heading_level("h6")
        assert level == 6

    def test_invalid_tag(self):
        """A non-heading tag is expected to fall back to level 1."""
        level = extract_heading_level("p")
        assert level == 1

    def test_empty(self):
        """An empty tag string is expected to fall back to level 1."""
        level = extract_heading_level("")
        assert level == 1
|
|
|
|
|
|
class TestExtractHeadingContent:
    """Expectations for ``extract_heading_content`` on hand-built token lists."""

    def test_finds_inline_token(self):
        """The inline token's content after the heading opener is returned."""
        opener = {"type": "heading_open", "tag": "h1"}
        inline = {"type": "inline", "content": "Hello"}
        closer = {"type": "heading_close", "tag": "h1"}
        stream = [opener, inline, closer]

        found = extract_heading_content(stream, 0)

        assert found == "Hello"

    def test_no_inline(self):
        """Without an inline token the result is expected to be an empty string."""
        opener = {"type": "heading_open", "tag": "h1"}
        closer = {"type": "heading_close", "tag": "h1"}
        stream = [opener, closer]

        found = extract_heading_content(stream, 0)

        assert found == ""
|
|
|
|
|
|
class TestBuildSectionTree:
    """Expectations for ``build_section_tree`` on parsed Markdown documents."""

    @staticmethod
    def _tree_from(markdown, **options):
        """Parse *markdown* and build its section tree, forwarding *options*."""
        parsed = parse_markdown_to_ast(markdown)
        return build_section_tree(parsed, **options)

    def test_single_heading(self):
        """One H1 yields a level-0 root with a single level-1 child."""
        root = self._tree_from("# Title\n\nSome text.")

        assert root["level"] == 0
        assert len(root["children"]) == 1

        only_child = root["children"][0]
        assert only_child["heading"] == "Title"
        assert only_child["level"] == 1

    def test_nested_headings(self):
        """Two H2 headings are attached, in order, under the preceding H1."""
        root = self._tree_from("# Top\n\n## Sub\n\ntext\n\n## Sub2\n\nmore")

        top_section = root["children"][0]
        assert top_section["heading"] == "Top"
        assert len(top_section["children"]) == 2

        first_sub, second_sub = top_section["children"][0], top_section["children"][1]
        assert first_sub["heading"] == "Sub"
        assert second_sub["heading"] == "Sub2"

    def test_max_depth(self):
        """With ``max_depth=2`` the H2 section gets no children."""
        root = self._tree_from("# Top\n\n## Sub\n\n### Deep\n\ntext", max_depth=2)

        sub_section = root["children"][0]["children"][0]
        # The H3 lies below the depth limit, so it must not appear in the tree.
        assert len(sub_section["children"]) == 0

    def test_content_tokens_captured(self):
        """A section's paragraph shows up as one inline content token."""
        root = self._tree_from("# Title\n\nParagraph text here.")

        title_section = root["children"][0]
        inline_only = []
        for token in title_section["content_tokens"]:
            if token.get("type") == "inline":
                inline_only.append(token)

        assert len(inline_only) == 1
        assert "Paragraph text here" in inline_only[0]["content"]

    def test_slug_assigned(self):
        """Each section carries a slug derived from its heading text."""
        root = self._tree_from("# Economic Domain\n\ntext")

        assert root["children"][0]["slug"] == "economic_domain"

    def test_empty_document(self):
        """An empty document yields a root without children."""
        root = self._tree_from("")

        assert root["children"] == []
|
|
|
|
|
|
class TestExtractSectionText:
    """Expectations for ``extract_section_text`` on section dictionaries."""

    @staticmethod
    def _first_section_text(markdown):
        """Return the extracted text of the first top-level section of *markdown*."""
        root = build_section_tree(parse_markdown_to_ast(markdown))
        return extract_section_text(root["children"][0])

    def test_simple_paragraph(self):
        """A single paragraph is returned verbatim."""
        extracted = self._first_section_text("# Title\n\nHello world.")

        assert extracted == "Hello world."

    def test_multiple_paragraphs(self):
        """Both paragraphs of a section appear in the extracted text."""
        extracted = self._first_section_text(
            "# Title\n\nFirst paragraph.\n\nSecond paragraph."
        )

        assert "First paragraph." in extracted
        assert "Second paragraph." in extracted

    def test_empty_section(self):
        """A section without content tokens yields an empty string."""
        bare_section = {"content_tokens": []}

        extracted = extract_section_text(bare_section)

        assert extracted == ""
|