Extract section-tree algorithm from SchemaGenerator into standalone core/section_tree.py and build markitect/infospace/ package with EntityMeta dataclass and parse_entity_file/parse_entity_directory. Foundation for schema compliance, coverage, and granularity metrics. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
125 lines
4.0 KiB
Python
125 lines
4.0 KiB
Python
"""
|
||
Standalone section-tree utilities extracted from SchemaGenerator.
|
||
|
||
Builds a hierarchical section tree from flat markdown-it AST tokens and
|
||
provides helpers for navigating heading structure and extracting text.
|
||
These functions are used by both the schema generator and the infospace
|
||
entity parser.
|
||
"""
|
||
|
||
import re
|
||
from typing import Any, Dict, List, Optional
|
||
|
||
|
||
def slugify(text: str) -> str:
    """Convert heading or label text to a valid slug / JSON property key.

    The text is lower-cased, German umlauts and sharp s are transliterated
    (``ä`` -> ``ae``, ``ö`` -> ``oe``, ``ü`` -> ``ue``, ``ß`` -> ``ss``),
    every run of characters outside ``a-z0-9`` is collapsed into a single
    underscore, and leading/trailing underscores are stripped.

    Returns ``'feld'`` when nothing usable remains (empty or
    punctuation-only input), so the result is never an empty key.
    """
    # Lower-case *before* transliterating: a single lowercase table then
    # covers both cases, including capital sharp s (U+1E9E), which only
    # turns into 'ß' once lower() has been applied.
    transliteration = str.maketrans({
        'ä': 'ae', 'ö': 'oe', 'ü': 'ue', 'ß': 'ss',
    })
    slug = text.lower().translate(transliteration)
    slug = re.sub(r'[^a-z0-9]+', '_', slug).strip('_')
    return slug or 'feld'
|
||
|
||
|
||
def extract_heading_level(tag: str) -> int:
    """Return the numeric level encoded in a heading tag such as ``h2``.

    Any tag that is not exactly ``h`` followed by one character that
    ``int()`` accepts falls back to level ``1``.
    """
    fallback_level = 1

    # Guard: only two-character tags beginning with 'h' are candidates.
    if not tag.startswith('h') or len(tag) != 2:
        return fallback_level

    try:
        return int(tag[1])
    except ValueError:
        return fallback_level
|
||
|
||
|
||
def extract_heading_content(tokens: List[Dict[str, Any]], start_index: int) -> str:
    """Find the heading text that belongs to a ``heading_open`` token.

    Looks at up to three tokens beginning at ``start_index`` (bounded by
    the end of the list) and returns the ``content`` of the first one
    whose type is ``inline``. Returns an empty string when no inline
    token is found in that window or the token has no ``content``.
    """
    window_end = min(start_index + 3, len(tokens))
    inline_contents = (
        tokens[pos].get('content', '')
        for pos in range(start_index, window_end)
        if tokens[pos].get('type') == 'inline'
    )
    return next(inline_contents, '')
|
||
|
||
|
||
def build_section_tree(
    tokens: List[Dict[str, Any]], max_depth: Optional[int] = None
) -> Dict[str, Any]:
    """
    Turn a flat markdown-it token list into a nested tree of sections.

    The returned root node holds the top-level sections in its
    ``children`` list. Every node is a dict with these keys:

    - ``heading`` – heading text (``None`` on the root)
    - ``level`` – heading depth (``0`` on the root)
    - ``slug`` – slugified heading text (empty string on the root)
    - ``content_tokens`` – non-heading tokens that belong to the section
    - ``children`` – nested sub-sections

    When ``max_depth`` is given, headings deeper than that level do not
    open a section of their own: their heading tokens are dropped and
    the tokens that follow are attached to the enclosing section.
    """
    root: Dict[str, Any] = {
        'heading': None, 'level': 0, 'slug': '',
        'content_tokens': [], 'children': []
    }
    # Chain of currently open sections, outermost (root) first.
    open_sections = [root]

    total = len(tokens)
    pos = 0
    while pos < total:
        current = tokens[pos]

        # Anything that is not a heading opener is content of the
        # innermost open section.
        if current.get('type') != 'heading_open':
            open_sections[-1]['content_tokens'].append(current)
            pos += 1
            continue

        depth = extract_heading_level(current.get('tag', ''))
        title = extract_heading_content(tokens, pos)

        if max_depth is None or depth <= max_depth:
            node: Dict[str, Any] = {
                'heading': title,
                'level': depth,
                'slug': slugify(title),
                'content_tokens': [],
                'children': []
            }

            # Close sections at the same or a deeper level so the new
            # node is attached to its nearest shallower ancestor.
            while len(open_sections) > 1 and open_sections[-1]['level'] >= depth:
                open_sections.pop()

            open_sections[-1]['children'].append(node)
            open_sections.append(node)

        # Whether or not a section was opened, the heading's own tokens
        # are consumed: advance to the matching heading_close and step
        # past it.
        pos += 1
        while pos < total and tokens[pos].get('type') != 'heading_close':
            pos += 1
        pos += 1

    return root
|
||
|
||
|
||
def extract_section_text(section: Dict[str, Any]) -> str:
    """
    Collect the plain text of a section node as a single string.

    Only tokens of type ``inline`` in the section's ``content_tokens``
    contribute; each one adds its ``content`` field (an empty string if
    the field is missing). The pieces are joined with newlines. Child
    sections are not visited, and a node without ``content_tokens``
    yields an empty string.
    """
    inline_texts = [
        tok.get('content', '')
        for tok in section.get('content_tokens', [])
        if tok.get('type') == 'inline'
    ]
    return '\n'.join(inline_texts)
|