Files
markitect-main/markitect/core/section_tree.py
tegwick 03c6c5e8de feat(infospace): add entity metadata parser (S1.1)
Extract section-tree algorithm from SchemaGenerator into standalone
core/section_tree.py and build markitect/infospace/ package with
EntityMeta dataclass and parse_entity_file/parse_entity_directory.
Foundation for schema compliance, coverage, and granularity metrics.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-19 00:27:45 +01:00

125 lines
4.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Standalone section-tree utilities extracted from SchemaGenerator.
Builds a hierarchical section tree from flat markdown-it AST tokens and
provides helpers for navigating heading structure and extracting text.
These functions are used by both the schema generator and the infospace
entity parser.
"""
import re
from typing import Any, Dict, List, Optional
def slugify(text: str) -> str:
    """Turn heading or label text into a slug usable as a JSON property key.

    German umlauts and ``ß`` are transliterated to ASCII digraphs, the
    result is lowercased, every run of characters outside ``a-z0-9`` is
    collapsed into a single underscore, and leading/trailing underscores
    are removed. If nothing remains, the placeholder ``'feld'`` is returned
    so the slug is never empty.
    """
    # Single-character keys only, so one translate pass is equivalent to
    # applying the replacements one after another.
    umlaut_table = str.maketrans({
        'ä': 'ae', 'ö': 'oe', 'ü': 'ue',
        'Ä': 'Ae', 'Ö': 'Oe', 'Ü': 'Ue', 'ß': 'ss',
    })
    lowered = text.translate(umlaut_table).lower()
    collapsed = re.sub(r'[^a-z0-9]+', '_', lowered).strip('_')
    if not collapsed:
        return 'feld'
    return collapsed
def extract_heading_level(tag: str) -> int:
    """Return the numeric level encoded in a heading tag such as ``h2``.

    The tag must be exactly two characters long and start with ``h``; the
    second character is converted with ``int``. Any tag that does not fit
    that shape, or whose second character is not convertible, yields the
    fallback level ``1``.
    """
    # Guard clause: anything that is not "h" + one character is not parsed.
    if len(tag) != 2 or not tag.startswith('h'):
        return 1
    try:
        return int(tag[1])
    except ValueError:
        return 1
def extract_heading_content(tokens: List[Dict[str, Any]], start_index: int) -> str:
    """Return the text of the first ``inline`` token near ``start_index``.

    Looks at up to three tokens beginning at ``start_index`` (normally the
    position of a ``heading_open`` token) and returns the ``content`` of
    the first one whose ``type`` is ``inline``. An inline token without a
    ``content`` key, or no inline token within the window, gives ``''``.
    """
    window_end = min(start_index + 3, len(tokens))
    window = (tokens[pos] for pos in range(start_index, window_end))
    return next(
        (
            candidate.get('content', '')
            for candidate in window
            if candidate.get('type') == 'inline'
        ),
        '',
    )
def build_section_tree(
    tokens: List[Dict[str, Any]], max_depth: Optional[int] = None
) -> Dict[str, Any]:
    """
    Turn a flat markdown-it token list into a nested tree of sections.

    The returned root node holds the top-level sections in ``children``.
    Every node (root included) is a dict with these keys:

    - ``heading``         heading text (``None`` for the root)
    - ``level``           heading depth (``0`` for the root)
    - ``slug``            slugified heading (``''`` for the root)
    - ``content_tokens``  non-heading tokens belonging to this section
    - ``children``        nested sub-sections

    When ``max_depth`` is given, headings deeper than ``max_depth`` do not
    create nodes: their ``heading_open`` … ``heading_close`` tokens are
    dropped and the tokens that follow are attached to the section that
    is currently open.
    """
    root: Dict[str, Any] = {
        'heading': None, 'level': 0, 'slug': '',
        'content_tokens': [], 'children': []
    }
    # Chain of currently open sections, outermost first; root never leaves.
    open_sections = [root]
    total = len(tokens)
    pos = 0
    while pos < total:
        current = tokens[pos]

        # Anything that is not a heading start belongs to the innermost
        # open section.
        if current.get('type') != 'heading_open':
            open_sections[-1]['content_tokens'].append(current)
            pos += 1
            continue

        depth = extract_heading_level(current.get('tag', ''))
        title = extract_heading_content(tokens, pos)
        too_deep = max_depth is not None and depth > max_depth

        if not too_deep:
            node: Dict[str, Any] = {
                'heading': title,
                'level': depth,
                'slug': slugify(title),
                'content_tokens': [],
                'children': []
            }
            # Close every open section at the same or a deeper level so the
            # new node is attached to its nearest shallower ancestor.
            while len(open_sections) > 1 and open_sections[-1]['level'] >= depth:
                open_sections.pop()
            open_sections[-1]['children'].append(node)
            open_sections.append(node)

        # In both cases, advance to the matching heading_close and step
        # past it so the heading's own tokens never count as content.
        pos += 1
        while pos < total and tokens[pos].get('type') != 'heading_close':
            pos += 1
        pos += 1
    return root
def extract_section_text(section: Dict[str, Any]) -> str:
    """
    Return the plain text content of a section node.

    Collects the ``content`` field of every token of type ``inline`` in
    the section's ``content_tokens``, in order, and joins the pieces with
    newlines. Tokens of any other type are ignored, and the section's
    ``children`` are not visited. An inline token without a ``content``
    key contributes an empty string. Returns ``''`` when the section has
    no ``content_tokens`` entry or no inline tokens.
    """
    return '\n'.join(
        token.get('content', '')
        for token in section.get('content_tokens', [])
        if token.get('type') == 'inline'
    )