"""
Standalone section-tree utilities extracted from SchemaGenerator.

Builds a hierarchical section tree from flat markdown-it AST tokens and
provides helpers for navigating heading structure and extracting text.

These functions are used by both the schema generator and the infospace
entity parser.
"""

import re
from typing import Any, Dict, List, Optional

# German umlauts and sharp s mapped to their ASCII transliterations. Built once
# at import time; none of the replacement strings contains a key character, so
# a single translate() pass equals applying the replacements one after another.
_TRANSLITERATION = str.maketrans({
    'ä': 'ae', 'ö': 'oe', 'ü': 'ue',
    'Ä': 'Ae', 'Ö': 'Oe', 'Ü': 'Ue',
    'ß': 'ss',
})

# Any run of characters that is not a lowercase ASCII letter or a digit.
_NON_SLUG_CHARS = re.compile(r'[^a-z0-9]+')


def slugify(text: str) -> str:
    """
    Convert heading or label text to a valid slug / JSON property key.

    The text is transliterated (ä -> ae, ß -> ss, ...), lowercased, every run
    of characters outside ``[a-z0-9]`` is collapsed to a single underscore,
    and leading/trailing underscores are removed.

    Returns the fallback key ``'feld'`` when nothing usable remains (empty
    input or input consisting only of non-alphanumeric characters).
    """
    # Transliterate before lowercasing so the order of operations matches the
    # mapping table, which lists upper- and lowercase umlauts separately.
    slug = text.translate(_TRANSLITERATION).lower()
    slug = _NON_SLUG_CHARS.sub('_', slug).strip('_')
    return slug or 'feld'


def extract_heading_level(tag: str) -> int:
    """
    Extract the heading level from an HTML tag string (h1, h2, ...).

    Returns the digit following ``h`` for two-character tags, and ``1`` for
    anything that does not match that shape.
    """
    if tag.startswith('h') and len(tag) == 2:
        try:
            return int(tag[1])
        except ValueError:
            # Second character is not a digit; fall through to the default.
            pass
    return 1


def extract_heading_content(tokens: List[Dict[str, Any]], start_index: int) -> str:
    """
    Return the inline text content following a ``heading_open`` token.

    Only the token at ``start_index`` and the two tokens after it are
    inspected; the ``content`` of the first ``inline`` token found is
    returned. Returns ``''`` when no inline token is found in that window or
    when its content is missing or ``None``.
    """
    for i in range(start_index, min(start_index + 3, len(tokens))):
        token = tokens[i]
        if token.get('type') == 'inline':
            # ``or ''`` keeps the declared ``str`` return type even when the
            # key is present but holds None.
            return token.get('content') or ''
    return ''


def build_section_tree(
    tokens: List[Dict[str, Any]],
    max_depth: Optional[int] = None
) -> Dict[str, Any]:
    """
    Build a hierarchical section tree from a flat markdown-it token list.

    Returns a root node whose ``children`` list contains the top-level
    sections. Each node carries:

    - ``heading`` – heading text (``None`` for the root)
    - ``level`` – heading depth (``0`` for the root)
    - ``slug`` – slugified heading
    - ``content_tokens`` – non-heading tokens belonging to this section
    - ``children`` – nested sub-sections

    When ``max_depth`` is given, headings deeper than ``max_depth`` do not
    create nodes: their heading tokens are dropped and the tokens that follow
    are added to the section that is current at that point.

    Tokens that precede the first heading are collected in the root node's
    ``content_tokens``. If a ``heading_open`` has no matching
    ``heading_close``, all remaining tokens are consumed as part of that
    heading.
    """
    root: Dict[str, Any] = {
        'heading': None,
        'level': 0,
        'slug': '',
        'content_tokens': [],
        'children': []
    }

    # Chain of currently open sections, outermost first; stack[-1] receives
    # content tokens and is the candidate parent for the next heading.
    stack = [root]
    i = 0

    while i < len(tokens):
        token = tokens[i]

        if token.get('type') == 'heading_open':
            # A missing or None tag falls back to level 1.
            level = extract_heading_level(token.get('tag') or '')
            heading_text = extract_heading_content(tokens, i)

            if max_depth is not None and level > max_depth:
                # Skip this heading and its close token, but keep content
                i += 1
                while i < len(tokens) and tokens[i].get('type') != 'heading_close':
                    i += 1
                i += 1
                continue

            section: Dict[str, Any] = {
                'heading': heading_text,
                'level': level,
                'slug': slugify(heading_text),
                'content_tokens': [],
                'children': []
            }

            # Pop stack until we find the parent (level < current); the root
            # is never popped.
            while len(stack) > 1 and stack[-1]['level'] >= level:
                stack.pop()

            stack[-1]['children'].append(section)
            stack.append(section)

            # Advance to heading_close; the shared increment below steps
            # past it.
            i += 1
            while i < len(tokens) and tokens[i].get('type') != 'heading_close':
                i += 1
        else:
            # Add content token to current section
            stack[-1]['content_tokens'].append(token)

        i += 1

    return root


def extract_section_text(section: Dict[str, Any]) -> str:
    """
    Return the plain text content of a section node.

    Collects the ``content`` field of every ``inline`` token in the section's
    own ``content_tokens`` and joins the pieces with newlines. Tokens of other
    types are ignored, and the section's ``children`` are not visited. A
    missing or ``None`` content contributes an empty string.
    """
    parts: List[str] = [
        token.get('content') or ''
        for token in section.get('content_tokens', [])
        if token.get('type') == 'inline'
    ]
    return '\n'.join(parts)