# markitect-main/markitect/core/section_tree.py

"""
Standalone section-tree utilities extracted from SchemaGenerator.

Builds a hierarchical section tree from flat markdown-it AST tokens and
provides helpers for navigating heading structure and extracting text.
These functions are used by both the schema generator and the infospace
entity parser.
"""

import re
from typing import Any, Dict, List, Optional


def slugify(text: str) -> str:
    """
    Turn heading or label text into a slug usable as a JSON property key.

    German umlauts and ``ß`` are transliterated to ASCII digraphs first,
    the result is lower-cased, and every run of characters outside
    ``a-z0-9`` collapses to a single underscore.  Leading and trailing
    underscores are removed.  If nothing remains, ``'feld'`` is returned.
    """
    # One translation pass; every replacement is plain ASCII, so the
    # order in which the characters are mapped does not matter.
    umlaut_table = str.maketrans({
        'ä': 'ae', 'ö': 'oe', 'ü': 'ue',
        'Ä': 'Ae', 'Ö': 'Oe', 'Ü': 'Ue', 'ß': 'ss',
    })
    transliterated = text.translate(umlaut_table).lower()
    collapsed = re.sub(r'[^a-z0-9]+', '_', transliterated).strip('_')
    if not collapsed:
        # Fallback key for input with no usable characters.
        return 'feld'
    return collapsed


def extract_heading_level(tag: str) -> int:
    """
    Return the numeric level encoded in a heading tag such as ``h1`` or ``h2``.

    The tag must be exactly two characters long and start with ``h``; the
    second character is parsed as an integer.  Any tag that does not fit
    this shape, or whose second character is not a number, yields ``1``.
    """
    # Anything that is not "h" followed by exactly one character falls
    # back to the default level.
    if not tag.startswith('h') or len(tag) != 2:
        return 1
    try:
        return int(tag[1])
    except ValueError:
        return 1


def extract_heading_content(tokens: List[Dict[str, Any]], start_index: int) -> str:
    """
    Return the text of the first ``inline`` token near ``start_index``.

    At most three tokens are inspected, beginning with the token at
    ``start_index`` itself and never running past the end of ``tokens``.
    The ``content`` value of the first token whose ``type`` is ``inline``
    is returned (``''`` if that token has no ``content`` key).  If no
    inline token lies inside the window, ``''`` is returned.
    """
    window_end = min(start_index + 3, len(tokens))
    window = (tokens[idx] for idx in range(start_index, window_end))
    return next(
        (tok.get('content', '') for tok in window if tok.get('type') == 'inline'),
        '',
    )


def build_section_tree(
    tokens: List[Dict[str, Any]], max_depth: Optional[int] = None
) -> Dict[str, Any]:
    """
    Turn a flat markdown-it token list into a nested tree of sections.

    The returned root node represents the document itself; its
    ``children`` are the outermost sections.  Every node is a dict with:

    - ``heading`` – heading text (``None`` for the root)
    - ``level`` – heading depth (``0`` for the root)
    - ``slug`` – slugified heading (``''`` for the root)
    - ``content_tokens`` – non-heading tokens belonging to this section
    - ``children`` – nested sub-sections

    When ``max_depth`` is given, headings deeper than ``max_depth`` do not
    open a section of their own: the heading tokens are dropped and the
    tokens that follow are added to the section that is currently open.
    """
    def _make_node(heading: Optional[str], level: int, slug: str) -> Dict[str, Any]:
        # Fresh node with its own (unshared) token and child lists.
        return {
            'heading': heading, 'level': level, 'slug': slug,
            'content_tokens': [], 'children': []
        }

    total = len(tokens)

    def _after_heading_close(open_index: int) -> int:
        # Index of the first token after the ``heading_close`` that follows
        # ``open_index``; runs off the end if no close token exists.
        cursor = open_index + 1
        while cursor < total and tokens[cursor].get('type') != 'heading_close':
            cursor += 1
        return cursor + 1

    root = _make_node(None, 0, '')
    # Path of currently open sections, root first, innermost last.
    open_sections = [root]

    pos = 0
    while pos < total:
        current = tokens[pos]

        if current.get('type') != 'heading_open':
            # Ordinary token: belongs to the innermost open section.
            open_sections[-1]['content_tokens'].append(current)
            pos += 1
            continue

        depth = extract_heading_level(current.get('tag', ''))
        title = extract_heading_content(tokens, pos)

        if max_depth is not None and depth > max_depth:
            # Too deep: drop the heading tokens, keep the following content
            # flowing into the section that is already open.
            pos = _after_heading_close(pos)
            continue

        node = _make_node(title, depth, slugify(title))

        # Close every open section at the same or a deeper level so the
        # innermost remaining one is the parent (the root is never closed).
        while len(open_sections) > 1 and open_sections[-1]['level'] >= depth:
            open_sections.pop()

        open_sections[-1]['children'].append(node)
        open_sections.append(node)

        pos = _after_heading_close(pos)

    return root


def extract_section_text(section: Dict[str, Any]) -> str:
    """
    Return the plain text content of a section node.

    Collects the ``content`` field of every token of type ``inline`` in
    the section's ``content_tokens`` (in order) and joins the pieces with
    a newline.  Tokens of any other type are ignored, and an inline token
    without a ``content`` key contributes an empty string.  Only the
    section's own tokens are read; child sections are not visited.

    Returns ``''`` when the section has no ``content_tokens`` key or no
    inline tokens.
    """
    return '\n'.join(
        token.get('content', '')
        for token in section.get('content_tokens', [])
        if token.get('type') == 'inline'
    )