feat(schema): add semantic schema generation as default mode

schema-generate now builds content-aware schemas from the document's section hierarchy instead of counting markdown syntax elements. Detects key-value tables, data tables, link lists, and mixed content patterns to produce schemas that reflect the actual document outline. Old behavior preserved via --mode syntactic. Validator and visualization tools pinned to syntactic mode for compatibility.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-16 18:49:50 +01:00
parent 120ed89780
commit 60f33443ae
8 changed files with 408 additions and 55 deletions
--- a/markitect/schema/generator.py
+++ b/markitect/schema/generator.py
@@ -4,9 +4,15 @@ Schema Generator for Issue #5: Generate a Schema from a Markdown File.
 This module provides functionality to analyze markdown AST structures and generate
 JSON schemas that describe the document's structural elements with configurable
 depth limitations for architectural documentation analysis.
+
+Supports two generation modes:
+- semantic (default): Builds a content-aware schema from the document's section
+  hierarchy, detecting key-value tables, lists, and mixed content patterns.
+- syntactic: Counts markdown elements by type (legacy behavior).
 """

 import json
+import re
 from collections import defaultdict
 from pathlib import Path
 from typing import Dict, List, Any, Optional, Set
@@ -44,7 +50,8 @@ class SchemaGenerator:
        Args:
            file_path: Path to the markdown file
            max_depth: Maximum heading depth to include (None = unlimited)
-            mode: Generation mode ('outline' for structure-focused schemas)
+            mode: Generation mode: None/'semantic' for content-aware schemas,
+                  'syntactic' for element counting, 'outline' for legacy outline mode
            outline_depth: Depth limit for outline mode
            capture_heading_text: Whether to capture exact heading text as constraints
            include_content_instructions: Whether to include content instruction fields
@@ -73,25 +80,381 @@ class SchemaGenerator:
        content = file_path.read_text(encoding='utf-8')
        ast_tokens = parse_markdown_to_ast(content)

-        # Analyze the AST structure
-        structure_analysis = self._analyze_ast_structure(ast_tokens, max_depth)
+        # Auto-select syntactic mode when syntactic-only options are used
+        effective_mode = mode
+        if effective_mode is None and (capture_heading_text or include_content_instructions):
+            effective_mode = 'syntactic'

-        # Generate the JSON schema
-        schema = self._create_json_schema(
-            structure_analysis,
-            file_path.name,
-            mode=mode,
-            outline_depth=outline_depth,
-            capture_heading_text=capture_heading_text,
-            include_content_instructions=include_content_instructions,
-            instruction_type=instruction_type
-        )
+        # Dispatch based on mode
+        if effective_mode in ('syntactic', 'outline'):
+            # Legacy: syntactic element-counting schema
+            structure_analysis = self._analyze_ast_structure(ast_tokens, max_depth)
+            schema = self._create_json_schema(
+                structure_analysis,
+                file_path.name,
+                mode=effective_mode,
+                outline_depth=outline_depth,
+                capture_heading_text=capture_heading_text,
+                include_content_instructions=include_content_instructions,
+                instruction_type=instruction_type
+            )
+        else:
+            # Default: semantic content-aware schema
+            schema = self._create_semantic_schema(ast_tokens, file_path.name, max_depth)

        return schema

+    # =========================================================================
+    # Semantic schema generation (default mode)
+    # =========================================================================
+
+    @staticmethod
+    def _slugify(text: str) -> str:
+        """Convert heading or label text to a valid JSON property key."""
+        replacements = {
+            'ä': 'ae', 'ö': 'oe', 'ü': 'ue',
+            'Ä': 'Ae', 'Ö': 'Oe', 'Ü': 'Ue', 'ß': 'ss',
+        }
+        slug = text
+        for char, repl in replacements.items():
+            slug = slug.replace(char, repl)
+        slug = slug.lower()
+        slug = re.sub(r'[^a-z0-9]+', '_', slug)
+        slug = slug.strip('_')
+        return slug or 'feld'
+
+    def _build_section_tree(
+        self, tokens: List[Dict[str, Any]], max_depth: Optional[int] = None
+    ) -> Dict[str, Any]:
+        """
+        Build a hierarchical section tree from flat markdown-it token list.
+
+        Returns a root node with children. Each node has:
+        - heading: str (None for root)
+        - level: int (0 for root)
+        - slug: str
+        - content_tokens: list of non-heading tokens belonging to this section
+        - children: list of sub-sections
+        """
+        root = {
+            'heading': None, 'level': 0, 'slug': '',
+            'content_tokens': [], 'children': []
+        }
+        stack = [root]
+
+        i = 0
+        while i < len(tokens):
+            token = tokens[i]
+            if token.get('type') == 'heading_open':
+                level = self._extract_heading_level(token.get('tag', ''))
+                heading_text = self._extract_heading_content(tokens, i)
+
+                if max_depth is not None and level > max_depth:
+                    # Skip this heading and its close token, but keep content
+                    i += 1
+                    while i < len(tokens) and tokens[i].get('type') != 'heading_close':
+                        i += 1
+                    i += 1
+                    continue
+
+                section = {
+                    'heading': heading_text,
+                    'level': level,
+                    'slug': self._slugify(heading_text),
+                    'content_tokens': [],
+                    'children': []
+                }
+
+                # Pop stack until we find the parent (level < current)
+                while len(stack) > 1 and stack[-1]['level'] >= level:
+                    stack.pop()
+
+                stack[-1]['children'].append(section)
+                stack.append(section)
+
+                # Skip past heading_close
+                i += 1
+                while i < len(tokens) and tokens[i].get('type') != 'heading_close':
+                    i += 1
+            else:
+                # Add content token to current section
+                stack[-1]['content_tokens'].append(token)
+
+            i += 1
+
+        return root
+
+    def _extract_table_data(
+        self, tokens: List[Dict[str, Any]], start_index: int
+    ) -> Dict[str, Any]:
+        """Extract structured table data: headers and body rows."""
+        headers = []
+        rows = []
+        in_thead = False
+        current_row = []
+
+        i = start_index + 1  # skip table_open
+        while i < len(tokens):
+            ttype = tokens[i].get('type', '')
+            if ttype == 'table_close':
+                break
+            elif ttype == 'thead_open':
+                in_thead = True
+            elif ttype == 'thead_close':
+                in_thead = False
+            elif ttype == 'tr_open':
+                current_row = []
+            elif ttype == 'tr_close':
+                if in_thead:
+                    headers = current_row
+                else:
+                    rows.append(current_row)
+            elif ttype == 'inline':
+                current_row.append(tokens[i].get('content', '').strip())
+            i += 1
+
+        return {'headers': headers, 'rows': rows}
+
+    def _find_table_in_tokens(
+        self, content_tokens: List[Dict[str, Any]]
+    ) -> Optional[Dict[str, Any]]:
+        """Find and extract the first table in a section's content tokens."""
+        for i, token in enumerate(content_tokens):
+            if token.get('type') == 'table_open':
+                return self._extract_table_data(content_tokens, i)
+        return None
+
+    def _extract_list_items_text(
+        self, content_tokens: List[Dict[str, Any]]
+    ) -> List[str]:
+        """Extract text content of top-level list items from section tokens."""
+        items = []
+        in_list_item = False
+        nesting = 0
+        item_text_parts = []
+
+        for token in content_tokens:
+            ttype = token.get('type', '')
+            if ttype == 'list_item_open':
+                if nesting == 0:
+                    in_list_item = True
+                    item_text_parts = []
+                nesting += 1
+            elif ttype == 'list_item_close':
+                nesting -= 1
+                if nesting == 0:
+                    in_list_item = False
+                    items.append(' '.join(item_text_parts).strip())
+            elif ttype == 'inline' and in_list_item and nesting == 1:
+                item_text_parts.append(token.get('content', ''))
+
+        return items
+
+    @staticmethod
+    def _is_key_value_table(table_data: Dict[str, Any]) -> bool:
+        """Detect if a table is a 2-column key-value table (empty headers, 2 cols per row)."""
+        if not table_data or not table_data.get('rows'):
+            return False
+        # All rows must have exactly 2 columns
+        if not all(len(row) == 2 for row in table_data['rows']):
+            return False
+        # Headers must be empty or absent
+        if table_data.get('headers'):
+            if not all(h.strip() == '' for h in table_data['headers']):
+                return False
+        return True
+
+    @staticmethod
+    def _has_top_level_paragraphs(tokens: List[Dict[str, Any]]) -> bool:
+        """Check for paragraph tokens that are NOT nested inside list items."""
+        list_depth = 0
+        for t in tokens:
+            ttype = t.get('type', '')
+            if ttype in ('bullet_list_open', 'ordered_list_open'):
+                list_depth += 1
+            elif ttype in ('bullet_list_close', 'ordered_list_close'):
+                list_depth -= 1
+            elif ttype == 'paragraph_open' and list_depth == 0:
+                return True
+        return False
+
+    def _section_to_schema(self, section: Dict[str, Any]) -> Dict[str, Any]:
+        """Convert a section tree node into its JSON schema representation."""
+        content_tokens = section['content_tokens']
+        children = section['children']
+        heading = section.get('heading', '')
+
+        # Detect content types present in this section
+        table_data = self._find_table_in_tokens(content_tokens)
+        has_list = any(
+            t.get('type') in ('bullet_list_open', 'ordered_list_open')
+            for t in content_tokens
+        )
+        has_paragraphs = self._has_top_level_paragraphs(content_tokens)
+
+        # --- Case 1: Key-value table → object with named properties ---
+        if table_data and self._is_key_value_table(table_data):
+            properties = {}
+            used_keys: set = set()
+            for row in table_data['rows']:
+                key = self._slugify(row[0])
+                # Deduplicate keys
+                original_key = key
+                counter = 2
+                while key in used_keys:
+                    key = f"{original_key}_{counter}"
+                    counter += 1
+                used_keys.add(key)
+                properties[key] = {
+                    "type": "string",
+                    "description": row[0]
+                }
+
+            schema: Dict[str, Any] = {
+                "type": "object",
+                "description": heading,
+                "properties": properties
+            }
+
+            # Merge child sections as additional properties
+            for child in children:
+                schema["properties"][child['slug']] = self._section_to_schema(child)
+
+            return schema
+
+        # --- Case 2: Data table with meaningful headers → array of objects ---
+        if table_data and not self._is_key_value_table(table_data) and table_data.get('headers'):
+            item_properties = {}
+            for hdr in table_data['headers']:
+                key = self._slugify(hdr)
+                if key:
+                    item_properties[key] = {"type": "string", "description": hdr}
+
+            return {
+                "type": "array",
+                "description": heading,
+                "items": {
+                    "type": "object",
+                    "properties": item_properties
+                }
+            }
+
+        # --- Case 3: Pure list (no child sections) ---
+        if has_list and not children:
+            list_items = self._extract_list_items_text(content_tokens)
+            items_have_links = any('[' in it and '](' in it for it in list_items)
+
+            if has_paragraphs:
+                # Mixed: paragraphs + list
+                item_schema: Any = (
+                    {
+                        "type": "object",
+                        "properties": {
+                            "name": {"type": "string"},
+                            "link": {"type": "string", "format": "uri"}
+                        },
+                        "required": ["name"]
+                    }
+                    if items_have_links
+                    else {"type": "string"}
+                )
+                return {
+                    "type": "object",
+                    "description": heading,
+                    "properties": {
+                        "freitext": {"type": "string"},
+                        "eintraege": {"type": "array", "items": item_schema}
+                    }
+                }
+
+            # Pure list
+            if items_have_links:
+                return {
+                    "type": "array",
+                    "description": heading,
+                    "items": {
+                        "type": "object",
+                        "properties": {
+                            "name": {"type": "string"},
+                            "link": {"type": "string", "format": "uri"}
+                        },
+                        "required": ["name"]
+                    }
+                }
+
+            return {
+                "type": "array",
+                "description": heading,
+                "items": {"type": "string"}
+            }
+
+        # --- Case 4: Section with child sub-sections ---
+        if children:
+            properties: Dict[str, Any] = {}
+
+            # Direct content before first child
+            if has_paragraphs or has_list:
+                if has_list:
+                    properties["eintraege"] = {
+                        "type": "array",
+                        "items": {"type": "string"}
+                    }
+                else:
+                    properties["inhalt"] = {"type": "string"}
+
+            for child in children:
+                properties[child['slug']] = self._section_to_schema(child)
+
+            return {
+                "type": "object",
+                "description": heading,
+                "properties": properties
+            }
+
+        # --- Case 5: Text-only section ---
+        if has_paragraphs:
+            return {
+                "type": "string",
+                "description": heading
+            }
+
+        # --- Default: empty or unrecognized section ---
+        return {
+            "type": "string",
+            "description": heading
+        }
+
+    def _create_semantic_schema(
+        self,
+        tokens: List[Dict[str, Any]],
+        filename: str,
+        max_depth: Optional[int] = None
+    ) -> Dict[str, Any]:
+        """Create a semantic JSON schema from the document's section hierarchy."""
+        tree = self._build_section_tree(tokens, max_depth)
+
+        schema = {
+            "$schema": self.default_schema_url,
+            "type": "object",
+            "title": f"Schema from {filename}",
+            "description": f"Semantic schema describing the content structure of {filename}",
+            "properties": {}
+        }
+
+        # Build properties from top-level sections
+        for section in tree['children']:
+            section_schema = self._section_to_schema(section)
+            schema["properties"][section['slug']] = section_schema
+
+        return schema
+
+    # =========================================================================
+    # Syntactic schema generation (legacy mode: --mode syntactic / --mode outline)
+    # =========================================================================
+
    def _analyze_ast_structure(self, tokens: List[Dict[str, Any]], max_depth: Optional[int]) -> Dict[str, Any]:
        """
-        Analyze AST tokens to extract structural patterns.
+        Analyze AST tokens to extract structural patterns (element counting).

        Args:
            tokens: List of AST tokens from markdown-it
@@ -208,7 +571,7 @@ class SchemaGenerator:
        instruction_type: str = 'description'
    ) -> Dict[str, Any]:
        """
-        Create a JSON schema from structural analysis.
+        Create a JSON schema from structural analysis (syntactic/outline mode).

        Args:
            analysis: Structural analysis of the document
@@ -364,6 +727,10 @@ class SchemaGenerator:

        return schema

+    # =========================================================================
+    # Shared helpers
+    # =========================================================================
+
    def _extract_heading_level(self, tag: str) -> int:
        """Extract heading level from HTML tag (h1, h2, etc.)."""
        if tag.startswith('h') and len(tag) == 2:
@@ -393,11 +760,9 @@ class SchemaGenerator:

    def _extract_list_structure(self, tokens: List[Dict[str, Any]], start_index: int) -> Dict[str, Any]:
        """Extract list structure information."""
-        # This is a simplified implementation
-        # In a full implementation, we'd parse the nested list structure
        return {
            "type": "list",
-            "estimated_items": 1  # Placeholder - would need more complex parsing
+            "estimated_items": 1
        }

    def _extract_code_block_info(self, token: Dict[str, Any]) -> Dict[str, Any]:
@@ -409,15 +774,13 @@ class SchemaGenerator:

    def _extract_blockquote_content(self, tokens: List[Dict[str, Any]], start_index: int) -> str:
        """Extract blockquote content."""
-        # Simplified implementation
        return "blockquote content"

    def _extract_table_structure(self, tokens: List[Dict[str, Any]], start_index: int) -> Dict[str, Any]:
-        """Extract table structure information."""
-        # Simplified implementation
+        """Extract table structure information (legacy syntactic mode)."""
        return {
-            "columns": 2,  # Placeholder
-            "rows": 1      # Placeholder
+            "columns": 2,
+            "rows": 1
        }

    def _analyze_inline_content(self, token: Dict[str, Any]) -> Dict[str, List[Any]]:
@@ -443,16 +806,7 @@ class SchemaGenerator:
        return result

    def _generate_content_instruction(self, heading_text: str, instruction_type: str) -> str:
-        """
-        Generate appropriate content instruction text based on heading and instruction type.
-
-        Args:
-            heading_text: The text of the heading
-            instruction_type: Type of instruction to generate
-
-        Returns:
-            Instruction text for the content field
-        """
+        """Generate content instruction text based on heading and instruction type."""
        if instruction_type == "description":
            return f"Provide content for the '{heading_text}' section"
        elif instruction_type == "example":
@@ -462,5 +816,4 @@ class SchemaGenerator:
        elif instruction_type == "template":
            return f"Template content for '{heading_text}' section"
        else:
-            # Default fallback
            return f"Content for the '{heading_text}' section"