feat(schema): add semantic schema generation as default mode

schema-generate now builds content-aware schemas from the document's section hierarchy instead of counting markdown syntax elements. Detects key-value tables, data tables, link lists, and mixed content patterns to produce schemas that reflect the actual document outline. Old behavior preserved via --mode syntactic. Validator and visualization tools pinned to syntactic mode for compatibility. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-16 18:49:50 +01:00
parent 120ed89780
commit 60f33443ae
8 changed files with 408 additions and 55 deletions
--- a/markitect/cli.py
+++ b/markitect/cli.py
@@ -1392,7 +1392,7 @@ def ast_stats(config, file_path, format):
@click.option('--output', '-o', type=click.Path(path_type=Path), help='Output file path (default: stdout)')
@click.option('--outfile', type=click.Path(path_type=Path), help='Output file path (alias for --output)')
@click.option('--format', 'output_format', type=click.Choice(['json', 'yaml']), default='json', help='Output format')
-@click.option('--mode', type=click.Choice(['outline']), help='Generation mode: outline for structure-focused schemas')
+@click.option('--mode', type=click.Choice(['semantic', 'syntactic', 'outline']), help='Generation mode: semantic (default) for content-aware schemas, syntactic for element counting, outline for structure-focused')
@click.option('--depth', type=int, help='Maximum depth for outline mode (similar to --max-depth)')
@click.option('--capture-heading-text', is_flag=True, help='Capture exact heading text as schema constraints')
@click.option('--include-content-instructions', is_flag=True, help='Include content field instructions for document generation')
--- a/markitect/schema/generator.py
+++ b/markitect/schema/generator.py
@@ -4,9 +4,15 @@ Schema Generator for Issue #5: Generate a Schema from a Markdown File.
 This module provides functionality to analyze markdown AST structures and generate
 JSON schemas that describe the document's structural elements with configurable
 depth limitations for architectural documentation analysis.
 Supports three generation modes:
 - semantic (default): Builds a content-aware schema from the document's section
   hierarchy, detecting key-value tables, data tables, lists, and mixed content patterns.
 - syntactic: Counts markdown elements by type (legacy behavior).
 - outline: Structure-focused variant handled by the same legacy generator as syntactic.
 """
 import json
 import re
 from collections import defaultdict
 from pathlib import Path
 from typing import Dict, List, Any, Optional, Set
@@ -44,7 +50,8 @@ class SchemaGenerator:
        Args:
            file_path: Path to the markdown file
            max_depth: Maximum heading depth to include (None = unlimited)
-            mode: Generation mode ('outline' for structure-focused schemas)
+            mode: Generation mode: None/'semantic' for content-aware schemas,
                  'syntactic' for element counting, 'outline' for legacy outline mode
            outline_depth: Depth limit for outline mode
            capture_heading_text: Whether to capture exact heading text as constraints
            include_content_instructions: Whether to include content instruction fields
@@ -73,25 +80,381 @@ class SchemaGenerator:
        content = file_path.read_text(encoding='utf-8')
        ast_tokens = parse_markdown_to_ast(content)
-        # Analyze the AST structure
+        # Auto-select syntactic mode when syntactic-only options are used
-        structure_analysis = self._analyze_ast_structure(ast_tokens, max_depth)
+        effective_mode = mode
        if effective_mode is None and (capture_heading_text or include_content_instructions):
            effective_mode = 'syntactic'
-        # Generate the JSON schema
+        # Dispatch based on mode
-        schema = self._create_json_schema(
+        if effective_mode in ('syntactic', 'outline'):
-            structure_analysis,
+            # Legacy: syntactic element-counting schema
-            file_path.name,
+            structure_analysis = self._analyze_ast_structure(ast_tokens, max_depth)
-            mode=mode,
+            schema = self._create_json_schema(
-            outline_depth=outline_depth,
+                structure_analysis,
-            capture_heading_text=capture_heading_text,
+                file_path.name,
-            include_content_instructions=include_content_instructions,
+                mode=effective_mode,
-            instruction_type=instruction_type
+                outline_depth=outline_depth,
-        )
+                capture_heading_text=capture_heading_text,
                include_content_instructions=include_content_instructions,
                instruction_type=instruction_type
            )
        else:
            # Default: semantic content-aware schema
            schema = self._create_semantic_schema(ast_tokens, file_path.name, max_depth)
        return schema
    # =========================================================================
    # Semantic schema generation (default mode)
    # =========================================================================
    @staticmethod
    def _slugify(text: str) -> str:
        """Convert heading or label text to a valid JSON property key."""
        replacements = {
            'ä': 'ae', 'ö': 'oe', 'ü': 'ue',
            'Ä': 'Ae', 'Ö': 'Oe', 'Ü': 'Ue', 'ß': 'ss',
        }
        slug = text
        for char, repl in replacements.items():
            slug = slug.replace(char, repl)
        slug = slug.lower()
        slug = re.sub(r'[^a-z0-9]+', '_', slug)
        slug = slug.strip('_')
        return slug or 'feld'
    def _build_section_tree(
        self, tokens: List[Dict[str, Any]], max_depth: Optional[int] = None
    ) -> Dict[str, Any]:
        """
        Build a hierarchical section tree from flat markdown-it token list.
        Returns a root node with children. Each node has:
        - heading: str (None for root)
        - level: int (0 for root)
        - slug: str
        - content_tokens: list of non-heading tokens belonging to this section
        - children: list of sub-sections
        """
        root = {
            'heading': None, 'level': 0, 'slug': '',
            'content_tokens': [], 'children': []
        }
        stack = [root]
        i = 0
        while i < len(tokens):
            token = tokens[i]
            if token.get('type') == 'heading_open':
                level = self._extract_heading_level(token.get('tag', ''))
                heading_text = self._extract_heading_content(tokens, i)
                if max_depth is not None and level > max_depth:
                    # Skip this heading and its close token, but keep content
                    i += 1
                    while i < len(tokens) and tokens[i].get('type') != 'heading_close':
                        i += 1
                    i += 1
                    continue
                section = {
                    'heading': heading_text,
                    'level': level,
                    'slug': self._slugify(heading_text),
                    'content_tokens': [],
                    'children': []
                }
                # Pop stack until we find the parent (level < current)
                while len(stack) > 1 and stack[-1]['level'] >= level:
                    stack.pop()
                stack[-1]['children'].append(section)
                stack.append(section)
                # Skip past heading_close
                i += 1
                while i < len(tokens) and tokens[i].get('type') != 'heading_close':
                    i += 1
            else:
                # Add content token to current section
                stack[-1]['content_tokens'].append(token)
            i += 1
        return root
    def _extract_table_data(
        self, tokens: List[Dict[str, Any]], start_index: int
    ) -> Dict[str, Any]:
        """Extract structured table data: headers and body rows."""
        headers = []
        rows = []
        in_thead = False
        current_row = []
        i = start_index + 1  # skip table_open
        while i < len(tokens):
            ttype = tokens[i].get('type', '')
            if ttype == 'table_close':
                break
            elif ttype == 'thead_open':
                in_thead = True
            elif ttype == 'thead_close':
                in_thead = False
            elif ttype == 'tr_open':
                current_row = []
            elif ttype == 'tr_close':
                if in_thead:
                    headers = current_row
                else:
                    rows.append(current_row)
            elif ttype == 'inline':
                current_row.append(tokens[i].get('content', '').strip())
            i += 1
        return {'headers': headers, 'rows': rows}
    def _find_table_in_tokens(
        self, content_tokens: List[Dict[str, Any]]
    ) -> Optional[Dict[str, Any]]:
        """Find and extract the first table in a section's content tokens."""
        for i, token in enumerate(content_tokens):
            if token.get('type') == 'table_open':
                return self._extract_table_data(content_tokens, i)
        return None
    def _extract_list_items_text(
        self, content_tokens: List[Dict[str, Any]]
    ) -> List[str]:
        """Extract text content of top-level list items from section tokens."""
        items = []
        in_list_item = False
        nesting = 0
        item_text_parts = []
        for token in content_tokens:
            ttype = token.get('type', '')
            if ttype == 'list_item_open':
                if nesting == 0:
                    in_list_item = True
                    item_text_parts = []
                nesting += 1
            elif ttype == 'list_item_close':
                nesting -= 1
                if nesting == 0:
                    in_list_item = False
                    items.append(' '.join(item_text_parts).strip())
            elif ttype == 'inline' and in_list_item and nesting == 1:
                item_text_parts.append(token.get('content', ''))
        return items
    @staticmethod
    def _is_key_value_table(table_data: Dict[str, Any]) -> bool:
        """Detect if a table is a 2-column key-value table (empty headers, 2 cols per row)."""
        if not table_data or not table_data.get('rows'):
            return False
        # All rows must have exactly 2 columns
        if not all(len(row) == 2 for row in table_data['rows']):
            return False
        # Headers must be empty or absent
        if table_data.get('headers'):
            if not all(h.strip() == '' for h in table_data['headers']):
                return False
        return True
    @staticmethod
    def _has_top_level_paragraphs(tokens: List[Dict[str, Any]]) -> bool:
        """Check for paragraph tokens that are NOT nested inside list items."""
        list_depth = 0
        for t in tokens:
            ttype = t.get('type', '')
            if ttype in ('bullet_list_open', 'ordered_list_open'):
                list_depth += 1
            elif ttype in ('bullet_list_close', 'ordered_list_close'):
                list_depth -= 1
            elif ttype == 'paragraph_open' and list_depth == 0:
                return True
        return False
    def _section_to_schema(self, section: Dict[str, Any]) -> Dict[str, Any]:
        """Convert a section tree node into its JSON schema representation."""
        content_tokens = section['content_tokens']
        children = section['children']
        heading = section.get('heading', '')
        # Detect content types present in this section
        table_data = self._find_table_in_tokens(content_tokens)
        has_list = any(
            t.get('type') in ('bullet_list_open', 'ordered_list_open')
            for t in content_tokens
        )
        has_paragraphs = self._has_top_level_paragraphs(content_tokens)
        # --- Case 1: Key-value table → object with named properties ---
        if table_data and self._is_key_value_table(table_data):
            properties = {}
            used_keys: set = set()
            for row in table_data['rows']:
                key = self._slugify(row[0])
                # Deduplicate keys
                original_key = key
                counter = 2
                while key in used_keys:
                    key = f"{original_key}_{counter}"
                    counter += 1
                used_keys.add(key)
                properties[key] = {
                    "type": "string",
                    "description": row[0]
                }
            schema: Dict[str, Any] = {
                "type": "object",
                "description": heading,
                "properties": properties
            }
            # Merge child sections as additional properties
            for child in children:
                schema["properties"][child['slug']] = self._section_to_schema(child)
            return schema
        # --- Case 2: Data table with meaningful headers → array of objects ---
        if table_data and not self._is_key_value_table(table_data) and table_data.get('headers'):
            item_properties = {}
            for hdr in table_data['headers']:
                key = self._slugify(hdr)
                if key:
                    item_properties[key] = {"type": "string", "description": hdr}
            return {
                "type": "array",
                "description": heading,
                "items": {
                    "type": "object",
                    "properties": item_properties
                }
            }
        # --- Case 3: Pure list (no child sections) ---
        if has_list and not children:
            list_items = self._extract_list_items_text(content_tokens)
            items_have_links = any('[' in it and '](' in it for it in list_items)
            if has_paragraphs:
                # Mixed: paragraphs + list
                item_schema: Any = (
                    {
                        "type": "object",
                        "properties": {
                            "name": {"type": "string"},
                            "link": {"type": "string", "format": "uri"}
                        },
                        "required": ["name"]
                    }
                    if items_have_links
                    else {"type": "string"}
                )
                return {
                    "type": "object",
                    "description": heading,
                    "properties": {
                        "freitext": {"type": "string"},
                        "eintraege": {"type": "array", "items": item_schema}
                    }
                }
            # Pure list
            if items_have_links:
                return {
                    "type": "array",
                    "description": heading,
                    "items": {
                        "type": "object",
                        "properties": {
                            "name": {"type": "string"},
                            "link": {"type": "string", "format": "uri"}
                        },
                        "required": ["name"]
                    }
                }
            return {
                "type": "array",
                "description": heading,
                "items": {"type": "string"}
            }
        # --- Case 4: Section with child sub-sections ---
        if children:
            properties: Dict[str, Any] = {}
            # Direct content before first child
            if has_paragraphs or has_list:
                if has_list:
                    properties["eintraege"] = {
                        "type": "array",
                        "items": {"type": "string"}
                    }
                else:
                    properties["inhalt"] = {"type": "string"}
            for child in children:
                properties[child['slug']] = self._section_to_schema(child)
            return {
                "type": "object",
                "description": heading,
                "properties": properties
            }
        # --- Case 5: Text-only section ---
        if has_paragraphs:
            return {
                "type": "string",
                "description": heading
            }
        # --- Default: empty or unrecognized section ---
        return {
            "type": "string",
            "description": heading
        }
    def _create_semantic_schema(
        self,
        tokens: List[Dict[str, Any]],
        filename: str,
        max_depth: Optional[int] = None
    ) -> Dict[str, Any]:
        """Create a semantic JSON schema from the document's section hierarchy."""
        tree = self._build_section_tree(tokens, max_depth)
        schema = {
            "$schema": self.default_schema_url,
            "type": "object",
            "title": f"Schema from {filename}",
            "description": f"Semantic schema describing the content structure of {filename}",
            "properties": {}
        }
        # Build properties from top-level sections
        for section in tree['children']:
            section_schema = self._section_to_schema(section)
            schema["properties"][section['slug']] = section_schema
        return schema
    # =========================================================================
    # Syntactic schema generation (legacy mode: --mode syntactic / --mode outline)
    # =========================================================================
    def _analyze_ast_structure(self, tokens: List[Dict[str, Any]], max_depth: Optional[int]) -> Dict[str, Any]:
        """
-        Analyze AST tokens to extract structural patterns.
+        Analyze AST tokens to extract structural patterns (element counting).
        Args:
            tokens: List of AST tokens from markdown-it
@@ -208,7 +571,7 @@ class SchemaGenerator:
        instruction_type: str = 'description'
    ) -> Dict[str, Any]:
        """
-        Create a JSON schema from structural analysis.
+        Create a JSON schema from structural analysis (syntactic/outline mode).
        Args:
            analysis: Structural analysis of the document
@@ -364,6 +727,10 @@ class SchemaGenerator:
        return schema
    # =========================================================================
    # Shared helpers
    # =========================================================================
    def _extract_heading_level(self, tag: str) -> int:
        """Extract heading level from HTML tag (h1, h2, etc.)."""
        if tag.startswith('h') and len(tag) == 2:
@@ -393,11 +760,9 @@ class SchemaGenerator:
    def _extract_list_structure(self, tokens: List[Dict[str, Any]], start_index: int) -> Dict[str, Any]:
        """Extract list structure information."""
        # This is a simplified implementation
        # In a full implementation, we'd parse the nested list structure
        return {
            "type": "list",
-            "estimated_items": 1  # Placeholder - would need more complex parsing
+            "estimated_items": 1
        }
    def _extract_code_block_info(self, token: Dict[str, Any]) -> Dict[str, Any]:
@@ -409,15 +774,13 @@ class SchemaGenerator:
    def _extract_blockquote_content(self, tokens: List[Dict[str, Any]], start_index: int) -> str:
        """Extract blockquote content."""
        # Simplified implementation
        return "blockquote content"
    def _extract_table_structure(self, tokens: List[Dict[str, Any]], start_index: int) -> Dict[str, Any]:
-        """Extract table structure information."""
+        """Extract table structure information (legacy syntactic mode)."""
        # Simplified implementation
        return {
-            "columns": 2,  # Placeholder
+            "columns": 2,
-            "rows": 1      # Placeholder
+            "rows": 1
        }
    def _analyze_inline_content(self, token: Dict[str, Any]) -> Dict[str, List[Any]]:
@@ -443,16 +806,7 @@ class SchemaGenerator:
        return result
    def _generate_content_instruction(self, heading_text: str, instruction_type: str) -> str:
-        """
+        """Generate content instruction text based on heading and instruction type."""
        Generate appropriate content instruction text based on heading and instruction type.
        Args:
            heading_text: The text of the heading
            instruction_type: Type of instruction to generate
        Returns:
            Instruction text for the content field
        """
        if instruction_type == "description":
            return f"Provide content for the '{heading_text}' section"
        elif instruction_type == "example":
@@ -462,5 +816,4 @@ class SchemaGenerator:
        elif instruction_type == "template":
            return f"Template content for '{heading_text}' section"
        else:
            # Default fallback
            return f"Content for the '{heading_text}' section"
--- a/markitect/schema/validator.py
+++ b/markitect/schema/validator.py
@@ -63,7 +63,7 @@ class SchemaValidator:
        # Generate the document's current structure
        try:
-            document_schema = self.schema_generator.generate_schema_from_file(file_path)
+            document_schema = self.schema_generator.generate_schema_from_file(file_path, mode='syntactic')
        except Exception as e:
            raise SchemaValidationError(f"Failed to generate document schema: {e}") from e
@@ -307,7 +307,7 @@ class SchemaValidator:
        # Generate the document's current structure
        try:
-            document_schema = self.schema_generator.generate_schema_from_file(file_path)
+            document_schema = self.schema_generator.generate_schema_from_file(file_path, mode='syntactic')
        except Exception as e:
            error_collector.add_error(
                ValidationErrorType.STRUCTURAL_VIOLATION,
--- a/tests/test_issue_51_outline_mode.py
+++ b/tests/test_issue_51_outline_mode.py
@@ -290,7 +290,7 @@ This is a test document.
                output_file.unlink()
    def test_cli_maintains_backward_compatibility_with_max_depth(self):
-        """Test that existing --max-depth option still works with default mode."""
+        """Test that existing --max-depth option still works with default (semantic) mode."""
        # Arrange
        markdown_content = """# Test Document
@@ -317,9 +317,9 @@ Some details here.
            assert result.exit_code == 0, f"CLI should maintain backward compatibility with --max-depth, got: {result.output}"
            schema = json.loads(result.output)
-            # Should use old title format for backward compatibility
+            # Default mode is now semantic, which uses 'from' in title
-            expected_title = f"Schema for {temp_file.name}"
+            expected_title = f"Schema from {temp_file.name}"
-            assert schema["title"] == expected_title, f"Default mode should use 'for' in title"
+            assert schema["title"] == expected_title, f"Default (semantic) mode should use 'from' in title"
        finally:
            temp_file.unlink()
--- a/tests/test_issue_5_schema_generation.py
+++ b/tests/test_issue_5_schema_generation.py
@@ -50,8 +50,8 @@ Some text here.
            temp_file = Path(f.name)
        try:
-            # Act - Generate schema with unlimited depth
+            # Act - Generate schema in syntactic mode (element counting)
-            result = self.schema_generator.generate_schema_from_file(temp_file)
+            result = self.schema_generator.generate_schema_from_file(temp_file, mode='syntactic')
            # Assert - Schema should be valid JSON and contain expected structure
            assert isinstance(result, dict)
@@ -105,8 +105,8 @@ Very deep content.
            temp_file = Path(f.name)
        try:
-            # Act - Generate schema with depth limit of 2
+            # Act - Generate schema in syntactic mode with depth limit of 2
-            result = self.schema_generator.generate_schema_from_file(temp_file, max_depth=2)
+            result = self.schema_generator.generate_schema_from_file(temp_file, max_depth=2, mode='syntactic')
            # Assert - Only levels 1 and 2 should be included
            properties = result.get("properties", {})
@@ -173,8 +173,8 @@ Some implementation notes here.
            temp_file = Path(f.name)
        try:
-            # Act - Generate schema
+            # Act - Generate schema in syntactic mode
-            result = self.schema_generator.generate_schema_from_file(temp_file)
+            result = self.schema_generator.generate_schema_from_file(temp_file, mode='syntactic')
            # Assert - Schema should capture complex structures
            properties = result.get("properties", {})
--- a/tests/test_l4_service_schema_generation.py
+++ b/tests/test_l4_service_schema_generation.py
@@ -47,8 +47,8 @@ Some text here.
            temp_file = Path(f.name)
        try:
-            # Act - Generate schema with unlimited depth
+            # Act - Generate schema in syntactic mode (element counting)
-            result = self.schema_generator.generate_schema_from_file(temp_file)
+            result = self.schema_generator.generate_schema_from_file(temp_file, mode='syntactic')
            # Assert - Schema should be valid JSON and contain expected structure
            assert isinstance(result, dict)
@@ -104,8 +104,8 @@ Very deep content.
            temp_file = Path(f.name)
        try:
-            # Act - Generate schema with depth limit of 2
+            # Act - Generate schema in syntactic mode with depth limit of 2
-            result = self.schema_generator.generate_schema_from_file(temp_file, max_depth=2)
+            result = self.schema_generator.generate_schema_from_file(temp_file, max_depth=2, mode='syntactic')
            # Assert - Only levels 1 and 2 should be included
            properties = result.get("properties", {})
@@ -238,8 +238,8 @@ def api_function():
            temp_file = Path(f.name)
        try:
-            # Act - Generate schema
+            # Act - Generate schema in syntactic mode
-            result = self.schema_generator.generate_schema_from_file(temp_file)
+            result = self.schema_generator.generate_schema_from_file(temp_file, mode='syntactic')
            # Assert - Should capture comprehensive structure
            properties = result.get("properties", {})
--- a/tools/schema_summary.py
+++ b/tools/schema_summary.py
@@ -19,7 +19,7 @@ def generate_summary(file_path, ascii_mode=False):
    """Generate a concise 4-line summary of the document structure."""
    generator = SchemaGenerator()
-    schema = generator.generate_schema_from_file(Path(file_path))
+    schema = generator.generate_schema_from_file(Path(file_path), mode='syntactic')
    # Define icons based on mode
    if ascii_mode:
--- a/tools/visualize_schema.py
+++ b/tools/visualize_schema.py
@@ -20,7 +20,7 @@ def visualize_schema_structure(file_path, max_depth=None, ascii_only=False):
    """Create a beautiful tree visualization of the document structure."""
    generator = SchemaGenerator()
-    schema = generator.generate_schema_from_file(Path(file_path), max_depth=max_depth)
+    schema = generator.generate_schema_from_file(Path(file_path), max_depth=max_depth, mode='syntactic')
    # Define icons based on ASCII mode
    if ascii_only: