# markitect-main/markitect/schema_generator.py

"""
Schema Generator for Issue #5: Generate a Schema from a Markdown File.

This module provides functionality to analyze markdown AST structures and generate
JSON schemas that describe the document's structural elements with configurable
depth limitations for architectural documentation analysis.
"""

import json
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Any, Optional, Set

from .parser import parse_markdown_to_ast
from .exceptions import FileNotFoundError, InvalidDepthError


class SchemaGenerator:
    """
    Generates JSON schemas from markdown file AST structures.

    Analyzes the structural elements of markdown documents and creates
    JSON schemas that can be used for validation and compliance checking
    in architecture documentation workflows.

    Tokens are handled as plain dictionaries; only the keys ``type``,
    ``tag``, ``info``, ``content`` and ``children`` are read.
    """

    # Number of tokens, counted from the opening token, that are scanned for
    # the ``inline`` token carrying a heading's or paragraph's text.
    _INLINE_LOOKAHEAD = 3

    def __init__(self):
        """Initialize the schema generator."""
        # Written to the "$schema" key of every generated schema.
        self.default_schema_url = "http://json-schema.org/draft-07/schema#"

    def generate_schema_from_file(
        self,
        file_path: Path,
        max_depth: Optional[int] = None,
        mode: Optional[str] = None,
        outline_depth: Optional[int] = None,
        capture_heading_text: bool = False
    ) -> Dict[str, Any]:
        """
        Generate a JSON schema from a markdown file's AST structure.

        Args:
            file_path: Path to the markdown file (a plain string is accepted
                and converted to a ``Path``)
            max_depth: Maximum heading depth to include (None = unlimited)
            mode: Generation mode ('outline' for structure-focused schemas)
            outline_depth: Depth limit for outline mode; it is only recorded
                in the schema as ``x-markitect-outline-depth``
            capture_heading_text: Whether to capture exact heading text as constraints

        Returns:
            JSON schema as a dictionary

        Raises:
            FileNotFoundError: If the markdown file doesn't exist
            InvalidDepthError: If max_depth is invalid (< 1)
        """
        # Accept string paths as well; Path instances pass through untouched.
        if isinstance(file_path, str):
            file_path = Path(file_path)

        # Validate inputs (existence first, then depth)
        if not file_path.exists():
            raise FileNotFoundError(f"Markdown file not found: {file_path}")

        if max_depth is not None and max_depth < 1:
            raise InvalidDepthError(f"max_depth must be >= 1, got: {max_depth}")

        # Read and parse the markdown file
        content = file_path.read_text(encoding='utf-8')
        ast_tokens = parse_markdown_to_ast(content)

        # Analyze the AST structure, then turn the analysis into a schema
        structure_analysis = self._analyze_ast_structure(ast_tokens, max_depth)
        return self._create_json_schema(
            structure_analysis,
            file_path.name,
            mode=mode,
            outline_depth=outline_depth,
            capture_heading_text=capture_heading_text,
        )

    def _analyze_ast_structure(self, tokens: List[Dict[str, Any]], max_depth: Optional[int]) -> Dict[str, Any]:
        """
        Analyze AST tokens to extract structural patterns.

        Args:
            tokens: List of AST tokens (dictionaries) from the markdown parser
            max_depth: Maximum heading depth to analyze (None = unlimited)

        Returns:
            Dictionary containing structural analysis. ``headings`` maps
            ``level_<n>`` to a list of heading records; the other element
            keys hold flat lists of records; ``structure_types`` is a sorted
            list of every token type encountered.
        """
        analysis: Dict[str, Any] = {
            'headings': defaultdict(list),
            'paragraphs': [],
            'lists': [],
            'code_blocks': [],
            'blockquotes': [],
            'tables': [],
            'links': [],
            'images': [],
            'emphasis': [],
            'structure_types': set()
        }

        # Level of the most recent heading that passed the depth filter;
        # 0 until the first such heading is seen.
        current_heading_level = 0

        for position, token in enumerate(tokens):
            token_type = token.get('type', '')

            # Track all structural types found
            analysis['structure_types'].add(token_type)

            # Analyze headings with depth filtering
            if token_type == 'heading_open':
                level = self._extract_heading_level(token.get('tag', ''))
                if max_depth is None or level <= max_depth:
                    analysis['headings'][f'level_{level}'].append({
                        'content': self._extract_heading_content(tokens, position),
                        'level': level,
                        'position': position
                    })
                    current_heading_level = level

            # Analyze paragraphs
            elif token_type == 'paragraph_open':
                analysis['paragraphs'].append({
                    'content': self._extract_paragraph_content(tokens, position),
                    'position': position,
                    'under_heading_level': current_heading_level
                })

            # Analyze lists
            elif token_type in ('bullet_list_open', 'ordered_list_open'):
                analysis['lists'].append({
                    'type': 'bullet' if token_type == 'bullet_list_open' else 'ordered',
                    'structure': self._extract_list_structure(tokens, position),
                    'position': position,
                    'under_heading_level': current_heading_level
                })

            # Analyze code blocks (indented and fenced)
            elif token_type in ('code_block', 'fence'):
                code_info = self._extract_code_block_info(token)
                analysis['code_blocks'].append({
                    'language': code_info.get('language', ''),
                    'content_length': len(code_info.get('content', '')),
                    'position': position,
                    'under_heading_level': current_heading_level
                })

            # Analyze blockquotes
            elif token_type == 'blockquote_open':
                analysis['blockquotes'].append({
                    'content': self._extract_blockquote_content(tokens, position),
                    'position': position,
                    'under_heading_level': current_heading_level
                })

            # Analyze tables
            elif token_type == 'table_open':
                table_structure = self._extract_table_structure(tokens, position)
                analysis['tables'].append({
                    'columns': table_structure.get('columns', 0),
                    'rows': table_structure.get('rows', 0),
                    'position': position,
                    'under_heading_level': current_heading_level
                })

            # Analyze inline elements
            elif token_type == 'inline':
                inline_analysis = self._analyze_inline_content(token)
                analysis['links'].extend(inline_analysis.get('links', []))
                analysis['images'].extend(inline_analysis.get('images', []))
                analysis['emphasis'].extend(inline_analysis.get('emphasis', []))

        # Sets are not JSON serializable and have no stable order; a sorted
        # list makes the generated schema reproducible from run to run.
        analysis['structure_types'] = sorted(analysis['structure_types'])

        return analysis

    def _create_json_schema(
        self,
        analysis: Dict[str, Any],
        filename: str,
        mode: Optional[str] = None,
        outline_depth: Optional[int] = None,
        capture_heading_text: bool = False
    ) -> Dict[str, Any]:
        """
        Create a JSON schema from structural analysis.

        Args:
            analysis: Structural analysis of the document
            filename: Name of the source file
            mode: Generation mode ('outline' for structure-focused schemas)
            outline_depth: Depth limit for outline mode
            capture_heading_text: Whether to capture exact heading text as constraints

        Returns:
            JSON schema dictionary. ``metadata.total_elements`` is the number
            of recorded headings plus all recorded paragraphs, lists, code
            blocks, blockquotes, tables, links, images and emphasis entries.
        """
        # Determine title format based on mode
        title_preposition = "from" if mode == "outline" else "for"

        schema: Dict[str, Any] = {
            "$schema": self.default_schema_url,
            "type": "object",
            "title": f"Schema {title_preposition} {filename}",
            "description": f"JSON schema describing the structure of {filename}",
            "properties": {}
        }

        # Add metaschema extensions for outline mode
        if mode == "outline":
            schema["x-markitect-outline-mode"] = True
            if outline_depth is not None:
                schema["x-markitect-outline-depth"] = outline_depth

        # Add metaschema extension for heading text capture
        if capture_heading_text:
            schema["x-markitect-heading-text-capture"] = True

        # Add heading structure; levels without any heading are skipped
        heading_properties: Dict[str, Any] = {}
        for level_key, headings in analysis['headings'].items():
            if not headings:
                continue

            if capture_heading_text:
                # Constrain content to the heading texts, in document order
                content_property: Dict[str, Any] = {
                    "enum": [heading['content'] for heading in headings]
                }
            else:
                content_property = {"type": "string"}

            heading_properties[level_key] = {
                "type": "array",
                "description": f"Headings at {level_key.replace('_', ' ')}",
                "items": {
                    "type": "object",
                    "properties": {
                        "content": content_property,
                        "level": {"type": "integer"},
                        "position": {"type": "integer"}
                    },
                    "required": ["content", "level"]
                },
                # The exact count observed in the source document is required
                "minItems": len(headings),
                "maxItems": len(headings)
            }

        if heading_properties:
            schema["properties"]["headings"] = {
                "type": "object",
                "description": "Document heading structure",
                "properties": heading_properties
            }

        # Add other structural elements
        structural_elements = {
            "paragraphs": ("Text paragraphs", analysis['paragraphs']),
            "lists": ("Lists (ordered and unordered)", analysis['lists']),
            "code_blocks": ("Code blocks and fenced code", analysis['code_blocks']),
            "blockquotes": ("Block quotations", analysis['blockquotes']),
            "tables": ("Tables with rows and columns", analysis['tables']),
            "links": ("Links to external resources", analysis['links']),
            "images": ("Embedded images", analysis['images']),
            "emphasis": ("Text emphasis (bold, italic)", analysis['emphasis'])
        }

        for element_name, (description, element_list) in structural_elements.items():
            if element_list:
                schema["properties"][element_name] = {
                    "type": "array",
                    "description": description,
                    "minItems": len(element_list),
                    "maxItems": len(element_list)
                }

        # Count real document elements only: headings live in a dict keyed by
        # level and must be included, while the list of token type names is
        # bookkeeping and must not be.
        heading_total = sum(len(entries) for entries in analysis['headings'].values())
        element_total = heading_total + sum(
            len(element_list) for _, element_list in structural_elements.values()
        )

        # Add metadata
        schema["properties"]["metadata"] = {
            "type": "object",
            "description": "Document structure metadata",
            "properties": {
                "total_elements": {
                    "type": "integer",
                    "const": element_total
                },
                "structure_types": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "All structural element types found",
                    # Sorted copy: deterministic and serializable even if a
                    # caller hands over a set.
                    "const": sorted(analysis['structure_types'])
                }
            }
        }

        return schema

    def _extract_heading_level(self, tag: str) -> int:
        """
        Extract heading level from HTML tag (h1, h2, etc.).

        Returns 1 for any tag that is not ``h`` followed by one digit.
        """
        if tag.startswith('h') and len(tag) == 2:
            try:
                return int(tag[1])
            except ValueError:
                # Two-character tag starting with 'h' but no digit (e.g. 'hr')
                pass
        return 1

    def _first_inline_content(self, tokens: List[Dict[str, Any]], start_index: int) -> str:
        """
        Return the content of the first ``inline`` token near ``start_index``.

        Scans at most ``_INLINE_LOOKAHEAD`` tokens beginning at
        ``start_index``; returns '' when none of them is an inline token.
        """
        stop = min(start_index + self._INLINE_LOOKAHEAD, len(tokens))
        for token in tokens[start_index:stop]:
            if token.get('type') == 'inline':
                return token.get('content', '')
        return ''

    def _extract_heading_content(self, tokens: List[Dict[str, Any]], start_index: int) -> str:
        """Extract text content from heading tokens starting at ``start_index``."""
        return self._first_inline_content(tokens, start_index)

    def _extract_paragraph_content(self, tokens: List[Dict[str, Any]], start_index: int) -> str:
        """Extract text content from paragraph tokens starting at ``start_index``."""
        return self._first_inline_content(tokens, start_index)

    def _extract_list_structure(self, tokens: List[Dict[str, Any]], start_index: int) -> Dict[str, Any]:
        """
        Extract list structure information.

        Simplified implementation: the tokens are not inspected and a fixed
        placeholder is returned. A full implementation would parse the
        nested list structure.
        """
        return {
            "type": "list",
            "estimated_items": 1  # Placeholder - would need more complex parsing
        }

    def _extract_code_block_info(self, token: Dict[str, Any]) -> Dict[str, Any]:
        """
        Extract code block information.

        Returns:
            Dictionary with ``language`` (first whitespace-separated word of
            the token's ``info`` string, or '' when there is none) and
            ``content`` (the token's content, '' when missing or None).
        """
        # split() yields an empty list for missing, empty AND whitespace-only
        # info strings, so indexing is guarded on the split result itself.
        info_words = (token.get('info') or '').split()
        return {
            "language": info_words[0] if info_words else '',
            "content": token.get('content') or ''
        }

    def _extract_blockquote_content(self, tokens: List[Dict[str, Any]], start_index: int) -> str:
        """
        Extract blockquote content.

        Simplified implementation: always returns a fixed placeholder string.
        """
        return "blockquote content"

    def _extract_table_structure(self, tokens: List[Dict[str, Any]], start_index: int) -> Dict[str, Any]:
        """
        Extract table structure information.

        Simplified implementation: always returns fixed placeholder counts.
        """
        return {
            "columns": 2,  # Placeholder
            "rows": 1      # Placeholder
        }

    def _analyze_inline_content(self, token: Dict[str, Any]) -> Dict[str, List[Any]]:
        """
        Analyze inline content for links, images, emphasis.

        Returns:
            Dictionary with ``links``, ``images`` and ``emphasis`` lists, one
            entry per matching child token (``link_open``, ``image``,
            ``em_open`` / ``strong_open``).
        """
        result: Dict[str, List[Any]] = {
            "links": [],
            "images": [],
            "emphasis": []
        }

        # ``children`` may be absent or explicitly None; treat both as empty.
        for child in token.get('children') or []:
            if not (child and isinstance(child, dict)):
                continue
            child_type = child.get('type', '')
            if child_type == 'link_open':
                result['links'].append({"type": "link"})
            elif child_type == 'image':
                result['images'].append({"type": "image"})
            elif child_type in ('em_open', 'strong_open'):
                result['emphasis'].append({"type": child_type})

        return result