feat(spaces): implement Phase 0-1 of Information Space Service

Phase 0 - Project Organization: - Create docs/PROJECT_STRUCTURE.md documenting codebase layout - Create markitect/core/ with parser, serializer, document_manager, workspace - Create markitect/schema/ consolidating 6 schema_*.py modules - Create markitect/storage/ with database module - Maintain backward compatibility via re-exports from original locations - Add docs/roadmap/information-space-service/ with README and WORKPLAN Phase 1 - Foundation (Weeks 1-3): - Week 1: Core domain models (InformationSpace, SpaceDocument, SpaceConfig, SpaceMetadata, SpaceVariable, TransclusionReference, SpaceStatus) - Week 2: Repository layer with interfaces (ISpaceRepository, IDocumentAssociationRepository, IVariableRepository, IReferenceRepository) and SQLite implementations with foreign key cascade deletes - Week 3: SpaceService orchestration layer with full CRUD, document, variable, and reference tracking operations Test coverage: 124 tests (25 model + 63 repository + 36 integration) Capabilities delivered: - CAP-001: InformationSpace entity with lifecycle management - CAP-002: SpaceRepository CRUD with SQLite backing - CAP-003: Document-Space associations with path-based organization - CAP-004: Space metadata and configuration schemas - CAP-005: Database schema with migrations Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-08 02:02:46 +01:00
parent 6ebcc0f60e
commit 9b12875681
45 changed files with 9818 additions and 4300 deletions
--- a/markitect/schema/generator.py
+++ b/markitect/schema/generator.py
@@ -0,0 +1,466 @@
+"""
+Schema Generator for Issue #5: Generate a Schema from a Markdown File.
+
+This module provides functionality to analyze markdown AST structures and generate
+JSON schemas that describe the document's structural elements with configurable
+depth limitations for architectural documentation analysis.
+"""
+
+import json
+from collections import defaultdict
+from pathlib import Path
+from typing import Dict, List, Any, Optional, Set
+
+from markitect.core.parser import parse_markdown_to_ast
+from markitect.exceptions import FileNotFoundError, InvalidDepthError, InvalidInstructionTypeError
+
+
+class SchemaGenerator:
+    """
+    Generates JSON schemas from markdown file AST structures.
+
+    Analyzes the structural elements of markdown documents and creates
+    JSON schemas that can be used for validation and compliance checking
+    in architecture documentation workflows.
+    """
+
+    def __init__(self):
+        """Set up the generator with the JSON Schema dialect it declares.
+
+        The URL stored here is written into the ``$schema`` key of every
+        schema this generator produces.
+        """
+        draft_07_url = "http://json-schema.org/draft-07/schema#"
+        self.default_schema_url = draft_07_url
+
+    def generate_schema_from_file(
+        self,
+        file_path: Path,
+        max_depth: Optional[int] = None,
+        mode: Optional[str] = None,
+        outline_depth: Optional[int] = None,
+        capture_heading_text: bool = False,
+        include_content_instructions: bool = False,
+        instruction_type: str = 'description'
+    ) -> Dict[str, Any]:
+        """
+        Generate a JSON schema from a markdown file's AST structure.
+
+        The file is read as UTF-8, parsed with ``parse_markdown_to_ast``,
+        analyzed by ``_analyze_ast_structure`` and finally rendered by
+        ``_create_json_schema``.
+
+        Args:
+            file_path: Path to the markdown file
+            max_depth: Maximum heading depth to include (None = unlimited)
+            mode: Generation mode ('outline' for structure-focused schemas)
+            outline_depth: Depth limit for outline mode
+            capture_heading_text: Whether to capture exact heading text as constraints
+            include_content_instructions: Whether to include content instruction fields
+            instruction_type: Type of content instructions ('description', 'example', 'constraint', 'template')
+
+        Returns:
+            JSON schema as a dictionary
+
+        Raises:
+            FileNotFoundError: If the markdown file doesn't exist. This is the
+                class imported from ``markitect.exceptions`` at the top of
+                this module, which shadows the builtin of the same name.
+            InvalidDepthError: If max_depth is invalid (< 1)
+            InvalidInstructionTypeError: If instruction_type is not one of the
+                four supported values
+        """
+        # Validate inputs (same order as before: path, depth, instruction type)
+        if not file_path.exists():
+            raise FileNotFoundError(f"Markdown file not found: {file_path}")
+
+        if max_depth is not None and max_depth < 1:
+            raise InvalidDepthError(f"max_depth must be >= 1, got: {max_depth}")
+
+        # A fixed, alphabetically ordered sequence keeps the error message
+        # identical between runs; joining a set produced an arbitrary order.
+        valid_instruction_types = ('constraint', 'description', 'example', 'template')
+        if instruction_type not in valid_instruction_types:
+            raise InvalidInstructionTypeError(
+                f"Invalid instruction type '{instruction_type}'. "
+                f"Must be one of: {', '.join(valid_instruction_types)}"
+            )
+
+        # Read and parse the markdown file
+        content = file_path.read_text(encoding='utf-8')
+        ast_tokens = parse_markdown_to_ast(content)
+
+        # Analyze the AST structure
+        structure_analysis = self._analyze_ast_structure(ast_tokens, max_depth)
+
+        # Generate the JSON schema
+        return self._create_json_schema(
+            structure_analysis,
+            file_path.name,
+            mode=mode,
+            outline_depth=outline_depth,
+            capture_heading_text=capture_heading_text,
+            include_content_instructions=include_content_instructions,
+            instruction_type=instruction_type
+        )
+
+    def _analyze_ast_structure(self, tokens: List[Dict[str, Any]], max_depth: Optional[int]) -> Dict[str, Any]:
+        """
+        Analyze AST tokens to extract structural patterns.
+
+        Walks the flat token list once. Each recognised token type appends a
+        small record to the matching bucket; block-level records carry the
+        token's index (``position``) and the level of the most recent heading
+        that passed the depth filter (``under_heading_level``, 0 before any).
+
+        Args:
+            tokens: List of AST tokens from markdown-it
+            max_depth: Maximum heading depth to analyze (None = unlimited)
+
+        Returns:
+            Dictionary containing structural analysis. ``headings`` is a
+            defaultdict keyed by ``level_<n>``; ``structure_types`` is a
+            sorted list of every token type seen.
+        """
+        analysis: Dict[str, Any] = {
+            'headings': defaultdict(list),
+            'paragraphs': [],
+            'lists': [],
+            'code_blocks': [],
+            'blockquotes': [],
+            'tables': [],
+            'links': [],
+            'images': [],
+            'emphasis': [],
+            'structure_types': set()
+        }
+
+        current_heading_level = 0
+
+        for position, token in enumerate(tokens):
+            token_type = token.get('type', '')
+
+            # Track all structural types found
+            analysis['structure_types'].add(token_type)
+
+            # Analyze headings with depth filtering
+            if token_type == 'heading_open':
+                level = self._extract_heading_level(token.get('tag', ''))
+                if max_depth is None or level <= max_depth:
+                    analysis['headings'][f'level_{level}'].append({
+                        'content': self._extract_heading_content(tokens, position),
+                        'level': level,
+                        'position': position
+                    })
+                    # Headings deeper than max_depth do not change the
+                    # level that later elements are attributed to.
+                    current_heading_level = level
+
+            # Analyze paragraphs
+            elif token_type == 'paragraph_open':
+                analysis['paragraphs'].append({
+                    'content': self._extract_paragraph_content(tokens, position),
+                    'position': position,
+                    'under_heading_level': current_heading_level
+                })
+
+            # Analyze lists
+            elif token_type in ('bullet_list_open', 'ordered_list_open'):
+                analysis['lists'].append({
+                    'type': 'bullet' if token_type == 'bullet_list_open' else 'ordered',
+                    'structure': self._extract_list_structure(tokens, position),
+                    'position': position,
+                    'under_heading_level': current_heading_level
+                })
+
+            # Analyze code blocks
+            elif token_type in ('code_block', 'fence'):
+                code_info = self._extract_code_block_info(token)
+                analysis['code_blocks'].append({
+                    'language': code_info.get('language', ''),
+                    'content_length': len(code_info.get('content', '')),
+                    'position': position,
+                    'under_heading_level': current_heading_level
+                })
+
+            # Analyze blockquotes
+            elif token_type == 'blockquote_open':
+                analysis['blockquotes'].append({
+                    'content': self._extract_blockquote_content(tokens, position),
+                    'position': position,
+                    'under_heading_level': current_heading_level
+                })
+
+            # Analyze tables
+            elif token_type == 'table_open':
+                table_structure = self._extract_table_structure(tokens, position)
+                analysis['tables'].append({
+                    'columns': table_structure.get('columns', 0),
+                    'rows': table_structure.get('rows', 0),
+                    'position': position,
+                    'under_heading_level': current_heading_level
+                })
+
+            # Analyze inline elements
+            elif token_type == 'inline':
+                inline_analysis = self._analyze_inline_content(token)
+                analysis['links'].extend(inline_analysis.get('links', []))
+                analysis['images'].extend(inline_analysis.get('images', []))
+                analysis['emphasis'].extend(inline_analysis.get('emphasis', []))
+
+        # Convert the set to a list for JSON serialization. Sorting makes the
+        # result reproducible: set iteration order for strings varies between
+        # interpreter runs. key=str keeps the sort safe if a token carries a
+        # non-string type.
+        analysis['structure_types'] = sorted(analysis['structure_types'], key=str)
+
+        return analysis
+
+    def _create_json_schema(
+        self,
+        analysis: Dict[str, Any],
+        filename: str,
+        mode: Optional[str] = None,
+        outline_depth: Optional[int] = None,
+        capture_heading_text: bool = False,
+        include_content_instructions: bool = False,
+        instruction_type: str = 'description'
+    ) -> Dict[str, Any]:
+        """
+        Render a structural analysis as a JSON schema dictionary.
+
+        The schema pins the document's shape: every non-empty heading level
+        and every non-empty element bucket becomes an array property whose
+        ``minItems`` and ``maxItems`` both equal the number of entries found.
+
+        Args:
+            analysis: Structural analysis of the document
+            filename: Name of the source file (used in title and description)
+            mode: Generation mode; 'outline' switches the title wording and
+                adds the ``x-markitect-outline-*`` extension keys
+            outline_depth: Depth limit recorded in the schema in outline mode
+            capture_heading_text: When true, heading ``content`` is constrained
+                to an ``enum`` of the heading texts found at that level
+            include_content_instructions: When true, headings, paragraphs and
+                lists gain ``x-markitect-content-instructions`` fields
+            instruction_type: Type of content instructions to generate
+
+        Returns:
+            JSON schema dictionary
+        """
+        outline_mode = mode == "outline"
+        preposition = "from" if outline_mode else "for"
+
+        properties: Dict[str, Any] = {}
+        schema: Dict[str, Any] = {
+            "$schema": self.default_schema_url,
+            "type": "object",
+            "title": f"Schema {preposition} {filename}",
+            "description": f"JSON schema describing the structure of {filename}",
+            "properties": properties
+        }
+
+        # Metaschema extension keys, added in a fixed order after the core keys.
+        if outline_mode:
+            schema["x-markitect-outline-mode"] = True
+            if outline_depth is not None:
+                schema["x-markitect-outline-depth"] = outline_depth
+        if capture_heading_text:
+            schema["x-markitect-heading-text-capture"] = True
+        if include_content_instructions:
+            schema["x-markitect-content-instructions-enabled"] = True
+
+        # Heading structure: one array property per level that has entries.
+        headings_by_level = analysis['headings']
+        if headings_by_level:
+            per_level: Dict[str, Any] = {}
+            for level_key, entries in headings_by_level.items():
+                if not entries:
+                    continue
+
+                if capture_heading_text:
+                    # Heading texts are listed in document order.
+                    content_spec: Dict[str, Any] = {
+                        "enum": [entry['content'] for entry in entries]
+                    }
+                else:
+                    content_spec = {"type": "string"}
+
+                fields: Dict[str, Any] = {
+                    "content": content_spec,
+                    "level": {"type": "integer"},
+                    "position": {"type": "integer"}
+                }
+
+                if include_content_instructions:
+                    # level_key looks like "level_<n>"; the number feeds the
+                    # instruction wording.
+                    level_number = int(level_key.split('_')[1])
+                    fields["x-markitect-content-instructions"] = {
+                        "type": "string",
+                        "const": self._generate_content_instruction(
+                            f"level {level_number} heading", instruction_type
+                        )
+                    }
+                    fields["x-markitect-instruction-type"] = {
+                        "type": "string",
+                        "enum": [instruction_type]
+                    }
+
+                heading_count = len(entries)
+                per_level[level_key] = {
+                    "type": "array",
+                    "description": f"Headings at {level_key.replace('_', ' ')}",
+                    "items": {
+                        "type": "object",
+                        "properties": fields,
+                        "required": ["content", "level"]
+                    },
+                    "minItems": heading_count,
+                    "maxItems": heading_count
+                }
+
+            if per_level:
+                properties["headings"] = {
+                    "type": "object",
+                    "description": "Document heading structure",
+                    "properties": per_level
+                }
+
+        # Remaining element buckets, emitted in this fixed order.
+        element_descriptions = (
+            ("paragraphs", "Text paragraphs"),
+            ("lists", "Lists (ordered and unordered)"),
+            ("code_blocks", "Code blocks and fenced code"),
+            ("blockquotes", "Block quotations"),
+            ("tables", "Tables with rows and columns"),
+            ("links", "Links to external resources"),
+            ("images", "Embedded images"),
+            ("emphasis", "Text emphasis (bold, italic)"),
+        )
+        found_elements = [
+            (name, description, analysis[name])
+            for name, description in element_descriptions
+        ]
+
+        for element_name, description, found in found_elements:
+            if not found:
+                continue
+
+            found_count = len(found)
+            element_schema: Dict[str, Any] = {
+                "type": "array",
+                "description": description,
+                "minItems": found_count,
+                "maxItems": found_count
+            }
+
+            # Only paragraphs and lists receive per-item content instructions.
+            if include_content_instructions and element_name in ("paragraphs", "lists"):
+                element_schema["items"] = {
+                    "type": "object",
+                    "properties": {
+                        "content": {"type": "string"},
+                        "x-markitect-content-instructions": {
+                            "type": "string",
+                            "const": self._generate_content_instruction(element_name, instruction_type)
+                        },
+                        "x-markitect-instruction-type": {
+                            "type": "string",
+                            "enum": [instruction_type]
+                        }
+                    }
+                }
+
+            properties[element_name] = element_schema
+
+        # NOTE(review): only list-valued entries of `analysis` are summed. With
+        # the analysis produced by _analyze_ast_structure, the 'headings'
+        # mapping therefore contributes nothing while the 'structure_types'
+        # list does - confirm that this is the intended count.
+        total_elements = sum(
+            len(value) for value in analysis.values() if isinstance(value, list)
+        )
+
+        properties["metadata"] = {
+            "type": "object",
+            "description": "Document structure metadata",
+            "properties": {
+                "total_elements": {
+                    "type": "integer",
+                    "const": total_elements
+                },
+                "structure_types": {
+                    "type": "array",
+                    "items": {"type": "string"},
+                    "description": "All structural element types found",
+                    "const": analysis['structure_types']
+                }
+            }
+        }
+
+        return schema
+
+    def _extract_heading_level(self, tag: str) -> int:
+        """Return the numeric level encoded in a heading tag such as ``h2``.
+
+        Any tag that is not exactly ``h`` followed by one digit character
+        falls back to level 1.
+        """
+        fallback_level = 1
+
+        # Guard: only two-character tags beginning with "h" are candidates.
+        if len(tag) != 2 or not tag.startswith('h'):
+            return fallback_level
+
+        try:
+            return int(tag[1])
+        except ValueError:
+            return fallback_level
+
+    def _extract_heading_content(self, tokens: List[Dict[str, Any]], start_index: int) -> str:
+        """Return the text of the first inline token at or just after a heading opener.
+
+        Only the token at ``start_index`` and the two tokens after it are
+        examined; an empty string is returned when none of them is inline.
+        """
+        window_end = min(start_index + 3, len(tokens))
+        return next(
+            (
+                tokens[idx].get('content', '')
+                for idx in range(start_index, window_end)
+                if tokens[idx].get('type') == 'inline'
+            ),
+            ''
+        )
+
+    def _extract_paragraph_content(self, tokens: List[Dict[str, Any]], start_index: int) -> str:
+        """Return the text of the first inline token at or just after a paragraph opener.
+
+        At most three tokens are inspected, starting at ``start_index``; the
+        result is an empty string when no inline token is found among them.
+        """
+        text = ''
+        for offset in range(3):
+            idx = start_index + offset
+            if idx >= len(tokens):
+                # Ran off the end of the token list before finding text.
+                break
+            candidate = tokens[idx]
+            if candidate.get('type') == 'inline':
+                text = candidate.get('content', '')
+                break
+        return text
+
+    def _extract_list_structure(self, tokens: List[Dict[str, Any]], start_index: int) -> Dict[str, Any]:
+        """Return a placeholder description of the list opened at ``start_index``.
+
+        Neither ``tokens`` nor ``start_index`` is inspected yet: the result is
+        always the same stub with an item estimate of 1. Parsing the nested
+        list structure is left for a fuller implementation.
+        """
+        placeholder = dict(type="list", estimated_items=1)
+        return placeholder
+
+    def _extract_code_block_info(self, token: Dict[str, Any]) -> Dict[str, Any]:
+        """Extract the language and raw content of a code block token.
+
+        Args:
+            token: A code block or fence token; its optional ``info`` string
+                and ``content`` are read.
+
+        Returns:
+            Dictionary with ``language`` (the first whitespace-separated word
+            of ``info``, or '' when there is none) and ``content`` (the
+            token's content, or '' when missing or None).
+        """
+        # Splitting first and then testing the result also covers an info
+        # string made only of whitespace, which is truthy but yields no words
+        # and previously raised IndexError on [0].
+        info_words = (token.get('info') or '').split()
+        return {
+            "language": info_words[0] if info_words else '',
+            # Normalise an explicit None so callers can safely take len().
+            "content": token.get('content') or ''
+        }
+
+    def _extract_blockquote_content(self, tokens: List[Dict[str, Any]], start_index: int) -> str:
+        """Return placeholder text for the blockquote opened at ``start_index``.
+
+        Simplified implementation: the arguments are not inspected and the
+        same fixed string is returned for every blockquote.
+        """
+        placeholder_text = "blockquote content"
+        return placeholder_text
+
+    def _extract_table_structure(self, tokens: List[Dict[str, Any]], start_index: int) -> Dict[str, Any]:
+        """Return placeholder dimensions for the table opened at ``start_index``.
+
+        Simplified implementation: the arguments are not inspected and every
+        table is reported as 2 columns by 1 row.
+        """
+        placeholder_dimensions = dict(columns=2, rows=1)
+        return placeholder_dimensions
+
+    def _analyze_inline_content(self, token: Dict[str, Any]) -> Dict[str, List[Any]]:
+        """Collect links, images and emphasis markers from an inline token.
+
+        Args:
+            token: An inline token whose ``children`` (if any) are scanned.
+
+        Returns:
+            Dictionary with ``links``, ``images`` and ``emphasis`` lists. Each
+            ``link_open`` child adds ``{"type": "link"}``, each ``image`` child
+            adds ``{"type": "image"}``, and each ``em_open`` or ``strong_open``
+            child adds a record carrying that child type.
+        """
+        result: Dict[str, List[Any]] = {
+            "links": [],
+            "images": [],
+            "emphasis": []
+        }
+
+        # `or []` also covers a token whose 'children' key is present but set
+        # to None; a plain .get() default does not apply then and iterating
+        # None would raise TypeError.
+        for child in token.get('children') or []:
+            # Skip empty or non-dict children.
+            if not child or not isinstance(child, dict):
+                continue
+
+            child_type = child.get('type', '')
+            if child_type == 'link_open':
+                result['links'].append({"type": "link"})
+            elif child_type == 'image':
+                result['images'].append({"type": "image"})
+            elif child_type in ('em_open', 'strong_open'):
+                result['emphasis'].append({"type": child_type})
+
+        return result
+
+    def _generate_content_instruction(self, heading_text: str, instruction_type: str) -> str:
+        """
+        Build the instruction sentence stored in a schema's content-instruction field.
+
+        Args:
+            heading_text: Section label inserted, in single quotes, into the sentence
+            instruction_type: Which wording to use ('description', 'example',
+                'constraint' or 'template')
+
+        Returns:
+            The sentence for the given type, or a generic "Content for ..."
+            sentence when the type is not one of the four known values
+        """
+        # One sentence per known instruction type; lookup replaces the
+        # if/elif chain.
+        wording_by_type = {
+            "description": f"Provide content for the '{heading_text}' section",
+            "example": f"Example content for the '{heading_text}' section",
+            "constraint": f"Content must be relevant to '{heading_text}'",
+            "template": f"Template content for '{heading_text}' section",
+        }
+        generic_wording = f"Content for the '{heading_text}' section"
+        return wording_by_type.get(instruction_type, generic_wording)