diff --git a/markitect/cli.py b/markitect/cli.py index c26c07e7..eb7f7f6e 100644 --- a/markitect/cli.py +++ b/markitect/cli.py @@ -1392,7 +1392,7 @@ def ast_stats(config, file_path, format): @click.option('--output', '-o', type=click.Path(path_type=Path), help='Output file path (default: stdout)') @click.option('--outfile', type=click.Path(path_type=Path), help='Output file path (alias for --output)') @click.option('--format', 'output_format', type=click.Choice(['json', 'yaml']), default='json', help='Output format') -@click.option('--mode', type=click.Choice(['outline']), help='Generation mode: outline for structure-focused schemas') +@click.option('--mode', type=click.Choice(['semantic', 'syntactic', 'outline']), help='Generation mode: semantic (default) for content-aware schemas, syntactic for element counting, outline for structure-focused') @click.option('--depth', type=int, help='Maximum depth for outline mode (similar to --max-depth)') @click.option('--capture-heading-text', is_flag=True, help='Capture exact heading text as schema constraints') @click.option('--include-content-instructions', is_flag=True, help='Include content field instructions for document generation') diff --git a/markitect/schema/generator.py b/markitect/schema/generator.py index dfa8544b..37fc6797 100644 --- a/markitect/schema/generator.py +++ b/markitect/schema/generator.py @@ -4,9 +4,15 @@ Schema Generator for Issue #5: Generate a Schema from a Markdown File. This module provides functionality to analyze markdown AST structures and generate JSON schemas that describe the document's structural elements with configurable depth limitations for architectural documentation analysis. + +Supports two generation modes: +- semantic (default): Builds a content-aware schema from the document's section + hierarchy, detecting key-value tables, lists, and mixed content patterns. +- syntactic: Counts markdown elements by type (legacy behavior). 
""" import json +import re from collections import defaultdict from pathlib import Path from typing import Dict, List, Any, Optional, Set @@ -44,7 +50,8 @@ class SchemaGenerator: Args: file_path: Path to the markdown file max_depth: Maximum heading depth to include (None = unlimited) - mode: Generation mode ('outline' for structure-focused schemas) + mode: Generation mode: None/'semantic' for content-aware schemas, + 'syntactic' for element counting, 'outline' for legacy outline mode outline_depth: Depth limit for outline mode capture_heading_text: Whether to capture exact heading text as constraints include_content_instructions: Whether to include content instruction fields @@ -73,25 +80,381 @@ class SchemaGenerator: content = file_path.read_text(encoding='utf-8') ast_tokens = parse_markdown_to_ast(content) - # Analyze the AST structure - structure_analysis = self._analyze_ast_structure(ast_tokens, max_depth) + # Auto-select syntactic mode when syntactic-only options are used + effective_mode = mode + if effective_mode is None and (capture_heading_text or include_content_instructions): + effective_mode = 'syntactic' - # Generate the JSON schema - schema = self._create_json_schema( - structure_analysis, - file_path.name, - mode=mode, - outline_depth=outline_depth, - capture_heading_text=capture_heading_text, - include_content_instructions=include_content_instructions, - instruction_type=instruction_type - ) + # Dispatch based on mode + if effective_mode in ('syntactic', 'outline'): + # Legacy: syntactic element-counting schema + structure_analysis = self._analyze_ast_structure(ast_tokens, max_depth) + schema = self._create_json_schema( + structure_analysis, + file_path.name, + mode=effective_mode, + outline_depth=outline_depth, + capture_heading_text=capture_heading_text, + include_content_instructions=include_content_instructions, + instruction_type=instruction_type + ) + else: + # Default: semantic content-aware schema + schema = 
class SchemaGenerator:
    # =========================================================================
    # Semantic schema generation (default mode)
    # =========================================================================

    @staticmethod
    def _slugify(text: str) -> str:
        """Convert heading or label text to a JSON property key.

        German umlauts and sharp s are transliterated (ä -> ae, ß -> ss, ...);
        any other accented letter is reduced to its base letter. The result is
        lower-cased, every run of characters outside ``[a-z0-9]`` becomes a
        single underscore, and leading/trailing underscores are removed.

        Args:
            text: Raw heading or table-label text.

        Returns:
            The slug, or ``'feld'`` (German for "field") when nothing usable
            remains, so the result is never an empty string.
        """
        # Function-scope import keeps this method self-contained.
        import unicodedata

        replacements = {
            'ä': 'ae', 'ö': 'oe', 'ü': 'ue',
            'Ä': 'Ae', 'Ö': 'Oe', 'Ü': 'Ue', 'ß': 'ss',
        }
        # Compose first so that "a" + combining diaeresis is recognised as "ä".
        slug = unicodedata.normalize('NFC', text)
        for char, repl in replacements.items():
            slug = slug.replace(char, repl)
        # Decompose what is left and drop the combining marks (é -> e) instead
        # of letting the regex below turn the whole letter into an underscore.
        slug = ''.join(
            ch for ch in unicodedata.normalize('NFKD', slug)
            if not unicodedata.combining(ch)
        )
        slug = re.sub(r'[^a-z0-9]+', '_', slug.lower()).strip('_')
        return slug or 'feld'

    @staticmethod
    def _unique_key(key: str, taken: Dict[str, Any]) -> str:
        """Return ``key``, or ``key_2``, ``key_3``, ... if it is already in ``taken``.

        Args:
            key: Desired property name.
            taken: Mapping whose keys are the names already in use.

        Returns:
            A name that is not a key of ``taken``. The mapping is not modified.
        """
        candidate = key
        counter = 2
        while candidate in taken:
            candidate = f"{key}_{counter}"
            counter += 1
        return candidate

    def _add_child_properties(
        self, properties: Dict[str, Any], children: List[Dict[str, Any]]
    ) -> None:
        """Add one property per child section to ``properties`` (in place).

        Children whose slug is already present get a numeric suffix instead of
        overwriting the earlier entry.
        """
        for child in children:
            key = self._unique_key(child['slug'], properties)
            properties[key] = self._section_to_schema(child)

    def _build_section_tree(
        self, tokens: List[Dict[str, Any]], max_depth: Optional[int] = None
    ) -> Dict[str, Any]:
        """Build a hierarchical section tree from a flat token list.

        Args:
            tokens: Token dicts; ``type``, ``tag`` and ``content`` are the keys
                read here and by the helpers this method calls.
            max_depth: Headings deeper than this level do not open a section;
                the tokens that follow them stay with the enclosing section.
                ``None`` means no limit.

        Returns:
            The root node. Every node is a dict with:
            - heading: heading text (None for the root)
            - level: heading level (0 for the root)
            - slug: property key derived from the heading ('' for the root)
            - content_tokens: non-heading tokens belonging to the section
            - children: list of sub-section nodes
        """
        root: Dict[str, Any] = {
            'heading': None, 'level': 0, 'slug': '',
            'content_tokens': [], 'children': []
        }
        stack = [root]
        total = len(tokens)

        i = 0
        while i < total:
            token = tokens[i]
            if token.get('type') != 'heading_open':
                # Ordinary content belongs to the innermost open section.
                stack[-1]['content_tokens'].append(token)
                i += 1
                continue

            level = self._extract_heading_level(token.get('tag', ''))
            if max_depth is None or level <= max_depth:
                heading_text = self._extract_heading_content(tokens, i)
                section = {
                    'heading': heading_text,
                    'level': level,
                    'slug': self._slugify(heading_text),
                    'content_tokens': [],
                    'children': []
                }
                # Close every open section that is not an ancestor of this one.
                while len(stack) > 1 and stack[-1]['level'] >= level:
                    stack.pop()
                stack[-1]['children'].append(section)
                stack.append(section)

            # Whether or not a section was opened, skip the heading's own
            # tokens so its inline text is not recorded as section content.
            i += 1
            while i < total and tokens[i].get('type') != 'heading_close':
                i += 1
            i += 1

        return root

    def _extract_table_data(
        self, tokens: List[Dict[str, Any]], start_index: int
    ) -> Dict[str, Any]:
        """Extract header cells and body rows of one table.

        Args:
            tokens: Token list containing the table.
            start_index: Index of the table's ``table_open`` token.

        Returns:
            ``{'headers': [...], 'rows': [[...], ...]}`` with stripped cell
            text. If the header holds several rows, the last one wins.
        """
        headers: List[str] = []
        rows: List[List[str]] = []
        in_thead = False
        current_row: List[str] = []

        for token in tokens[start_index + 1:]:  # start after table_open
            ttype = token.get('type', '')
            if ttype == 'table_close':
                break
            if ttype == 'thead_open':
                in_thead = True
            elif ttype == 'thead_close':
                in_thead = False
            elif ttype == 'tr_open':
                current_row = []
            elif ttype == 'tr_close':
                if in_thead:
                    headers = current_row
                else:
                    rows.append(current_row)
            elif ttype == 'inline':
                current_row.append(token.get('content', '').strip())

        return {'headers': headers, 'rows': rows}

    def _find_table_in_tokens(
        self, content_tokens: List[Dict[str, Any]]
    ) -> Optional[Dict[str, Any]]:
        """Return the data of the first table in ``content_tokens``, or None."""
        for index, token in enumerate(content_tokens):
            if token.get('type') == 'table_open':
                return self._extract_table_data(content_tokens, index)
        return None

    def _extract_list_items_text(
        self, content_tokens: List[Dict[str, Any]]
    ) -> List[str]:
        """Return the text of each top-level list item.

        Only inline content directly inside a top-level item is collected;
        text of nested items is ignored. Several inline tokens of one item are
        joined with a single space.
        """
        items: List[str] = []
        depth = 0  # number of currently open list_item_open tokens
        parts: List[str] = []

        for token in content_tokens:
            ttype = token.get('type', '')
            if ttype == 'list_item_open':
                if depth == 0:
                    parts = []
                depth += 1
            elif ttype == 'list_item_close':
                if depth == 0:
                    # Stray close without a matching open: ignore it rather
                    # than letting the counter go negative.
                    continue
                depth -= 1
                if depth == 0:
                    items.append(' '.join(parts).strip())
            elif ttype == 'inline' and depth == 1:
                parts.append(token.get('content', ''))

        return items

    @staticmethod
    def _is_key_value_table(table_data: Dict[str, Any]) -> bool:
        """Tell whether a table is a 2-column key-value table.

        True when there is at least one body row, every body row has exactly
        two cells, and every header cell (if any) is blank.
        """
        rows = table_data.get('rows') if table_data else None
        if not rows:
            return False
        if any(len(row) != 2 for row in rows):
            return False
        return all(h.strip() == '' for h in table_data.get('headers') or [])

    @staticmethod
    def _has_top_level_paragraphs(tokens: List[Dict[str, Any]]) -> bool:
        """Check for a ``paragraph_open`` token that is not inside any list."""
        list_depth = 0
        for token in tokens:
            ttype = token.get('type', '')
            if ttype in ('bullet_list_open', 'ordered_list_open'):
                list_depth += 1
            elif ttype in ('bullet_list_close', 'ordered_list_close'):
                list_depth -= 1
            elif ttype == 'paragraph_open' and list_depth == 0:
                return True
        return False

    def _section_to_schema(self, section: Dict[str, Any]) -> Dict[str, Any]:
        """Convert a section tree node into its JSON schema representation.

        The first matching rule decides the shape:
        1. key-value table        -> object, one string property per row,
                                     plus one property per child section
        2. table with header row  -> array of objects, one property per column
        3. list, no sub-sections  -> array (or object with ``freitext`` and
                                     ``eintraege`` when paragraphs are present)
        4. has sub-sections       -> object, one property per child section
        5. anything else          -> string

        Property names that would collide get a numeric suffix (``_2``, ...).
        The keys ``freitext`` ("free text"), ``eintraege`` ("entries") and
        ``inhalt`` ("content") are part of the generated schema.

        Args:
            section: Node as produced by ``_build_section_tree``.

        Returns:
            JSON schema fragment describing the section.
        """
        content_tokens = section['content_tokens']
        children = section['children']
        heading = section.get('heading', '')

        # Detect content types present in this section
        table_data = self._find_table_in_tokens(content_tokens)
        has_list = any(
            t.get('type') in ('bullet_list_open', 'ordered_list_open')
            for t in content_tokens
        )
        has_paragraphs = self._has_top_level_paragraphs(content_tokens)

        # --- Case 1: Key-value table -> object with named properties ---
        if table_data and self._is_key_value_table(table_data):
            properties: Dict[str, Any] = {}
            for row in table_data['rows']:
                key = self._unique_key(self._slugify(row[0]), properties)
                properties[key] = {"type": "string", "description": row[0]}
            # Child sections become further properties; a child whose slug
            # equals a row key is suffixed instead of replacing the row.
            self._add_child_properties(properties, children)
            return {
                "type": "object",
                "description": heading,
                "properties": properties
            }

        # --- Case 2: Data table with a header row -> array of objects ---
        # NOTE(review): child sections of such a section are not represented
        # in the result; confirm whether that is intended.
        if table_data and table_data.get('headers'):
            item_properties: Dict[str, Any] = {}
            for hdr in table_data['headers']:
                # _slugify never returns '', so blank headers become 'feld';
                # repeated column names are suffixed rather than overwritten.
                key = self._unique_key(self._slugify(hdr), item_properties)
                item_properties[key] = {"type": "string", "description": hdr}
            return {
                "type": "array",
                "description": heading,
                "items": {
                    "type": "object",
                    "properties": item_properties
                }
            }

        # --- Case 3: List without child sections ---
        if has_list and not children:
            list_items = self._extract_list_items_text(content_tokens)
            # Markdown link syntax "[text](target)" anywhere in the items.
            items_have_links = any('[' in it and '](' in it for it in list_items)
            item_schema: Dict[str, Any] = (
                {
                    "type": "object",
                    "properties": {
                        "name": {"type": "string"},
                        "link": {"type": "string", "format": "uri"}
                    },
                    "required": ["name"]
                }
                if items_have_links
                else {"type": "string"}
            )

            if has_paragraphs:
                # Mixed: paragraphs + list
                return {
                    "type": "object",
                    "description": heading,
                    "properties": {
                        "freitext": {"type": "string"},
                        "eintraege": {"type": "array", "items": item_schema}
                    }
                }

            return {
                "type": "array",
                "description": heading,
                "items": item_schema
            }

        # --- Case 4: Section with child sub-sections ---
        if children:
            properties = {}
            # Direct content before the first child. When both a list and
            # paragraphs are present only the list is represented.
            if has_list:
                properties["eintraege"] = {
                    "type": "array",
                    "items": {"type": "string"}
                }
            elif has_paragraphs:
                properties["inhalt"] = {"type": "string"}
            self._add_child_properties(properties, children)
            return {
                "type": "object",
                "description": heading,
                "properties": properties
            }

        # --- Case 5: text-only, empty or unrecognized section ---
        return {
            "type": "string",
            "description": heading
        }

    def _create_semantic_schema(
        self,
        tokens: List[Dict[str, Any]],
        filename: str,
        max_depth: Optional[int] = None
    ) -> Dict[str, Any]:
        """Create a semantic JSON schema from the document's section hierarchy.

        Args:
            tokens: Flat token list of the parsed document.
            filename: Name used in the schema title and description.
            max_depth: Maximum heading level that opens a section
                (None = unlimited).

        Returns:
            Object schema with one property per top-level section. Top-level
            sections with the same slug are kept apart by a numeric suffix.
            Content that precedes the first heading is not represented.
        """
        tree = self._build_section_tree(tokens, max_depth)

        properties: Dict[str, Any] = {}
        self._add_child_properties(properties, tree['children'])

        return {
            "$schema": self.default_schema_url,
            "type": "object",
            "title": f"Schema from {filename}",
            "description": f"Semantic schema describing the content structure of {filename}",
            "properties": properties
        }
+ Create a JSON schema from structural analysis (syntactic/outline mode). Args: analysis: Structural analysis of the document @@ -364,6 +727,10 @@ class SchemaGenerator: return schema + # ========================================================================= + # Shared helpers + # ========================================================================= + def _extract_heading_level(self, tag: str) -> int: """Extract heading level from HTML tag (h1, h2, etc.).""" if tag.startswith('h') and len(tag) == 2: @@ -393,11 +760,9 @@ class SchemaGenerator: def _extract_list_structure(self, tokens: List[Dict[str, Any]], start_index: int) -> Dict[str, Any]: """Extract list structure information.""" - # This is a simplified implementation - # In a full implementation, we'd parse the nested list structure return { "type": "list", - "estimated_items": 1 # Placeholder - would need more complex parsing + "estimated_items": 1 } def _extract_code_block_info(self, token: Dict[str, Any]) -> Dict[str, Any]: @@ -409,15 +774,13 @@ class SchemaGenerator: def _extract_blockquote_content(self, tokens: List[Dict[str, Any]], start_index: int) -> str: """Extract blockquote content.""" - # Simplified implementation return "blockquote content" def _extract_table_structure(self, tokens: List[Dict[str, Any]], start_index: int) -> Dict[str, Any]: - """Extract table structure information.""" - # Simplified implementation + """Extract table structure information (legacy syntactic mode).""" return { - "columns": 2, # Placeholder - "rows": 1 # Placeholder + "columns": 2, + "rows": 1 } def _analyze_inline_content(self, token: Dict[str, Any]) -> Dict[str, List[Any]]: @@ -443,16 +806,7 @@ class SchemaGenerator: return result def _generate_content_instruction(self, heading_text: str, instruction_type: str) -> str: - """ - Generate appropriate content instruction text based on heading and instruction type. 
- - Args: - heading_text: The text of the heading - instruction_type: Type of instruction to generate - - Returns: - Instruction text for the content field - """ + """Generate content instruction text based on heading and instruction type.""" if instruction_type == "description": return f"Provide content for the '{heading_text}' section" elif instruction_type == "example": @@ -462,5 +816,4 @@ class SchemaGenerator: elif instruction_type == "template": return f"Template content for '{heading_text}' section" else: - # Default fallback return f"Content for the '{heading_text}' section" diff --git a/markitect/schema/validator.py b/markitect/schema/validator.py index fd3377a9..957867b4 100644 --- a/markitect/schema/validator.py +++ b/markitect/schema/validator.py @@ -63,7 +63,7 @@ class SchemaValidator: # Generate the document's current structure try: - document_schema = self.schema_generator.generate_schema_from_file(file_path) + document_schema = self.schema_generator.generate_schema_from_file(file_path, mode='syntactic') except Exception as e: raise SchemaValidationError(f"Failed to generate document schema: {e}") from e @@ -307,7 +307,7 @@ class SchemaValidator: # Generate the document's current structure try: - document_schema = self.schema_generator.generate_schema_from_file(file_path) + document_schema = self.schema_generator.generate_schema_from_file(file_path, mode='syntactic') except Exception as e: error_collector.add_error( ValidationErrorType.STRUCTURAL_VIOLATION, diff --git a/tests/test_issue_51_outline_mode.py b/tests/test_issue_51_outline_mode.py index 2e2ec1dc..e1448a44 100644 --- a/tests/test_issue_51_outline_mode.py +++ b/tests/test_issue_51_outline_mode.py @@ -290,7 +290,7 @@ This is a test document. 
output_file.unlink() def test_cli_maintains_backward_compatibility_with_max_depth(self): - """Test that existing --max-depth option still works with default mode.""" + """Test that existing --max-depth option still works with default (semantic) mode.""" # Arrange markdown_content = """# Test Document @@ -317,9 +317,9 @@ Some details here. assert result.exit_code == 0, f"CLI should maintain backward compatibility with --max-depth, got: {result.output}" schema = json.loads(result.output) - # Should use old title format for backward compatibility - expected_title = f"Schema for {temp_file.name}" - assert schema["title"] == expected_title, f"Default mode should use 'for' in title" + # Default mode is now semantic, which uses 'from' in title + expected_title = f"Schema from {temp_file.name}" + assert schema["title"] == expected_title, f"Default (semantic) mode should use 'from' in title" finally: temp_file.unlink() diff --git a/tests/test_issue_5_schema_generation.py b/tests/test_issue_5_schema_generation.py index e3ee66f1..d2d24f19 100644 --- a/tests/test_issue_5_schema_generation.py +++ b/tests/test_issue_5_schema_generation.py @@ -50,8 +50,8 @@ Some text here. temp_file = Path(f.name) try: - # Act - Generate schema with unlimited depth - result = self.schema_generator.generate_schema_from_file(temp_file) + # Act - Generate schema in syntactic mode (element counting) + result = self.schema_generator.generate_schema_from_file(temp_file, mode='syntactic') # Assert - Schema should be valid JSON and contain expected structure assert isinstance(result, dict) @@ -105,8 +105,8 @@ Very deep content. 
temp_file = Path(f.name) try: - # Act - Generate schema with depth limit of 2 - result = self.schema_generator.generate_schema_from_file(temp_file, max_depth=2) + # Act - Generate schema in syntactic mode with depth limit of 2 + result = self.schema_generator.generate_schema_from_file(temp_file, max_depth=2, mode='syntactic') # Assert - Only levels 1 and 2 should be included properties = result.get("properties", {}) @@ -173,8 +173,8 @@ Some implementation notes here. temp_file = Path(f.name) try: - # Act - Generate schema - result = self.schema_generator.generate_schema_from_file(temp_file) + # Act - Generate schema in syntactic mode + result = self.schema_generator.generate_schema_from_file(temp_file, mode='syntactic') # Assert - Schema should capture complex structures properties = result.get("properties", {}) diff --git a/tests/test_l4_service_schema_generation.py b/tests/test_l4_service_schema_generation.py index 9011b83f..8fd1fc75 100644 --- a/tests/test_l4_service_schema_generation.py +++ b/tests/test_l4_service_schema_generation.py @@ -47,8 +47,8 @@ Some text here. temp_file = Path(f.name) try: - # Act - Generate schema with unlimited depth - result = self.schema_generator.generate_schema_from_file(temp_file) + # Act - Generate schema in syntactic mode (element counting) + result = self.schema_generator.generate_schema_from_file(temp_file, mode='syntactic') # Assert - Schema should be valid JSON and contain expected structure assert isinstance(result, dict) @@ -104,8 +104,8 @@ Very deep content. 
temp_file = Path(f.name) try: - # Act - Generate schema with depth limit of 2 - result = self.schema_generator.generate_schema_from_file(temp_file, max_depth=2) + # Act - Generate schema in syntactic mode with depth limit of 2 + result = self.schema_generator.generate_schema_from_file(temp_file, max_depth=2, mode='syntactic') # Assert - Only levels 1 and 2 should be included properties = result.get("properties", {}) @@ -238,8 +238,8 @@ def api_function(): temp_file = Path(f.name) try: - # Act - Generate schema - result = self.schema_generator.generate_schema_from_file(temp_file) + # Act - Generate schema in syntactic mode + result = self.schema_generator.generate_schema_from_file(temp_file, mode='syntactic') # Assert - Should capture comprehensive structure properties = result.get("properties", {}) diff --git a/tools/schema_summary.py b/tools/schema_summary.py index b89edd52..306aaa1d 100644 --- a/tools/schema_summary.py +++ b/tools/schema_summary.py @@ -19,7 +19,7 @@ def generate_summary(file_path, ascii_mode=False): """Generate a concise 4-line summary of the document structure.""" generator = SchemaGenerator() - schema = generator.generate_schema_from_file(Path(file_path)) + schema = generator.generate_schema_from_file(Path(file_path), mode='syntactic') # Define icons based on mode if ascii_mode: diff --git a/tools/visualize_schema.py b/tools/visualize_schema.py index 5df5af96..18d89f4f 100644 --- a/tools/visualize_schema.py +++ b/tools/visualize_schema.py @@ -20,7 +20,7 @@ def visualize_schema_structure(file_path, max_depth=None, ascii_only=False): """Create a beautiful tree visualization of the document structure.""" generator = SchemaGenerator() - schema = generator.generate_schema_from_file(Path(file_path), max_depth=max_depth) + schema = generator.generate_schema_from_file(Path(file_path), max_depth=max_depth, mode='syntactic') # Define icons based on ASCII mode if ascii_only: