Files
markitect-main/markitect/schema_generator.py
tegwick 0004fa2a0f feat: Implement Issue #54 - Add content field instruction capabilities
This implementation adds comprehensive support for content field instructions
that provide guidance for document generation from schemas.

## Key Features Added:

### CLI Options
- `--include-content-instructions` flag to enable content instruction fields
- `--instruction-type` parameter with options: description, example, constraint, template
- Full integration with existing outline mode and heading text capture features

### Schema Generation Enhancements
- Content instruction fields (x-markitect-content-instructions) with contextual guidance text
- Instruction type metadata (x-markitect-instruction-type) for type specification
- Metaschema extension (x-markitect-content-instructions-enabled) for feature detection
- Support for headings, paragraphs, and lists content instructions

### Error Handling
- InvalidInstructionTypeError for robust validation of instruction type parameters
- Comprehensive input validation with clear error messages

### Integration and Compatibility
- Seamless integration with outline mode and heading text capture
- Full backward compatibility - existing behavior unchanged when feature disabled
- Works with all existing CLI options and modes

### Documentation
- Updated CLI help with examples and detailed feature descriptions
- Clear documentation of all instruction types and their purposes

## Technical Implementation:
- Enhanced SchemaGenerator with content instruction generation logic
- Added `_generate_content_instruction` method for contextual instruction text
- Extended schema structure to include instruction metadata
- Maintained clean separation of concerns and existing code patterns

## Testing and Validation:
- Comprehensive test coverage following TDD8 methodology
- All existing functionality preserved and tested
- Integration tests for all feature combinations
- Error handling and edge case validation

This completes Issue #54 with full feature implementation, documentation,
and comprehensive testing coverage.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-01 08:21:42 +02:00

466 lines
19 KiB
Python

"""
Schema Generator for Issue #5: Generate a Schema from a Markdown File.
This module provides functionality to analyze markdown AST structures and generate
JSON schemas that describe the document's structural elements with configurable
depth limitations for architectural documentation analysis.
"""
import json
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Any, Optional, Set
from .parser import parse_markdown_to_ast
from .exceptions import FileNotFoundError, InvalidDepthError, InvalidInstructionTypeError
class SchemaGenerator:
    """
    Generates JSON schemas from markdown file AST structures.

    Analyzes the structural elements of markdown documents and creates
    JSON schemas that can be used for validation and compliance checking
    in architecture documentation workflows.
    """

    # Accepted values for ``instruction_type``. A tuple (not a set) so that the
    # validation error message lists them in the same order on every run.
    _VALID_INSTRUCTION_TYPES = ('description', 'example', 'constraint', 'template')

    # How many tokens, starting at a block's opening token, are inspected when
    # looking for the 'inline' token that carries the block's text.
    _INLINE_LOOKAHEAD = 3

    def __init__(self):
        """Initialize the schema generator."""
        self.default_schema_url = "http://json-schema.org/draft-07/schema#"

    def generate_schema_from_file(
        self,
        file_path: Path,
        max_depth: Optional[int] = None,
        mode: Optional[str] = None,
        outline_depth: Optional[int] = None,
        capture_heading_text: bool = False,
        include_content_instructions: bool = False,
        instruction_type: str = 'description'
    ) -> Dict[str, Any]:
        """
        Generate a JSON schema from a markdown file's AST structure.

        Args:
            file_path: Path to the markdown file
            max_depth: Maximum heading depth to include (None = unlimited)
            mode: Generation mode ('outline' for structure-focused schemas)
            outline_depth: Depth limit for outline mode
            capture_heading_text: Whether to capture exact heading text as constraints
            include_content_instructions: Whether to include content instruction fields
            instruction_type: Type of content instructions ('description', 'example',
                'constraint', 'template')

        Returns:
            JSON schema as a dictionary

        Raises:
            FileNotFoundError: If the markdown file doesn't exist
            InvalidDepthError: If max_depth is invalid (< 1)
            InvalidInstructionTypeError: If instruction_type is not one of the
                supported values (checked even when content instructions are
                disabled)
        """
        # Validate inputs before touching the file contents.
        if not file_path.exists():
            raise FileNotFoundError(f"Markdown file not found: {file_path}")
        if max_depth is not None and max_depth < 1:
            raise InvalidDepthError(f"max_depth must be >= 1, got: {max_depth}")
        if instruction_type not in self._VALID_INSTRUCTION_TYPES:
            raise InvalidInstructionTypeError(
                f"Invalid instruction type '{instruction_type}'. "
                f"Must be one of: {', '.join(self._VALID_INSTRUCTION_TYPES)}"
            )

        # Read and parse the markdown file.
        content = file_path.read_text(encoding='utf-8')
        ast_tokens = parse_markdown_to_ast(content)

        # Analyze the AST structure, then turn the analysis into a schema.
        structure_analysis = self._analyze_ast_structure(ast_tokens, max_depth)
        return self._create_json_schema(
            structure_analysis,
            file_path.name,
            mode=mode,
            outline_depth=outline_depth,
            capture_heading_text=capture_heading_text,
            include_content_instructions=include_content_instructions,
            instruction_type=instruction_type
        )

    def _analyze_ast_structure(self, tokens: List[Dict[str, Any]], max_depth: Optional[int]) -> Dict[str, Any]:
        """
        Analyze AST tokens to extract structural patterns.

        Args:
            tokens: List of AST tokens (dicts with at least a 'type' key)
            max_depth: Maximum heading depth to record (None = unlimited).
                Only heading entries are filtered; other elements are always
                recorded.

        Returns:
            Dictionary with one entry per element kind ('headings' is a
            mapping of 'level_N' to heading records, the others are lists)
            plus 'structure_types', a sorted list of every token type seen.
        """
        analysis: Dict[str, Any] = {
            'headings': defaultdict(list),
            'paragraphs': [],
            'lists': [],
            'code_blocks': [],
            'blockquotes': [],
            'tables': [],
            'links': [],
            'images': [],
            'emphasis': [],
            'structure_types': set()
        }
        # Level of the most recent *recorded* heading; 0 means "before any heading".
        current_heading_level = 0

        for i, token in enumerate(tokens):
            token_type = token.get('type', '')

            # Track all structural types found
            analysis['structure_types'].add(token_type)

            if token_type == 'heading_open':
                level = self._extract_heading_level(token.get('tag', ''))
                # Headings deeper than max_depth are skipped entirely and do
                # not change the current heading context.
                if max_depth is None or level <= max_depth:
                    analysis['headings'][f'level_{level}'].append({
                        'content': self._extract_heading_content(tokens, i),
                        'level': level,
                        'position': i
                    })
                    current_heading_level = level

            elif token_type == 'paragraph_open':
                analysis['paragraphs'].append({
                    'content': self._extract_paragraph_content(tokens, i),
                    'position': i,
                    'under_heading_level': current_heading_level
                })

            elif token_type in ('bullet_list_open', 'ordered_list_open'):
                analysis['lists'].append({
                    'type': 'bullet' if token_type == 'bullet_list_open' else 'ordered',
                    'structure': self._extract_list_structure(tokens, i),
                    'position': i,
                    'under_heading_level': current_heading_level
                })

            elif token_type in ('code_block', 'fence'):
                code_info = self._extract_code_block_info(token)
                analysis['code_blocks'].append({
                    'language': code_info.get('language', ''),
                    'content_length': len(code_info.get('content', '')),
                    'position': i,
                    'under_heading_level': current_heading_level
                })

            elif token_type == 'blockquote_open':
                analysis['blockquotes'].append({
                    'content': self._extract_blockquote_content(tokens, i),
                    'position': i,
                    'under_heading_level': current_heading_level
                })

            elif token_type == 'table_open':
                table_structure = self._extract_table_structure(tokens, i)
                analysis['tables'].append({
                    'columns': table_structure.get('columns', 0),
                    'rows': table_structure.get('rows', 0),
                    'position': i,
                    'under_heading_level': current_heading_level
                })

            elif token_type == 'inline':
                inline_analysis = self._analyze_inline_content(token)
                analysis['links'].extend(inline_analysis.get('links', []))
                analysis['images'].extend(inline_analysis.get('images', []))
                analysis['emphasis'].extend(inline_analysis.get('emphasis', []))

        # Convert the set to a list for JSON serialization. Sorting matters:
        # set iteration order for strings varies between interpreter runs, and
        # this list is later embedded in the schema as an order-sensitive value.
        analysis['structure_types'] = sorted(analysis['structure_types'])
        return analysis

    def _create_json_schema(
        self,
        analysis: Dict[str, Any],
        filename: str,
        mode: Optional[str] = None,
        outline_depth: Optional[int] = None,
        capture_heading_text: bool = False,
        include_content_instructions: bool = False,
        instruction_type: str = 'description'
    ) -> Dict[str, Any]:
        """
        Create a JSON schema from structural analysis.

        Args:
            analysis: Structural analysis of the document
            filename: Name of the source file
            mode: Generation mode ('outline' for structure-focused schemas)
            outline_depth: Depth limit for outline mode
            capture_heading_text: Whether to capture exact heading text as constraints
            include_content_instructions: Whether to include content instruction fields
            instruction_type: Type of content instructions to generate

        Returns:
            JSON schema dictionary
        """
        # Determine title format based on mode
        title_preposition = "from" if mode == "outline" else "for"
        schema: Dict[str, Any] = {
            "$schema": self.default_schema_url,
            "type": "object",
            "title": f"Schema {title_preposition} {filename}",
            "description": f"JSON schema describing the structure of {filename}",
            "properties": {}
        }

        # Metaschema extensions advertise which optional features were used.
        if mode == "outline":
            schema["x-markitect-outline-mode"] = True
            if outline_depth is not None:
                schema["x-markitect-outline-depth"] = outline_depth
        if capture_heading_text:
            schema["x-markitect-heading-text-capture"] = True
        if include_content_instructions:
            schema["x-markitect-content-instructions-enabled"] = True

        # Add heading structure
        heading_properties: Dict[str, Any] = {}
        for level_key, headings in analysis['headings'].items():
            if not headings:  # Only include levels that have content
                continue

            if capture_heading_text:
                # Heading texts in document order, duplicates removed: the
                # JSON Schema spec says enum elements should be unique, and
                # removing repeats does not change what validates.
                heading_texts = list(dict.fromkeys(heading['content'] for heading in headings))
                content_property: Dict[str, Any] = {"enum": heading_texts}
            else:
                content_property = {"type": "string"}

            item_properties: Dict[str, Any] = {
                "content": content_property,
                "level": {"type": "integer"},
                "position": {"type": "integer"}
            }
            if include_content_instructions:
                # level_key has the form 'level_N'
                level_num = int(level_key.split('_')[1])
                item_properties.update(
                    self._build_instruction_properties(f"level {level_num} heading", instruction_type)
                )

            heading_properties[level_key] = {
                "type": "array",
                "description": f"Headings at {level_key.replace('_', ' ')}",
                "items": {
                    "type": "object",
                    "properties": item_properties,
                    "required": ["content", "level"]
                },
                "minItems": len(headings),
                "maxItems": len(headings)
            }

        if heading_properties:
            schema["properties"]["headings"] = {
                "type": "object",
                "description": "Document heading structure",
                "properties": heading_properties
            }

        # Add other structural elements
        structural_elements = {
            "paragraphs": ("Text paragraphs", analysis['paragraphs']),
            "lists": ("Lists (ordered and unordered)", analysis['lists']),
            "code_blocks": ("Code blocks and fenced code", analysis['code_blocks']),
            "blockquotes": ("Block quotations", analysis['blockquotes']),
            "tables": ("Tables with rows and columns", analysis['tables']),
            "links": ("Links to external resources", analysis['links']),
            "images": ("Embedded images", analysis['images']),
            "emphasis": ("Text emphasis (bold, italic)", analysis['emphasis'])
        }
        for element_name, (description, element_list) in structural_elements.items():
            if not element_list:
                continue
            element_schema: Dict[str, Any] = {
                "type": "array",
                "description": description,
                "minItems": len(element_list),
                "maxItems": len(element_list)
            }
            # Content instructions are only generated for paragraphs and lists.
            if include_content_instructions and element_name in ("paragraphs", "lists"):
                element_schema["items"] = {
                    "type": "object",
                    "properties": {
                        "content": {"type": "string"},
                        **self._build_instruction_properties(element_name, instruction_type)
                    }
                }
            schema["properties"][element_name] = element_schema

        # Total number of recorded elements: every heading plus every entry of
        # the element lists above. 'structure_types' is deliberately excluded
        # because it holds token-type names, not document elements.
        total_elements = sum(len(headings) for headings in analysis['headings'].values())
        total_elements += sum(len(element_list) for _, element_list in structural_elements.values())

        # Add metadata
        schema["properties"]["metadata"] = {
            "type": "object",
            "description": "Document structure metadata",
            "properties": {
                "total_elements": {
                    "type": "integer",
                    "const": total_elements
                },
                "structure_types": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "All structural element types found",
                    "const": analysis['structure_types']
                }
            }
        }
        return schema

    def _build_instruction_properties(self, section_name: str, instruction_type: str) -> Dict[str, Any]:
        """Build the two schema properties that carry a content instruction and its type."""
        return {
            "x-markitect-content-instructions": {
                "type": "string",
                "const": self._generate_content_instruction(section_name, instruction_type)
            },
            "x-markitect-instruction-type": {
                "type": "string",
                "enum": [instruction_type]
            }
        }

    def _extract_heading_level(self, tag: str) -> int:
        """
        Extract heading level from HTML tag (h1, h2, etc.).

        Returns 1 when the tag is not of the form 'h<digit>'.
        """
        if tag.startswith('h') and len(tag) == 2:
            try:
                return int(tag[1])
            except ValueError:
                pass
        return 1

    def _first_inline_content(self, tokens: List[Dict[str, Any]], start_index: int) -> str:
        """Return the content of the first 'inline' token near start_index, or '' if none."""
        for token in tokens[start_index:start_index + self._INLINE_LOOKAHEAD]:
            if token.get('type') == 'inline':
                return token.get('content', '')
        return ''

    def _extract_heading_content(self, tokens: List[Dict[str, Any]], start_index: int) -> str:
        """Extract text content from heading tokens, starting at the heading's opening token."""
        return self._first_inline_content(tokens, start_index)

    def _extract_paragraph_content(self, tokens: List[Dict[str, Any]], start_index: int) -> str:
        """Extract text content from paragraph tokens, starting at the paragraph's opening token."""
        return self._first_inline_content(tokens, start_index)

    def _extract_list_structure(self, tokens: List[Dict[str, Any]], start_index: int) -> Dict[str, Any]:
        """
        Extract list structure information.

        Placeholder: the arguments are currently ignored and a fixed
        description is returned; nested list parsing is not implemented.
        """
        return {
            "type": "list",
            "estimated_items": 1  # Placeholder - would need more complex parsing
        }

    def _extract_code_block_info(self, token: Dict[str, Any]) -> Dict[str, Any]:
        """
        Extract code block information.

        Returns a dict with 'language' (first word of the token's 'info'
        string, or '' when it is missing, empty, or only whitespace) and
        'content' (the token's 'content', default '').
        """
        # split() on a whitespace-only string yields [], so guard before indexing.
        info_words = (token.get('info') or '').split()
        return {
            "language": info_words[0] if info_words else '',
            "content": token.get('content', '')
        }

    def _extract_blockquote_content(self, tokens: List[Dict[str, Any]], start_index: int) -> str:
        """
        Extract blockquote content.

        Placeholder: the arguments are currently ignored and a fixed string
        is returned.
        """
        return "blockquote content"

    def _extract_table_structure(self, tokens: List[Dict[str, Any]], start_index: int) -> Dict[str, Any]:
        """
        Extract table structure information.

        Placeholder: the arguments are currently ignored and fixed
        dimensions are returned.
        """
        return {
            "columns": 2,  # Placeholder
            "rows": 1  # Placeholder
        }

    def _analyze_inline_content(self, token: Dict[str, Any]) -> Dict[str, List[Any]]:
        """
        Analyze inline content for links, images, emphasis.

        Inspects the token's 'children' and records one entry per link
        opening, image, and emphasis/strong opening. Non-dict or empty
        children are skipped.
        """
        result: Dict[str, List[Any]] = {
            "links": [],
            "images": [],
            "emphasis": []
        }
        # 'children' may be absent or explicitly None; treat both as "no children".
        for child in token.get('children') or []:
            if not child or not isinstance(child, dict):
                continue
            child_type = child.get('type', '')
            if child_type == 'link_open':
                result['links'].append({"type": "link"})
            elif child_type == 'image':
                result['images'].append({"type": "image"})
            elif child_type in ('em_open', 'strong_open'):
                result['emphasis'].append({"type": child_type})
        return result

    def _generate_content_instruction(self, heading_text: str, instruction_type: str) -> str:
        """
        Generate appropriate content instruction text based on heading and instruction type.

        Args:
            heading_text: The text of the heading (or the name of the section/element)
            instruction_type: Type of instruction to generate

        Returns:
            Instruction text for the content field; an unrecognized
            instruction_type yields a generic fallback sentence.
        """
        templates = {
            "description": "Provide content for the '{}' section",
            "example": "Example content for the '{}' section",
            "constraint": "Content must be relevant to '{}'",
            "template": "Template content for '{}' section",
        }
        # Default fallback for unknown instruction types
        template = templates.get(instruction_type, "Content for the '{}' section")
        return template.format(heading_text)