Files
markitect-main/markitect/schema_validator.py
tegwick 0f37900222 feat: Complete Issue #52 - Capture actual heading text in schemas
Implement comprehensive heading text capture functionality that allows schemas to
enforce specific heading text requirements through enum constraints:

• New CLI option: --capture-heading-text flag for exact text constraints
• Schema generation with heading text as enum constraints (not just structure)
• Advanced validation engine that enforces heading text requirements
• Metaschema extension: x-markitect-heading-text-capture marker
• Full integration with Issue #51 outline mode capabilities
• Comprehensive error reporting for heading text mismatches
• Complete backward compatibility with existing schema generation

Technical implementation:
- Extended SchemaGenerator with capture_heading_text parameter
- Enhanced validation system to check enum constraints on heading content
- Added _validate_heading_text_constraints_with_errors for detailed reporting
- Integrated with existing metaschema validation from Issue #50
- Preserved document order of headings in enum constraints

Key features:
- Schemas can now specify required heading text via enum constraints
- Validation rejects documents with incorrect heading text
- Detailed error messages show expected vs actual heading text
- Works seamlessly with outline mode depth controls
- Maintains 100% compatibility with 513 existing tests

Usage examples:
  markitect schema-generate --capture-heading-text document.md
  markitect schema-generate --mode outline --capture-heading-text --depth 2 document.md

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-01 08:03:11 +02:00

682 lines
27 KiB
Python

"""
Schema Validator for Issue #7: Validate a Markdown File Against a Schema.
This module provides functionality to validate markdown documents against JSON schemas
for arc42 architecture documentation compliance checking - essential for intelligent
document analysis and plan-actual comparison capabilities.
"""
import json
from pathlib import Path
from typing import Dict, Any, Union
try:
import jsonschema
from jsonschema import validate, ValidationError, SchemaError
JSONSCHEMA_AVAILABLE = True
except ImportError:
# Fallback to basic validation without full JSON Schema validation
JSONSCHEMA_AVAILABLE = False
ValidationError = Exception
SchemaError = Exception
from .parser import parse_markdown_to_ast
from .schema_generator import SchemaGenerator
from .validation_error import ValidationErrorCollector, ValidationErrorType
from .exceptions import FileNotFoundError, SchemaValidationError, InvalidSchemaError
class SchemaValidator:
    """
    Check markdown documents against JSON schemas (arc42 compliance checking).

    The service answers, as a boolean or as a collection of detailed errors,
    whether a markdown document matches a given schema. It is intended for
    strict compliance checks of architectural documentation templates and
    for plan-actual comparison.
    """

    def __init__(self):
        """Record jsonschema availability and create the schema generator."""
        # Module-level flag set by the guarded ``import jsonschema``.
        self.jsonschema_available = JSONSCHEMA_AVAILABLE
        # Generator used to derive a schema from the document under test.
        self.schema_generator = SchemaGenerator()
def validate_file_against_schema(self, file_path: Path, schema: Dict[str, Any]) -> bool:
    """
    Check whether a markdown file satisfies a JSON schema.

    Args:
        file_path: Location of the markdown document.
        schema: Schema dictionary the document is checked against.

    Returns:
        True when the document satisfies the schema, False otherwise.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        InvalidSchemaError: If ``schema`` is rejected by ``_validate_schema``.
        SchemaValidationError: If no schema can be generated for the document.
    """
    # Guard: the document has to exist before anything else is attempted.
    if not file_path.exists():
        raise FileNotFoundError(f"Markdown file not found: {file_path}")

    # Guard: reject malformed schemas up front.
    self._validate_schema(schema)

    # Describe the document as a schema of its own.
    try:
        actual_schema = self.schema_generator.generate_schema_from_file(file_path)
    except Exception as e:
        raise SchemaValidationError(f"Failed to generate document schema: {e}") from e

    # Without enum constraints on heading content the plain structural
    # comparison is used (backward-compatible path).
    if not self._has_heading_text_constraints(schema):
        return self._compare_structures(actual_schema, schema)

    # Otherwise the actual heading text has to be compared against the enums.
    return self._validate_with_heading_text_constraints(file_path, schema, actual_schema)
def validate_file_against_schema_string(self, file_path: Path, schema_json: str) -> bool:
    """
    Check a markdown file against a schema supplied as JSON text.

    Args:
        file_path: Location of the markdown document.
        schema_json: The JSON schema serialized as a string.

    Returns:
        The result of ``validate_file_against_schema`` for the parsed schema.

    Raises:
        FileNotFoundError: If the markdown file doesn't exist.
        InvalidSchemaError: If ``schema_json`` is not valid JSON, or the
            parsed schema is rejected downstream.
    """
    try:
        parsed_schema = json.loads(schema_json)
    except json.JSONDecodeError as e:
        raise InvalidSchemaError(f"Invalid JSON schema string: {e}") from e
    else:
        # Parsing succeeded: hand over to the dictionary-based entry point.
        return self.validate_file_against_schema(file_path, parsed_schema)
def validate_file_against_schema_file(self, file_path: Path, schema_file_path: Path) -> bool:
    """
    Validate a markdown file against a schema stored in a file.

    Args:
        file_path: Path to the markdown file
        schema_file_path: Path to the JSON schema file

    Returns:
        True if the document matches the schema, False otherwise

    Raises:
        FileNotFoundError: If either file doesn't exist
        InvalidSchemaError: If the schema file cannot be read, is not valid
            UTF-8, or does not contain valid JSON / a valid schema
    """
    if not schema_file_path.exists():
        raise FileNotFoundError(f"Schema file not found: {schema_file_path}")

    try:
        schema_content = schema_file_path.read_text(encoding='utf-8')
        schema = json.loads(schema_content)
    except (OSError, UnicodeDecodeError, json.JSONDecodeError) as e:
        # OSError is what IOError aliases in Python 3. UnicodeDecodeError is a
        # ValueError (not an OSError), so it must be listed explicitly or a
        # badly encoded schema file would escape the documented contract.
        raise InvalidSchemaError(f"Failed to load schema file {schema_file_path}: {e}") from e

    return self.validate_file_against_schema(file_path, schema)
def _validate_schema(self, schema: Dict[str, Any]) -> None:
"""
Validate that a schema is a valid JSON Schema.
Args:
schema: Schema dictionary to validate
Raises:
InvalidSchemaError: If the schema is invalid
"""
try:
# Check basic schema structure
if not isinstance(schema, dict):
raise InvalidSchemaError("Schema must be a dictionary")
# Basic schema validation
if not schema.get('$schema') or not schema.get('type'):
raise InvalidSchemaError("Schema must have '$schema' and 'type' fields")
# If jsonschema library is available, use it for full validation
if self.jsonschema_available:
jsonschema.validators.validator_for(schema).check_schema(schema)
except (SchemaError, TypeError, AttributeError) as e:
raise InvalidSchemaError(f"Invalid JSON schema: {e}") from e
def _compare_structures(self, document_schema: Dict[str, Any], expected_schema: Dict[str, Any]) -> bool:
"""
Compare a document's actual structure against expected schema requirements.
This method performs the core validation logic by analyzing whether the
document's generated schema satisfies the requirements defined in the
expected schema.
Args:
document_schema: Schema generated from the actual document
expected_schema: Expected schema requirements
Returns:
True if the document satisfies the expected schema requirements
"""
try:
# Extract actual document structure
doc_properties = document_schema.get('properties', {})
expected_properties = expected_schema.get('properties', {})
# Check all required properties are present
required_properties = expected_schema.get('required', [])
for prop in required_properties:
if prop not in doc_properties:
return False
# Validate heading structure if specified
if 'headings' in expected_properties and 'headings' in doc_properties:
if not self._validate_heading_structure(
doc_properties['headings'],
expected_properties['headings']
):
return False
# Validate other structural elements
structural_elements = ['paragraphs', 'lists', 'code_blocks', 'blockquotes', 'tables']
for element in structural_elements:
if element in expected_properties:
if not self._validate_structural_element(
doc_properties.get(element),
expected_properties[element]
):
return False
return True
except Exception:
# If comparison fails for any reason, consider validation failed
return False
def _validate_heading_structure(self, actual_headings: Dict[str, Any], expected_headings: Dict[str, Any]) -> bool:
"""
Validate heading structure against expected requirements.
Args:
actual_headings: Actual heading structure from document
expected_headings: Expected heading requirements
Returns:
True if heading structure meets requirements
"""
actual_heading_props = actual_headings.get('properties', {})
expected_heading_props = expected_headings.get('properties', {})
required_heading_levels = expected_headings.get('required', [])
# Check required heading levels are present
for level in required_heading_levels:
if level not in actual_heading_props:
return False
# Check each expected heading level meets requirements
for level, expected_spec in expected_heading_props.items():
if level not in actual_heading_props:
# If level is not required, skip it
if level not in required_heading_levels:
continue
return False
actual_spec = actual_heading_props[level]
# Check minimum and maximum item requirements
if not self._validate_array_constraints(actual_spec, expected_spec):
return False
return True
def _validate_structural_element(self, actual_element: Dict[str, Any], expected_element: Dict[str, Any]) -> bool:
"""
Validate a structural element (paragraphs, lists, etc.) against requirements.
Args:
actual_element: Actual element structure from document
expected_element: Expected element requirements
Returns:
True if element meets requirements
"""
if actual_element is None:
# Element doesn't exist in document
return False
return self._validate_array_constraints(actual_element, expected_element)
def _validate_array_constraints(self, actual: Dict[str, Any], expected: Dict[str, Any]) -> bool:
"""
Validate array constraints (minItems, maxItems) for structural elements.
Args:
actual: Actual element specification
expected: Expected element specification
Returns:
True if constraints are satisfied
"""
# Get actual count from the schema specification
# For generated schemas, we use minItems/maxItems which represent actual counts
actual_min = actual.get('minItems', 0)
actual_max = actual.get('maxItems', actual_min)
actual_count = actual_max # In our generated schemas, min=max=actual count
# Check against expected constraints
expected_min = expected.get('minItems', 0)
expected_max = expected.get('maxItems', float('inf'))
return expected_min <= actual_count <= expected_max
# Issue #8: Detailed Error Reporting Methods
def validate_file_with_errors(self, file_path: Path, schema: Dict[str, Any]) -> ValidationErrorCollector:
    """
    Validate a markdown file against a schema and gather detailed errors.

    Instead of a boolean, the caller receives a collector describing how the
    document deviates from the schema (Issue #8).

    Args:
        file_path: Location of the markdown document.
        schema: Schema dictionary the document is checked against.

    Returns:
        ValidationErrorCollector holding every error that was recorded. A
        failure to generate the document's schema is reported as a single
        structural-violation entry rather than raised.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        InvalidSchemaError: If ``schema`` is rejected by ``_validate_schema``.
    """
    # Guard: the document has to exist.
    if not file_path.exists():
        raise FileNotFoundError(f"Markdown file not found: {file_path}")

    # Guard: reject malformed schemas up front.
    self._validate_schema(schema)

    collected = ValidationErrorCollector()

    # Describe the document as a schema of its own.
    try:
        actual_schema = self.schema_generator.generate_schema_from_file(file_path)
    except Exception as e:
        collected.add_error(
            ValidationErrorType.STRUCTURAL_VIOLATION,
            f"Failed to generate document schema: {e}",
            "document.structure",
            suggestion="Check if the markdown file is properly formatted"
        )
        return collected

    # Decide first whether enum constraints on heading text are present,
    # then always run the structural comparison.
    check_heading_text = self._has_heading_text_constraints(schema)
    self._compare_structures_with_errors(actual_schema, schema, collected)

    # Heading text is checked in addition, only when the schema asks for it.
    if check_heading_text:
        self._validate_heading_text_constraints_with_errors(file_path, schema, collected)

    return collected
def validate_file_with_errors_string(self, file_path: Path, schema_json: str) -> ValidationErrorCollector:
    """
    Validate a markdown file against JSON schema text and gather detailed errors.

    Args:
        file_path: Location of the markdown document.
        schema_json: The JSON schema serialized as a string.

    Returns:
        The collector produced by ``validate_file_with_errors`` for the
        parsed schema.

    Raises:
        FileNotFoundError: If the markdown file doesn't exist.
        InvalidSchemaError: If ``schema_json`` is not valid JSON, or the
            parsed schema is rejected downstream.
    """
    try:
        parsed_schema = json.loads(schema_json)
    except json.JSONDecodeError as e:
        raise InvalidSchemaError(f"Invalid JSON schema string: {e}") from e
    else:
        # Parsing succeeded: hand over to the dictionary-based entry point.
        return self.validate_file_with_errors(file_path, parsed_schema)
def validate_file_with_errors_file(self, file_path: Path, schema_file_path: Path) -> ValidationErrorCollector:
    """
    Validate a markdown file against a schema file and collect detailed errors.

    Args:
        file_path: Path to the markdown file
        schema_file_path: Path to the JSON schema file

    Returns:
        ValidationErrorCollector with all validation errors

    Raises:
        FileNotFoundError: If either file doesn't exist
        InvalidSchemaError: If the schema file cannot be read, is not valid
            UTF-8, or does not contain valid JSON / a valid schema
    """
    if not schema_file_path.exists():
        raise FileNotFoundError(f"Schema file not found: {schema_file_path}")

    try:
        schema_content = schema_file_path.read_text(encoding='utf-8')
        schema = json.loads(schema_content)
    except (OSError, UnicodeDecodeError, json.JSONDecodeError) as e:
        # OSError is what IOError aliases in Python 3. UnicodeDecodeError is a
        # ValueError (not an OSError), so it must be listed explicitly or a
        # badly encoded schema file would escape the documented contract.
        raise InvalidSchemaError(f"Failed to load schema file {schema_file_path}: {e}") from e

    return self.validate_file_with_errors(file_path, schema)
def _compare_structures_with_errors(
self,
document_schema: Dict[str, Any],
expected_schema: Dict[str, Any],
error_collector: ValidationErrorCollector
) -> None:
"""
Compare document structure against expected schema and collect detailed errors.
This method performs comprehensive validation analysis, collecting specific
errors about missing headings, incorrect content counts, and structural violations.
Args:
document_schema: Schema generated from the actual document
expected_schema: Expected schema requirements
error_collector: Collector to accumulate validation errors
"""
try:
# Extract actual document structure
doc_properties = document_schema.get('properties', {})
expected_properties = expected_schema.get('properties', {})
# Check all required properties are present
required_properties = expected_schema.get('required', [])
for prop in required_properties:
if prop not in doc_properties:
error_collector.add_error(
ValidationErrorType.MISSING_REQUIRED_SECTION,
f"Missing required section: '{prop}'",
f"document.{prop}",
expected=f"Section '{prop}' is required by schema",
actual="Section not found",
suggestion=f"Add the '{prop}' section to your document"
)
# Validate heading structure if specified
if 'headings' in expected_properties and 'headings' in doc_properties:
self._validate_heading_structure_with_errors(
doc_properties['headings'],
expected_properties['headings'],
error_collector
)
# Validate other structural elements
structural_elements = ['paragraphs', 'lists', 'code_blocks', 'blockquotes', 'tables']
for element in structural_elements:
if element in expected_properties:
self._validate_structural_element_with_errors(
doc_properties.get(element),
expected_properties[element],
element,
error_collector
)
except Exception as e:
error_collector.add_error(
ValidationErrorType.STRUCTURAL_VIOLATION,
f"Error during structure comparison: {e}",
"document.structure",
suggestion="Check if both the document and schema are properly formatted"
)
def _validate_heading_structure_with_errors(
self,
actual_headings: Dict[str, Any],
expected_headings: Dict[str, Any],
error_collector: ValidationErrorCollector
) -> None:
"""
Validate heading structure and collect detailed errors.
Args:
actual_headings: Actual heading structure from document
expected_headings: Expected heading requirements
error_collector: Collector for validation errors
"""
actual_heading_props = actual_headings.get('properties', {})
expected_heading_props = expected_headings.get('properties', {})
required_heading_levels = expected_headings.get('required', [])
# Check required heading levels are present
for level in required_heading_levels:
if level not in actual_heading_props:
level_num = level.replace('level_', '')
error_collector.add_error(
ValidationErrorType.MISSING_REQUIRED_HEADING,
f"Missing required heading level {level_num}",
f"headings.{level}",
expected=f"At least one heading at level {level_num}",
actual="No headings found at this level",
suggestion=f"Add heading(s) at level {level_num} (e.g., {'#' * int(level_num)} Heading)"
)
# Check each expected heading level meets requirements
for level, expected_spec in expected_heading_props.items():
if level not in actual_heading_props:
# If level is not required, skip it
if level not in required_heading_levels:
continue
# Already handled above in required check
else:
actual_spec = actual_heading_props[level]
level_num = level.replace('level_', '')
# Check minimum and maximum item requirements
self._validate_array_constraints_with_errors(
actual_spec,
expected_spec,
f"headings.{level}",
f"level {level_num} headings",
error_collector
)
def _validate_structural_element_with_errors(
self,
actual_element: Dict[str, Any],
expected_element: Dict[str, Any],
element_name: str,
error_collector: ValidationErrorCollector
) -> None:
"""
Validate a structural element and collect errors.
Args:
actual_element: Actual element structure from document
expected_element: Expected element requirements
element_name: Name of the structural element (for error messages)
error_collector: Collector for validation errors
"""
if actual_element is None:
error_collector.add_error(
ValidationErrorType.MISSING_REQUIRED_SECTION,
f"Missing required structural element: {element_name}",
f"content.{element_name}",
expected=f"Document should contain {element_name}",
actual="Element not found",
suggestion=f"Add {element_name} to your document"
)
return
self._validate_array_constraints_with_errors(
actual_element,
expected_element,
f"content.{element_name}",
element_name,
error_collector
)
def _validate_array_constraints_with_errors(
self,
actual: Dict[str, Any],
expected: Dict[str, Any],
path: str,
element_description: str,
error_collector: ValidationErrorCollector
) -> None:
"""
Validate array constraints and collect specific errors.
Args:
actual: Actual element specification
expected: Expected element specification
path: JSON path for error location
element_description: Human-readable element description
error_collector: Collector for validation errors
"""
# Get actual count from the schema specification
actual_min = actual.get('minItems', 0)
actual_max = actual.get('maxItems', actual_min)
actual_count = actual_max # In our generated schemas, min=max=actual count
# Check against expected constraints
expected_min = expected.get('minItems', 0)
expected_max = expected.get('maxItems', float('inf'))
# Check minimum constraint
if actual_count < expected_min:
error_collector.add_error(
ValidationErrorType.INSUFFICIENT_CONTENT,
f"Insufficient {element_description}: found {actual_count}, required at least {expected_min}",
path,
expected=f"At least {expected_min} {element_description}",
actual=f"{actual_count} {element_description}",
suggestion=f"Add {expected_min - actual_count} more {element_description}"
)
# Check maximum constraint
if expected_max != float('inf') and actual_count > expected_max:
error_collector.add_error(
ValidationErrorType.EXCESS_CONTENT,
f"Too many {element_description}: found {actual_count}, maximum allowed {expected_max}",
path,
expected=f"At most {expected_max} {element_description}",
actual=f"{actual_count} {element_description}",
suggestion=f"Remove {actual_count - expected_max} {element_description}"
)
def _has_heading_text_constraints(self, schema: Dict[str, Any]) -> bool:
"""
Check if the schema has heading text constraints (enum values on heading content).
Args:
schema: JSON schema to check
Returns:
True if schema has heading text constraints
"""
headings_props = schema.get('properties', {}).get('headings', {}).get('properties', {})
for level_props in headings_props.values():
items = level_props.get('items', {})
content_prop = items.get('properties', {}).get('content', {})
if 'enum' in content_prop:
return True
return False
def _validate_with_heading_text_constraints(
self,
file_path: Path,
expected_schema: Dict[str, Any],
document_schema: Dict[str, Any]
) -> bool:
"""
Validate document with heading text constraints by comparing actual content against enum values.
Args:
file_path: Path to the markdown file
expected_schema: Schema with heading text constraints
document_schema: Generated schema from the actual document
Returns:
True if document meets all constraints including heading text
"""
# First check standard structure compliance
if not self._compare_structures(document_schema, expected_schema):
return False
# Then check heading text constraints
expected_headings = expected_schema.get('properties', {}).get('headings', {}).get('properties', {})
# Generate document analysis with actual heading content
from .parser import parse_markdown_to_ast
content = file_path.read_text(encoding='utf-8')
ast_tokens = parse_markdown_to_ast(content)
structure_analysis = self.schema_generator._analyze_ast_structure(ast_tokens, None)
for level_key, expected_level_spec in expected_headings.items():
content_constraints = expected_level_spec.get('items', {}).get('properties', {}).get('content', {})
if 'enum' in content_constraints:
allowed_texts = content_constraints['enum']
actual_headings = structure_analysis['headings'].get(level_key, [])
for heading in actual_headings:
actual_text = heading['content']
if actual_text not in allowed_texts:
return False
return True
def _validate_heading_text_constraints_with_errors(
    self,
    file_path: Path,
    expected_schema: Dict[str, Any],
    error_collector: ValidationErrorCollector
) -> None:
    """
    Validate heading text constraints and collect detailed errors.

    For every heading level whose schema declares an 'enum' on the item
    'content' property, each heading found at that level is compared with
    the allowed values; every mismatch is recorded.

    Args:
        file_path: Path to the markdown file
        expected_schema: Schema with heading text constraints
        error_collector: Collector for validation errors
    """
    expected_headings = expected_schema.get('properties', {}).get('headings', {}).get('properties', {})

    # Generate document analysis with actual heading content.
    # parse_markdown_to_ast is imported at module level, so no local
    # re-import is needed here.
    content = file_path.read_text(encoding='utf-8')
    ast_tokens = parse_markdown_to_ast(content)
    structure_analysis = self.schema_generator._analyze_ast_structure(ast_tokens, None)

    for level_key, expected_level_spec in expected_headings.items():
        content_constraints = expected_level_spec.get('items', {}).get('properties', {}).get('content', {})
        if 'enum' not in content_constraints:
            continue

        allowed_texts = content_constraints['enum']
        # JSON Schema enums may hold non-string values (numbers, null, ...);
        # stringify before joining so reporting a mismatch cannot itself
        # fail with a TypeError.
        allowed_display = ', '.join(str(text) for text in allowed_texts)
        actual_headings = structure_analysis['headings'].get(level_key, [])

        for i, heading in enumerate(actual_headings):
            actual_text = heading['content']
            if actual_text in allowed_texts:
                continue

            # NOTE(review): HEADING_COUNT_MISMATCH is used for a *text*
            # mismatch; confirm whether ValidationErrorType offers (or
            # should offer) a dedicated member for this case.
            error_collector.add_error(
                ValidationErrorType.HEADING_COUNT_MISMATCH,
                f"Heading text mismatch at {level_key.replace('_', ' ')} #{i+1}: expected one of {allowed_texts}, found '{actual_text}'",
                f"headings.{level_key}[{i}].content",
                expected=f"One of: {allowed_texts}",
                actual=actual_text,
                suggestion=f"Change heading text to one of the allowed values: {allowed_display}"
            )