"""
Schema Validator for Issue #7: Validate a Markdown File Against a Schema.

This module provides functionality to validate markdown documents against JSON schemas
for arc42 architecture documentation compliance checking - essential for intelligent
document analysis and plan-actual comparison capabilities.
"""

import json
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

try:
    import jsonschema
    from jsonschema import SchemaError
    JSONSCHEMA_AVAILABLE = True
except ImportError:
    # Fallback to basic validation without full JSON Schema validation
    JSONSCHEMA_AVAILABLE = False
    SchemaError = Exception

from .parser import parse_markdown_to_ast
from .schema_generator import SchemaGenerator
from .validation_error import ValidationErrorCollector, ValidationErrorType
# NOTE: the project-level FileNotFoundError imported here shadows the builtin of the
# same name for the rest of this module.
from .exceptions import FileNotFoundError, SchemaValidationError, InvalidSchemaError


class SchemaValidator:
    """
    Validates markdown documents against JSON schemas for arc42 compliance checking.

    This service provides boolean validation results for markdown documents against
    schemas, enabling strict compliance checking for architectural documentation
    templates and intelligent plan-actual comparison.

    Two families of entry points exist:

    * ``validate_file_against_schema*`` return a plain ``bool``.
    * ``validate_file_with_errors*`` return a ``ValidationErrorCollector`` that
      describes every deviation found (Issue #8).
    """

    def __init__(self):
        """Initialize the schema validator."""
        self.schema_generator = SchemaGenerator()
        self.jsonschema_available = JSONSCHEMA_AVAILABLE

    # ------------------------------------------------------------------
    # Issue #7: Boolean validation
    # ------------------------------------------------------------------

    def validate_file_against_schema(self, file_path: Path, schema: Dict[str, Any]) -> bool:
        """
        Validate a markdown file against a JSON schema.

        Args:
            file_path: Path to the markdown file
            schema: JSON schema dictionary to validate against

        Returns:
            True if the document matches the schema, False otherwise

        Raises:
            FileNotFoundError: If the markdown file doesn't exist
            InvalidSchemaError: If the schema is invalid
            SchemaValidationError: If no schema could be generated from the document
        """
        # Validate inputs
        if not file_path.exists():
            raise FileNotFoundError(f"Markdown file not found: {file_path}")

        # Validate the schema itself
        self._validate_schema(schema)

        # Generate the document's current structure
        try:
            document_schema = self.schema_generator.generate_schema_from_file(file_path)
        except Exception as e:
            raise SchemaValidationError(f"Failed to generate document schema: {e}") from e

        # Heading text (enum) constraints cannot be checked from the generated schema
        # alone; they need the actual heading content of the document.
        if self._has_heading_text_constraints(schema):
            return self._validate_with_heading_text_constraints(file_path, schema, document_schema)

        # Use standard structure comparison for backward compatibility
        return self._compare_structures(document_schema, schema)

    def validate_file_against_schema_string(self, file_path: Path, schema_json: str) -> bool:
        """
        Validate a markdown file against a JSON schema provided as a string.

        Args:
            file_path: Path to the markdown file
            schema_json: JSON schema as a string

        Returns:
            True if the document matches the schema, False otherwise

        Raises:
            FileNotFoundError: If the markdown file doesn't exist
            InvalidSchemaError: If the schema is invalid JSON or schema
        """
        schema = self._parse_schema_string(schema_json)
        return self.validate_file_against_schema(file_path, schema)

    def validate_file_against_schema_file(self, file_path: Path, schema_file_path: Path) -> bool:
        """
        Validate a markdown file against a schema stored in a file.

        Args:
            file_path: Path to the markdown file
            schema_file_path: Path to the JSON schema file

        Returns:
            True if the document matches the schema, False otherwise

        Raises:
            FileNotFoundError: If either file doesn't exist
            InvalidSchemaError: If the schema file is invalid
        """
        schema = self._load_schema_file(schema_file_path)
        return self.validate_file_against_schema(file_path, schema)

    # ------------------------------------------------------------------
    # Schema loading helpers (shared by both families of entry points)
    # ------------------------------------------------------------------

    @staticmethod
    def _parse_schema_string(schema_json: str) -> Any:
        """
        Decode a JSON schema given as a string.

        Args:
            schema_json: JSON schema as a string

        Returns:
            The decoded JSON value (validated as a schema later by the caller)

        Raises:
            InvalidSchemaError: If the string is not valid JSON
        """
        try:
            return json.loads(schema_json)
        except json.JSONDecodeError as e:
            raise InvalidSchemaError(f"Invalid JSON schema string: {e}") from e

    @staticmethod
    def _load_schema_file(schema_file_path: Path) -> Any:
        """
        Read and decode a JSON schema file.

        Args:
            schema_file_path: Path to the JSON schema file

        Returns:
            The decoded JSON value (validated as a schema later by the caller)

        Raises:
            FileNotFoundError: If the schema file doesn't exist
            InvalidSchemaError: If the file cannot be read, is not valid UTF-8,
                or is not valid JSON
        """
        if not schema_file_path.exists():
            raise FileNotFoundError(f"Schema file not found: {schema_file_path}")

        try:
            schema_content = schema_file_path.read_text(encoding='utf-8')
            return json.loads(schema_content)
        except (OSError, UnicodeDecodeError, json.JSONDecodeError) as e:
            # UnicodeDecodeError is a ValueError, not an OSError, so it has to be
            # listed explicitly; otherwise a mis-encoded file escapes as a raw error.
            raise InvalidSchemaError(f"Failed to load schema file {schema_file_path}: {e}") from e

    def _validate_schema(self, schema: Dict[str, Any]) -> None:
        """
        Validate that a schema is a valid JSON Schema.

        Args:
            schema: Schema dictionary to validate

        Raises:
            InvalidSchemaError: If the schema is invalid
        """
        # The basic checks are performed outside the try block on purpose: in the
        # fallback mode SchemaError is an alias of Exception, and raising inside
        # the try would re-wrap our own InvalidSchemaError with a second prefix.
        if not isinstance(schema, dict):
            raise InvalidSchemaError("Schema must be a dictionary")

        if not schema.get('$schema') or not schema.get('type'):
            raise InvalidSchemaError("Schema must have '$schema' and 'type' fields")

        # If jsonschema library is available, use it for full validation
        if not self.jsonschema_available:
            return

        try:
            jsonschema.validators.validator_for(schema).check_schema(schema)
        except (SchemaError, TypeError, AttributeError) as e:
            raise InvalidSchemaError(f"Invalid JSON schema: {e}") from e

    # ------------------------------------------------------------------
    # Structure comparison (boolean)
    # ------------------------------------------------------------------

    def _compare_structures(self, document_schema: Dict[str, Any], expected_schema: Dict[str, Any]) -> bool:
        """
        Compare a document's actual structure against expected schema requirements.

        This method performs the core validation logic by analyzing whether the
        document's generated schema satisfies the requirements defined in the
        expected schema.

        Args:
            document_schema: Schema generated from the actual document
            expected_schema: Expected schema requirements

        Returns:
            True if the document satisfies the expected schema requirements
        """
        try:
            # Extract actual document structure
            doc_properties = document_schema.get('properties', {})
            expected_properties = expected_schema.get('properties', {})

            # Check all required properties are present
            required_properties = expected_schema.get('required', [])
            if any(prop not in doc_properties for prop in required_properties):
                return False

            # Validate heading structure if specified
            if 'headings' in expected_properties and 'headings' in doc_properties:
                if not self._validate_heading_structure(
                    doc_properties['headings'], expected_properties['headings']
                ):
                    return False

            # Validate other structural elements
            structural_elements = ['paragraphs', 'lists', 'code_blocks', 'blockquotes', 'tables']
            for element in structural_elements:
                if element in expected_properties:
                    if not self._validate_structural_element(
                        doc_properties.get(element), expected_properties[element]
                    ):
                        return False

            return True

        except Exception:
            # Deliberate best-effort: if comparison fails for any reason (e.g. a
            # malformed sub-schema), consider validation failed rather than crash.
            return False

    def _validate_heading_structure(self, actual_headings: Dict[str, Any], expected_headings: Dict[str, Any]) -> bool:
        """
        Validate heading structure against expected requirements.

        Args:
            actual_headings: Actual heading structure from document
            expected_headings: Expected heading requirements

        Returns:
            True if heading structure meets requirements
        """
        actual_heading_props = actual_headings.get('properties', {})
        expected_heading_props = expected_headings.get('properties', {})
        required_heading_levels = expected_headings.get('required', [])

        # Check required heading levels are present
        for level in required_heading_levels:
            if level not in actual_heading_props:
                return False

        # Check each expected heading level meets requirements
        for level, expected_spec in expected_heading_props.items():
            if level not in actual_heading_props:
                if level in required_heading_levels:
                    return False
                # Optional level that the document does not use: nothing to check
                continue

            # Check minimum and maximum item requirements
            if not self._validate_array_constraints(actual_heading_props[level], expected_spec):
                return False

        return True

    def _validate_structural_element(
        self,
        actual_element: Optional[Dict[str, Any]],
        expected_element: Dict[str, Any]
    ) -> bool:
        """
        Validate a structural element (paragraphs, lists, etc.) against requirements.

        Args:
            actual_element: Actual element structure from document, or None when
                the document schema has no entry for the element
            expected_element: Expected element requirements

        Returns:
            True if element meets requirements
        """
        if actual_element is None:
            # Element doesn't exist in document
            return False

        return self._validate_array_constraints(actual_element, expected_element)

    @staticmethod
    def _actual_item_count(actual: Dict[str, Any]) -> Any:
        """
        Read the actual item count out of a generated element specification.

        Args:
            actual: Actual element specification

        Returns:
            The 'maxItems' value, falling back to 'minItems' and then to 0
        """
        # In our generated schemas, min=max=actual count
        return actual.get('maxItems', actual.get('minItems', 0))

    def _validate_array_constraints(self, actual: Dict[str, Any], expected: Dict[str, Any]) -> bool:
        """
        Validate array constraints (minItems, maxItems) for structural elements.

        Args:
            actual: Actual element specification
            expected: Expected element specification

        Returns:
            True if constraints are satisfied
        """
        actual_count = self._actual_item_count(actual)

        # Check against expected constraints; a missing bound is unbounded
        expected_min = expected.get('minItems', 0)
        expected_max = expected.get('maxItems', float('inf'))

        return expected_min <= actual_count <= expected_max

    # ------------------------------------------------------------------
    # Issue #8: Detailed Error Reporting Methods
    # ------------------------------------------------------------------

    def validate_file_with_errors(self, file_path: Path, schema: Dict[str, Any]) -> ValidationErrorCollector:
        """
        Validate a markdown file against a JSON schema and collect detailed errors.

        This method provides comprehensive error reporting for Issue #8, enabling
        users to understand exactly how their documents deviate from schemas.

        Args:
            file_path: Path to the markdown file
            schema: JSON schema dictionary to validate against

        Returns:
            ValidationErrorCollector with all validation errors

        Raises:
            FileNotFoundError: If the markdown file doesn't exist
            InvalidSchemaError: If the schema is invalid
        """
        # Validate inputs
        if not file_path.exists():
            raise FileNotFoundError(f"Markdown file not found: {file_path}")

        # Validate the schema itself
        self._validate_schema(schema)

        # Initialize error collector
        error_collector = ValidationErrorCollector()

        # Generate the document's current structure
        try:
            document_schema = self.schema_generator.generate_schema_from_file(file_path)
        except Exception as e:
            # Unlike the boolean API, a generation failure is reported as an error
            # entry instead of being raised.
            error_collector.add_error(
                ValidationErrorType.STRUCTURAL_VIOLATION,
                f"Failed to generate document schema: {e}",
                "document.structure",
                suggestion="Check if the markdown file is properly formatted"
            )
            return error_collector

        # The structural comparison always runs; heading text (enum) constraints
        # are checked in addition when the schema declares any.
        self._compare_structures_with_errors(document_schema, schema, error_collector)
        if self._has_heading_text_constraints(schema):
            self._validate_heading_text_constraints_with_errors(file_path, schema, error_collector)

        return error_collector

    def validate_file_with_errors_string(self, file_path: Path, schema_json: str) -> ValidationErrorCollector:
        """
        Validate a markdown file against a JSON schema string and collect detailed errors.

        Args:
            file_path: Path to the markdown file
            schema_json: JSON schema as a string

        Returns:
            ValidationErrorCollector with all validation errors

        Raises:
            FileNotFoundError: If the markdown file doesn't exist
            InvalidSchemaError: If the schema is invalid JSON or schema
        """
        schema = self._parse_schema_string(schema_json)
        return self.validate_file_with_errors(file_path, schema)

    def validate_file_with_errors_file(self, file_path: Path, schema_file_path: Path) -> ValidationErrorCollector:
        """
        Validate a markdown file against a schema file and collect detailed errors.

        Args:
            file_path: Path to the markdown file
            schema_file_path: Path to the JSON schema file

        Returns:
            ValidationErrorCollector with all validation errors

        Raises:
            FileNotFoundError: If either file doesn't exist
            InvalidSchemaError: If the schema file is invalid
        """
        schema = self._load_schema_file(schema_file_path)
        return self.validate_file_with_errors(file_path, schema)

    def _compare_structures_with_errors(
        self,
        document_schema: Dict[str, Any],
        expected_schema: Dict[str, Any],
        error_collector: ValidationErrorCollector
    ) -> None:
        """
        Compare document structure against expected schema and collect detailed errors.

        This method performs comprehensive validation analysis, collecting specific
        errors about missing headings, incorrect content counts, and structural
        violations.

        Args:
            document_schema: Schema generated from the actual document
            expected_schema: Expected schema requirements
            error_collector: Collector to accumulate validation errors
        """
        try:
            # Extract actual document structure
            doc_properties = document_schema.get('properties', {})
            expected_properties = expected_schema.get('properties', {})

            # Check all required properties are present
            required_properties = expected_schema.get('required', [])
            for prop in required_properties:
                if prop not in doc_properties:
                    error_collector.add_error(
                        ValidationErrorType.MISSING_REQUIRED_SECTION,
                        f"Missing required section: '{prop}'",
                        f"document.{prop}",
                        expected=f"Section '{prop}' is required by schema",
                        actual="Section not found",
                        suggestion=f"Add the '{prop}' section to your document"
                    )

            # Validate heading structure if specified
            if 'headings' in expected_properties and 'headings' in doc_properties:
                self._validate_heading_structure_with_errors(
                    doc_properties['headings'],
                    expected_properties['headings'],
                    error_collector
                )

            # Validate other structural elements
            structural_elements = ['paragraphs', 'lists', 'code_blocks', 'blockquotes', 'tables']
            for element in structural_elements:
                if element in expected_properties:
                    self._validate_structural_element_with_errors(
                        doc_properties.get(element),
                        expected_properties[element],
                        element,
                        error_collector
                    )

        except Exception as e:
            # Best-effort boundary: report the failure as a validation error
            # instead of letting a malformed schema crash the caller.
            error_collector.add_error(
                ValidationErrorType.STRUCTURAL_VIOLATION,
                f"Error during structure comparison: {e}",
                "document.structure",
                suggestion="Check if both the document and schema are properly formatted"
            )

    def _validate_heading_structure_with_errors(
        self,
        actual_headings: Dict[str, Any],
        expected_headings: Dict[str, Any],
        error_collector: ValidationErrorCollector
    ) -> None:
        """
        Validate heading structure and collect detailed errors.

        Args:
            actual_headings: Actual heading structure from document
            expected_headings: Expected heading requirements
            error_collector: Collector for validation errors
        """
        actual_heading_props = actual_headings.get('properties', {})
        expected_heading_props = expected_headings.get('properties', {})
        required_heading_levels = expected_headings.get('required', [])

        # Check required heading levels are present
        for level in required_heading_levels:
            if level in actual_heading_props:
                continue

            level_num = str(level).replace('level_', '')
            # Keys are expected to look like 'level_<n>'; for anything else fall
            # back to a single '#' so that building the suggestion cannot raise
            # and abort the remaining checks.
            heading_marker = '#' * int(level_num) if level_num.isdecimal() else '#'
            error_collector.add_error(
                ValidationErrorType.MISSING_REQUIRED_HEADING,
                f"Missing required heading level {level_num}",
                f"headings.{level}",
                expected=f"At least one heading at level {level_num}",
                actual="No headings found at this level",
                suggestion=f"Add heading(s) at level {level_num} (e.g., {heading_marker} Heading)"
            )

        # Check each expected heading level meets requirements
        for level, expected_spec in expected_heading_props.items():
            if level not in actual_heading_props:
                # Optional levels are skipped; required ones were reported above
                continue

            level_num = str(level).replace('level_', '')

            # Check minimum and maximum item requirements
            self._validate_array_constraints_with_errors(
                actual_heading_props[level],
                expected_spec,
                f"headings.{level}",
                f"level {level_num} headings",
                error_collector
            )

    def _validate_structural_element_with_errors(
        self,
        actual_element: Optional[Dict[str, Any]],
        expected_element: Dict[str, Any],
        element_name: str,
        error_collector: ValidationErrorCollector
    ) -> None:
        """
        Validate a structural element and collect errors.

        Args:
            actual_element: Actual element structure from document, or None when
                the document schema has no entry for the element
            expected_element: Expected element requirements
            element_name: Name of the structural element (for error messages)
            error_collector: Collector for validation errors
        """
        if actual_element is None:
            error_collector.add_error(
                ValidationErrorType.MISSING_REQUIRED_SECTION,
                f"Missing required structural element: {element_name}",
                f"content.{element_name}",
                expected=f"Document should contain {element_name}",
                actual="Element not found",
                suggestion=f"Add {element_name} to your document"
            )
            return

        self._validate_array_constraints_with_errors(
            actual_element,
            expected_element,
            f"content.{element_name}",
            element_name,
            error_collector
        )

    def _validate_array_constraints_with_errors(
        self,
        actual: Dict[str, Any],
        expected: Dict[str, Any],
        path: str,
        element_description: str,
        error_collector: ValidationErrorCollector
    ) -> None:
        """
        Validate array constraints and collect specific errors.

        Args:
            actual: Actual element specification
            expected: Expected element specification
            path: JSON path for error location
            element_description: Human-readable element description
            error_collector: Collector for validation errors
        """
        actual_count = self._actual_item_count(actual)

        # Check against expected constraints; a missing bound is unbounded
        expected_min = expected.get('minItems', 0)
        expected_max = expected.get('maxItems', float('inf'))

        # Check minimum constraint
        if actual_count < expected_min:
            error_collector.add_error(
                ValidationErrorType.INSUFFICIENT_CONTENT,
                f"Insufficient {element_description}: found {actual_count}, required at least {expected_min}",
                path,
                expected=f"At least {expected_min} {element_description}",
                actual=f"{actual_count} {element_description}",
                suggestion=f"Add {expected_min - actual_count} more {element_description}"
            )

        # Check maximum constraint
        if expected_max != float('inf') and actual_count > expected_max:
            error_collector.add_error(
                ValidationErrorType.EXCESS_CONTENT,
                f"Too many {element_description}: found {actual_count}, maximum allowed {expected_max}",
                path,
                expected=f"At most {expected_max} {element_description}",
                actual=f"{actual_count} {element_description}",
                suggestion=f"Remove {actual_count - expected_max} {element_description}"
            )

    # ------------------------------------------------------------------
    # Heading text (enum) constraints
    # ------------------------------------------------------------------

    @staticmethod
    def _heading_text_enums(schema: Dict[str, Any]) -> List[Tuple[str, Any]]:
        """
        Collect the heading levels whose content is restricted by an enum.

        Looks at ``properties.headings.properties.<level>.items.properties.content``
        for every heading level in the schema.

        Args:
            schema: JSON schema to inspect

        Returns:
            A list of ``(level_key, allowed_texts)`` pairs, one per heading level
            that declares an ``enum`` on its content; empty if there are none
        """
        def child(node: Any, key: str) -> Dict[str, Any]:
            # JSON Schema allows boolean subschemas and list-valued 'items', so
            # every step tolerates non-dict nodes instead of calling .get on them.
            value = node.get(key) if isinstance(node, dict) else None
            return value if isinstance(value, dict) else {}

        heading_levels = child(child(child(schema, 'properties'), 'headings'), 'properties')

        constraints = []
        for level_key, level_spec in heading_levels.items():
            content_spec = child(child(child(level_spec, 'items'), 'properties'), 'content')
            if 'enum' in content_spec:
                constraints.append((level_key, content_spec['enum']))
        return constraints

    def _document_headings(self, file_path: Path) -> Any:
        """
        Parse the markdown file and return its analyzed headings.

        Args:
            file_path: Path to the markdown file

        Returns:
            The 'headings' entry of the schema generator's structure analysis.
            Callers in this class treat it as a mapping from level key to a list
            of heading dicts that each carry a 'content' entry.
        """
        content = file_path.read_text(encoding='utf-8')
        ast_tokens = parse_markdown_to_ast(content)
        structure_analysis = self.schema_generator._analyze_ast_structure(ast_tokens, None)
        return structure_analysis['headings']

    def _has_heading_text_constraints(self, schema: Dict[str, Any]) -> bool:
        """
        Check if the schema has heading text constraints (enum values on heading content).

        Args:
            schema: JSON schema to check

        Returns:
            True if schema has heading text constraints
        """
        return bool(self._heading_text_enums(schema))

    def _validate_with_heading_text_constraints(
        self,
        file_path: Path,
        expected_schema: Dict[str, Any],
        document_schema: Dict[str, Any]
    ) -> bool:
        """
        Validate document with heading text constraints by comparing actual content
        against enum values.

        Args:
            file_path: Path to the markdown file
            expected_schema: Schema with heading text constraints
            document_schema: Generated schema from the actual document

        Returns:
            True if document meets all constraints including heading text
        """
        # First check standard structure compliance
        if not self._compare_structures(document_schema, expected_schema):
            return False

        # Then check heading text constraints
        heading_enums = self._heading_text_enums(expected_schema)
        if not heading_enums:
            return True

        # The generated schema only carries counts, so the actual heading texts
        # have to be extracted from the document itself.
        document_headings = self._document_headings(file_path)

        for level_key, allowed_texts in heading_enums:
            for heading in document_headings.get(level_key, []):
                if heading['content'] not in allowed_texts:
                    return False

        return True

    def _validate_heading_text_constraints_with_errors(
        self,
        file_path: Path,
        expected_schema: Dict[str, Any],
        error_collector: ValidationErrorCollector
    ) -> None:
        """
        Validate heading text constraints and collect detailed errors.

        Args:
            file_path: Path to the markdown file
            expected_schema: Schema with heading text constraints
            error_collector: Collector for validation errors
        """
        heading_enums = self._heading_text_enums(expected_schema)
        if not heading_enums:
            return

        # The actual heading texts have to be extracted from the document itself
        document_headings = self._document_headings(file_path)

        for level_key, allowed_texts in heading_enums:
            # Enum members may be any JSON value, so stringify before joining
            allowed_display = ', '.join(str(text) for text in allowed_texts)

            for i, heading in enumerate(document_headings.get(level_key, [])):
                actual_text = heading['content']
                if actual_text in allowed_texts:
                    continue

                # Add detailed error about heading text mismatch
                error_collector.add_error(
                    ValidationErrorType.HEADING_COUNT_MISMATCH,
                    f"Heading text mismatch at {level_key.replace('_', ' ')} #{i+1}: expected one of {allowed_texts}, found '{actual_text}'",
                    f"headings.{level_key}[{i}].content",
                    expected=f"One of: {allowed_texts}",
                    actual=actual_text,
                    suggestion=f"Change heading text to one of the allowed values: {allowed_display}"
                )