# markitect-main/markitect/schema_validator.py

"""
Schema Validator for Issue #7: Validate a Markdown File Against a Schema.

This module provides functionality to validate markdown documents against JSON schemas
for arc42 architecture documentation compliance checking - essential for intelligent
document analysis and plan-actual comparison capabilities.
"""

import json
from pathlib import Path
from typing import Dict, Any, Union

try:
    import jsonschema
    from jsonschema import validate, ValidationError, SchemaError
    JSONSCHEMA_AVAILABLE = True
except ImportError:
    # Fallback to basic validation without full JSON Schema validation
    JSONSCHEMA_AVAILABLE = False
    ValidationError = Exception
    SchemaError = Exception

from .parser import parse_markdown_to_ast
from .schema_generator import SchemaGenerator
from .validation_error import ValidationErrorCollector, ValidationErrorType
from .exceptions import FileNotFoundError, SchemaValidationError, InvalidSchemaError


class SchemaValidator:
    """
    Validates markdown documents against JSON schemas for arc42 compliance checking.

    This service provides boolean validation results for markdown documents against
    schemas, enabling strict compliance checking for architectural documentation
    templates and intelligent plan-actual comparison.
    """

    def __init__(self):
        """Initialize the schema validator."""
        self.schema_generator = SchemaGenerator()
        self.jsonschema_available = JSONSCHEMA_AVAILABLE

    def validate_file_against_schema(self, file_path: Path, schema: Dict[str, Any]) -> bool:
        """
        Validate a markdown file against a JSON schema.

        Args:
            file_path: Path to the markdown file
            schema: JSON schema dictionary to validate against

        Returns:
            True if the document matches the schema, False otherwise

        Raises:
            FileNotFoundError: If the markdown file doesn't exist
            InvalidSchemaError: If the schema is invalid
        """
        # Validate inputs
        if not file_path.exists():
            raise FileNotFoundError(f"Markdown file not found: {file_path}")

        # Validate the schema itself
        self._validate_schema(schema)

        # Generate the document's current structure
        try:
            document_schema = self.schema_generator.generate_schema_from_file(file_path)
        except Exception as e:
            raise SchemaValidationError(f"Failed to generate document schema: {e}") from e

        # Check if the expected schema has heading text constraints
        if self._has_heading_text_constraints(schema):
            # For heading text validation, we need to extract actual content and compare against enum constraints
            return self._validate_with_heading_text_constraints(file_path, schema, document_schema)
        else:
            # Use standard structure comparison for backward compatibility
            return self._compare_structures(document_schema, schema)

    def validate_file_against_schema_string(self, file_path: Path, schema_json: str) -> bool:
        """
        Validate a markdown file against a JSON schema provided as a string.

        Args:
            file_path: Path to the markdown file
            schema_json: JSON schema as a string

        Returns:
            True if the document matches the schema, False otherwise

        Raises:
            FileNotFoundError: If the markdown file doesn't exist
            InvalidSchemaError: If the schema is invalid JSON or schema
        """
        try:
            schema = json.loads(schema_json)
        except json.JSONDecodeError as e:
            raise InvalidSchemaError(f"Invalid JSON schema string: {e}") from e

        return self.validate_file_against_schema(file_path, schema)

    def validate_file_against_schema_file(self, file_path: Path, schema_file_path: Path) -> bool:
        """
        Validate a markdown file against a schema stored in a file.

        Args:
            file_path: Path to the markdown file
            schema_file_path: Path to the JSON schema file

        Returns:
            True if the document matches the schema, False otherwise

        Raises:
            FileNotFoundError: If either file doesn't exist
            InvalidSchemaError: If the schema file is invalid
        """
        if not schema_file_path.exists():
            raise FileNotFoundError(f"Schema file not found: {schema_file_path}")

        try:
            schema_content = schema_file_path.read_text(encoding='utf-8')
            schema = json.loads(schema_content)
        except (IOError, json.JSONDecodeError) as e:
            raise InvalidSchemaError(f"Failed to load schema file {schema_file_path}: {e}") from e

        return self.validate_file_against_schema(file_path, schema)

    def _validate_schema(self, schema: Dict[str, Any]) -> None:
        """
        Validate that a schema is a valid JSON Schema.

        Args:
            schema: Schema dictionary to validate

        Raises:
            InvalidSchemaError: If the schema is invalid
        """
        try:
            # Check basic schema structure
            if not isinstance(schema, dict):
                raise InvalidSchemaError("Schema must be a dictionary")

            # Basic schema validation
            if not schema.get('$schema') or not schema.get('type'):
                raise InvalidSchemaError("Schema must have '$schema' and 'type' fields")

            # If jsonschema library is available, use it for full validation
            if self.jsonschema_available:
                jsonschema.validators.validator_for(schema).check_schema(schema)

        except (SchemaError, TypeError, AttributeError) as e:
            raise InvalidSchemaError(f"Invalid JSON schema: {e}") from e

    def _compare_structures(self, document_schema: Dict[str, Any], expected_schema: Dict[str, Any]) -> bool:
        """
        Compare a document's actual structure against expected schema requirements.

        This method performs the core validation logic by analyzing whether the
        document's generated schema satisfies the requirements defined in the
        expected schema.

        Args:
            document_schema: Schema generated from the actual document
            expected_schema: Expected schema requirements

        Returns:
            True if the document satisfies the expected schema requirements
        """
        try:
            # Extract actual document structure
            doc_properties = document_schema.get('properties', {})
            expected_properties = expected_schema.get('properties', {})

            # Check all required properties are present
            required_properties = expected_schema.get('required', [])
            for prop in required_properties:
                if prop not in doc_properties:
                    return False

            # Validate heading structure if specified
            if 'headings' in expected_properties and 'headings' in doc_properties:
                if not self._validate_heading_structure(
                    doc_properties['headings'],
                    expected_properties['headings']
                ):
                    return False

            # Validate other structural elements
            structural_elements = ['paragraphs', 'lists', 'code_blocks', 'blockquotes', 'tables']
            for element in structural_elements:
                if element in expected_properties:
                    if not self._validate_structural_element(
                        doc_properties.get(element),
                        expected_properties[element]
                    ):
                        return False

            return True

        except Exception:
            # If comparison fails for any reason, consider validation failed
            return False

    def _validate_heading_structure(self, actual_headings: Dict[str, Any], expected_headings: Dict[str, Any]) -> bool:
        """
        Validate heading structure against expected requirements.

        Args:
            actual_headings: Actual heading structure from document
            expected_headings: Expected heading requirements

        Returns:
            True if heading structure meets requirements
        """
        actual_heading_props = actual_headings.get('properties', {})
        expected_heading_props = expected_headings.get('properties', {})
        required_heading_levels = expected_headings.get('required', [])

        # Check required heading levels are present
        for level in required_heading_levels:
            if level not in actual_heading_props:
                return False

        # Check each expected heading level meets requirements
        for level, expected_spec in expected_heading_props.items():
            if level not in actual_heading_props:
                # If level is not required, skip it
                if level not in required_heading_levels:
                    continue
                return False

            actual_spec = actual_heading_props[level]

            # Check minimum and maximum item requirements
            if not self._validate_array_constraints(actual_spec, expected_spec):
                return False

        return True

    def _validate_structural_element(self, actual_element: Dict[str, Any], expected_element: Dict[str, Any]) -> bool:
        """
        Validate a structural element (paragraphs, lists, etc.) against requirements.

        Args:
            actual_element: Actual element structure from document
            expected_element: Expected element requirements

        Returns:
            True if element meets requirements
        """
        if actual_element is None:
            # Element doesn't exist in document
            return False

        return self._validate_array_constraints(actual_element, expected_element)

    def _validate_array_constraints(self, actual: Dict[str, Any], expected: Dict[str, Any]) -> bool:
        """
        Validate array constraints (minItems, maxItems) for structural elements.

        Args:
            actual: Actual element specification
            expected: Expected element specification

        Returns:
            True if constraints are satisfied
        """
        # Get actual count from the schema specification
        # For generated schemas, we use minItems/maxItems which represent actual counts
        actual_min = actual.get('minItems', 0)
        actual_max = actual.get('maxItems', actual_min)
        actual_count = actual_max  # In our generated schemas, min=max=actual count

        # Check against expected constraints
        expected_min = expected.get('minItems', 0)
        expected_max = expected.get('maxItems', float('inf'))

        return expected_min <= actual_count <= expected_max

    # Issue #8: Detailed Error Reporting Methods

    def validate_file_with_errors(self, file_path: Path, schema: Dict[str, Any]) -> ValidationErrorCollector:
        """
        Validate a markdown file against a JSON schema and collect detailed errors.

        This method provides comprehensive error reporting for Issue #8, enabling
        users to understand exactly how their documents deviate from schemas.

        Args:
            file_path: Path to the markdown file
            schema: JSON schema dictionary to validate against

        Returns:
            ValidationErrorCollector with all validation errors

        Raises:
            FileNotFoundError: If the markdown file doesn't exist
            InvalidSchemaError: If the schema is invalid
        """
        # Validate inputs
        if not file_path.exists():
            raise FileNotFoundError(f"Markdown file not found: {file_path}")

        # Validate the schema itself
        self._validate_schema(schema)

        # Initialize error collector
        error_collector = ValidationErrorCollector()

        # Generate the document's current structure
        try:
            document_schema = self.schema_generator.generate_schema_from_file(file_path)
        except Exception as e:
            error_collector.add_error(
                ValidationErrorType.STRUCTURAL_VIOLATION,
                f"Failed to generate document schema: {e}",
                "document.structure",
                suggestion="Check if the markdown file is properly formatted"
            )
            return error_collector

        # Compare the document's structure against the expected schema and collect errors
        if self._has_heading_text_constraints(schema):
            # For heading text validation, we need to handle enum constraints specially
            self._compare_structures_with_errors(document_schema, schema, error_collector)
            self._validate_heading_text_constraints_with_errors(file_path, schema, error_collector)
        else:
            # Use standard structure comparison for backward compatibility
            self._compare_structures_with_errors(document_schema, schema, error_collector)

        return error_collector

    def validate_file_with_errors_string(self, file_path: Path, schema_json: str) -> ValidationErrorCollector:
        """
        Validate a markdown file against a JSON schema string and collect detailed errors.

        Args:
            file_path: Path to the markdown file
            schema_json: JSON schema as a string

        Returns:
            ValidationErrorCollector with all validation errors

        Raises:
            FileNotFoundError: If the markdown file doesn't exist
            InvalidSchemaError: If the schema is invalid JSON or schema
        """
        try:
            schema = json.loads(schema_json)
        except json.JSONDecodeError as e:
            raise InvalidSchemaError(f"Invalid JSON schema string: {e}") from e

        return self.validate_file_with_errors(file_path, schema)

    def validate_file_with_errors_file(self, file_path: Path, schema_file_path: Path) -> ValidationErrorCollector:
        """
        Validate a markdown file against a schema file and collect detailed errors.

        Args:
            file_path: Path to the markdown file
            schema_file_path: Path to the JSON schema file

        Returns:
            ValidationErrorCollector with all validation errors

        Raises:
            FileNotFoundError: If either file doesn't exist
            InvalidSchemaError: If the schema file is invalid
        """
        if not schema_file_path.exists():
            raise FileNotFoundError(f"Schema file not found: {schema_file_path}")

        try:
            schema_content = schema_file_path.read_text(encoding='utf-8')
            schema = json.loads(schema_content)
        except (IOError, json.JSONDecodeError) as e:
            raise InvalidSchemaError(f"Failed to load schema file {schema_file_path}: {e}") from e

        return self.validate_file_with_errors(file_path, schema)

    def _compare_structures_with_errors(
        self,
        document_schema: Dict[str, Any],
        expected_schema: Dict[str, Any],
        error_collector: ValidationErrorCollector
    ) -> None:
        """
        Compare document structure against expected schema and collect detailed errors.

        This method performs comprehensive validation analysis, collecting specific
        errors about missing headings, incorrect content counts, and structural violations.

        Args:
            document_schema: Schema generated from the actual document
            expected_schema: Expected schema requirements
            error_collector: Collector to accumulate validation errors
        """
        try:
            # Extract actual document structure
            doc_properties = document_schema.get('properties', {})
            expected_properties = expected_schema.get('properties', {})

            # Check all required properties are present
            required_properties = expected_schema.get('required', [])
            for prop in required_properties:
                if prop not in doc_properties:
                    error_collector.add_error(
                        ValidationErrorType.MISSING_REQUIRED_SECTION,
                        f"Missing required section: '{prop}'",
                        f"document.{prop}",
                        expected=f"Section '{prop}' is required by schema",
                        actual="Section not found",
                        suggestion=f"Add the '{prop}' section to your document"
                    )

            # Validate heading structure if specified
            if 'headings' in expected_properties and 'headings' in doc_properties:
                self._validate_heading_structure_with_errors(
                    doc_properties['headings'],
                    expected_properties['headings'],
                    error_collector
                )

            # Validate other structural elements
            structural_elements = ['paragraphs', 'lists', 'code_blocks', 'blockquotes', 'tables']
            for element in structural_elements:
                if element in expected_properties:
                    self._validate_structural_element_with_errors(
                        doc_properties.get(element),
                        expected_properties[element],
                        element,
                        error_collector
                    )

        except Exception as e:
            error_collector.add_error(
                ValidationErrorType.STRUCTURAL_VIOLATION,
                f"Error during structure comparison: {e}",
                "document.structure",
                suggestion="Check if both the document and schema are properly formatted"
            )

    def _validate_heading_structure_with_errors(
        self,
        actual_headings: Dict[str, Any],
        expected_headings: Dict[str, Any],
        error_collector: ValidationErrorCollector
    ) -> None:
        """
        Validate heading structure and collect detailed errors.

        Args:
            actual_headings: Actual heading structure from document
            expected_headings: Expected heading requirements
            error_collector: Collector for validation errors
        """
        actual_heading_props = actual_headings.get('properties', {})
        expected_heading_props = expected_headings.get('properties', {})
        required_heading_levels = expected_headings.get('required', [])

        # Check required heading levels are present
        for level in required_heading_levels:
            if level not in actual_heading_props:
                level_num = level.replace('level_', '')
                error_collector.add_error(
                    ValidationErrorType.MISSING_REQUIRED_HEADING,
                    f"Missing required heading level {level_num}",
                    f"headings.{level}",
                    expected=f"At least one heading at level {level_num}",
                    actual="No headings found at this level",
                    suggestion=f"Add heading(s) at level {level_num} (e.g., {'#' * int(level_num)} Heading)"
                )

        # Check each expected heading level meets requirements
        for level, expected_spec in expected_heading_props.items():
            if level not in actual_heading_props:
                # If level is not required, skip it
                if level not in required_heading_levels:
                    continue
                # Already handled above in required check

            else:
                actual_spec = actual_heading_props[level]
                level_num = level.replace('level_', '')

                # Check minimum and maximum item requirements
                self._validate_array_constraints_with_errors(
                    actual_spec,
                    expected_spec,
                    f"headings.{level}",
                    f"level {level_num} headings",
                    error_collector
                )

    def _validate_structural_element_with_errors(
        self,
        actual_element: Dict[str, Any],
        expected_element: Dict[str, Any],
        element_name: str,
        error_collector: ValidationErrorCollector
    ) -> None:
        """
        Validate a structural element and collect errors.

        Args:
            actual_element: Actual element structure from document
            expected_element: Expected element requirements
            element_name: Name of the structural element (for error messages)
            error_collector: Collector for validation errors
        """
        if actual_element is None:
            error_collector.add_error(
                ValidationErrorType.MISSING_REQUIRED_SECTION,
                f"Missing required structural element: {element_name}",
                f"content.{element_name}",
                expected=f"Document should contain {element_name}",
                actual="Element not found",
                suggestion=f"Add {element_name} to your document"
            )
            return

        self._validate_array_constraints_with_errors(
            actual_element,
            expected_element,
            f"content.{element_name}",
            element_name,
            error_collector
        )

    def _validate_array_constraints_with_errors(
        self,
        actual: Dict[str, Any],
        expected: Dict[str, Any],
        path: str,
        element_description: str,
        error_collector: ValidationErrorCollector
    ) -> None:
        """
        Validate array constraints and collect specific errors.

        Args:
            actual: Actual element specification
            expected: Expected element specification
            path: JSON path for error location
            element_description: Human-readable element description
            error_collector: Collector for validation errors
        """
        # Get actual count from the schema specification
        actual_min = actual.get('minItems', 0)
        actual_max = actual.get('maxItems', actual_min)
        actual_count = actual_max  # In our generated schemas, min=max=actual count

        # Check against expected constraints
        expected_min = expected.get('minItems', 0)
        expected_max = expected.get('maxItems', float('inf'))

        # Check minimum constraint
        if actual_count < expected_min:
            error_collector.add_error(
                ValidationErrorType.INSUFFICIENT_CONTENT,
                f"Insufficient {element_description}: found {actual_count}, required at least {expected_min}",
                path,
                expected=f"At least {expected_min} {element_description}",
                actual=f"{actual_count} {element_description}",
                suggestion=f"Add {expected_min - actual_count} more {element_description}"
            )

        # Check maximum constraint
        if expected_max != float('inf') and actual_count > expected_max:
            error_collector.add_error(
                ValidationErrorType.EXCESS_CONTENT,
                f"Too many {element_description}: found {actual_count}, maximum allowed {expected_max}",
                path,
                expected=f"At most {expected_max} {element_description}",
                actual=f"{actual_count} {element_description}",
                suggestion=f"Remove {actual_count - expected_max} {element_description}"
            )

    def _has_heading_text_constraints(self, schema: Dict[str, Any]) -> bool:
        """
        Check if the schema has heading text constraints (enum values on heading content).

        Args:
            schema: JSON schema to check

        Returns:
            True if schema has heading text constraints
        """
        headings_props = schema.get('properties', {}).get('headings', {}).get('properties', {})

        for level_props in headings_props.values():
            items = level_props.get('items', {})
            content_prop = items.get('properties', {}).get('content', {})
            if 'enum' in content_prop:
                return True

        return False

    def _validate_with_heading_text_constraints(
        self,
        file_path: Path,
        expected_schema: Dict[str, Any],
        document_schema: Dict[str, Any]
    ) -> bool:
        """
        Validate document with heading text constraints by comparing actual content against enum values.

        Args:
            file_path: Path to the markdown file
            expected_schema: Schema with heading text constraints
            document_schema: Generated schema from the actual document

        Returns:
            True if document meets all constraints including heading text
        """
        # First check standard structure compliance
        if not self._compare_structures(document_schema, expected_schema):
            return False

        # Then check heading text constraints
        expected_headings = expected_schema.get('properties', {}).get('headings', {}).get('properties', {})

        # Generate document analysis with actual heading content
        from .parser import parse_markdown_to_ast
        content = file_path.read_text(encoding='utf-8')
        ast_tokens = parse_markdown_to_ast(content)
        structure_analysis = self.schema_generator._analyze_ast_structure(ast_tokens, None)

        for level_key, expected_level_spec in expected_headings.items():
            content_constraints = expected_level_spec.get('items', {}).get('properties', {}).get('content', {})

            if 'enum' in content_constraints:
                allowed_texts = content_constraints['enum']
                actual_headings = structure_analysis['headings'].get(level_key, [])

                for heading in actual_headings:
                    actual_text = heading['content']
                    if actual_text not in allowed_texts:
                        return False

        return True

    def _validate_heading_text_constraints_with_errors(
        self,
        file_path: Path,
        expected_schema: Dict[str, Any],
        error_collector: ValidationErrorCollector
    ) -> None:
        """
        Validate heading text constraints and collect detailed errors.

        Args:
            file_path: Path to the markdown file
            expected_schema: Schema with heading text constraints
            error_collector: Collector for validation errors
        """
        expected_headings = expected_schema.get('properties', {}).get('headings', {}).get('properties', {})

        # Generate document analysis with actual heading content
        from .parser import parse_markdown_to_ast
        content = file_path.read_text(encoding='utf-8')
        ast_tokens = parse_markdown_to_ast(content)
        structure_analysis = self.schema_generator._analyze_ast_structure(ast_tokens, None)

        for level_key, expected_level_spec in expected_headings.items():
            content_constraints = expected_level_spec.get('items', {}).get('properties', {}).get('content', {})

            if 'enum' in content_constraints:
                allowed_texts = content_constraints['enum']
                actual_headings = structure_analysis['headings'].get(level_key, [])

                for i, heading in enumerate(actual_headings):
                    actual_text = heading['content']
                    if actual_text not in allowed_texts:
                        # Add detailed error about heading text mismatch
                        error_collector.add_error(
                            ValidationErrorType.HEADING_COUNT_MISMATCH,
                            f"Heading text mismatch at {level_key.replace('_', ' ')} #{i+1}: expected one of {allowed_texts}, found '{actual_text}'",
                            f"headings.{level_key}[{i}].content",
                            expected=f"One of: {allowed_texts}",
                            actual=actual_text,
                            suggestion=f"Change heading text to one of the allowed values: {', '.join(allowed_texts)}"
                        )