markitect-main/markitect/semantic_validator.py

"""
Semantic Validator for markdown documents.

Validates markdown documents against x-markitect schema extensions:
- x-markitect-sections: Section classifications (required, recommended, etc.)
- x-markitect-content-control: Content patterns and quality metrics
- Link validation: Internal and external link checking (not yet implemented)

Complements the existing SchemaValidator which handles structural AST validation.
"""

from dataclasses import dataclass
from typing import List, Dict, Any, Optional
from pathlib import Path
import json

from markitect.validators.section_validator import (
    SectionValidator,
    SectionValidationResult
)
from markitect.validators.content_validator import (
    ContentValidator,
    ContentValidationResult
)


@dataclass
class SemanticValidationReport:
    """
    Report of semantic validation results.

    Combines the section result (always present) with the optional content
    and link results, and offers aggregate queries plus a text rendering.

    Attributes:
        section_result: Result produced by the section validator.
        content_result: Result produced by the content validator, or None.
        link_result: Result produced by a link validator, or None.
    """
    section_result: SectionValidationResult
    content_result: Optional[ContentValidationResult] = None
    link_result: Optional[Any] = None  # LinkValidationResult when implemented

    def _optional_results(self) -> List[Any]:
        """Return the content and link results that are actually set."""
        # Compare against None explicitly: a result object that happens to be
        # falsy (e.g. defines __bool__/__len__) must still be taken into account.
        return [
            result
            for result in (self.content_result, self.link_result)
            if result is not None
        ]

    def has_errors(self) -> bool:
        """Return True if any available result reports ERROR-level issues."""
        if self.section_result.has_errors():
            return True
        return any(
            result.has_errors()
            for result in self._optional_results()
            if hasattr(result, 'has_errors')
        )

    def has_warnings(self) -> bool:
        """Return True if any available result reports WARNING-level issues."""
        if self.section_result.has_warnings():
            return True
        return any(
            result.has_warnings()
            for result in self._optional_results()
            if hasattr(result, 'has_warnings')
        )

    def is_valid(self) -> bool:
        """Return True when validation passed, i.e. there are no errors."""
        return not self.has_errors()

    def get_all_issues(self) -> List[Any]:
        """Return a new list with the issues of every available result."""
        issues = list(self.section_result.issues)
        for result in self._optional_results():
            if hasattr(result, 'issues'):
                issues.extend(result.issues)
        return issues

    @staticmethod
    def _issue_lines(issues: List[Any], all_clear: str) -> List[str]:
        """Render one line per issue, or the all-clear line if there are none."""
        if not issues:
            return [all_clear]

        rendered = []
        for issue in issues:
            status = "❌" if issue.severity == 'ERROR' else "⚠️"
            rendered.append(f"  {status} {issue.section_name} - {issue.message}")
        return rendered

    def format_text(self) -> str:
        """
        Format the validation report as human-readable text.

        The output lists section issues, then content issues (when a content
        result is present), then a summary with counts and PASSED/FAILED.
        Link results are not listed issue by issue, but their errors and
        warnings are included in the summary counts when the link result
        exposes get_errors()/get_warnings(), so the counts agree with the
        status line.

        Returns:
            The report as a single newline-joined string.
        """
        lines = ["Section Validation:"]
        lines.extend(self._issue_lines(
            self.section_result.issues,
            "  ✅ All section requirements met",
        ))

        if self.content_result is not None:
            lines.append("")
            lines.append("Content Validation:")
            lines.extend(self._issue_lines(
                self.content_result.issues,
                "  ✅ All content requirements met",
            ))

        lines.append("")
        lines.append("Summary:")
        lines.append(f"  Sections checked: {self.section_result.sections_checked}")
        lines.append(f"  Sections found: {self.section_result.sections_found}")

        # Copy before combining: get_errors()/get_warnings() may hand back a
        # list owned by the result object, and formatting a report must never
        # modify the results it describes.
        all_errors = list(self.section_result.get_errors())
        all_warnings = list(self.section_result.get_warnings())

        for result in self._optional_results():
            if hasattr(result, 'get_errors'):
                all_errors.extend(result.get_errors())
            if hasattr(result, 'get_warnings'):
                all_warnings.extend(result.get_warnings())

        lines.append(f"  Errors: {len(all_errors)}")
        lines.append(f"  Warnings: {len(all_warnings)}")

        if self.is_valid():
            lines.append("  Status: PASSED ✅")
        else:
            lines.append("  Status: FAILED ❌")

        return "\n".join(lines)


class SemanticValidator:
    """
    Semantic checker driven by a schema's x-markitect-* extensions.

    Structural AST validation stays with the existing SchemaValidator; this
    class delegates to a SectionValidator and a ContentValidator built from
    the same schema and bundles their results into one report.

    Example:
        >>> schema = load_schema('manpage-schema-v1.0.md')
        >>> validator = SemanticValidator(schema)
        >>> report = validator.validate('my-command.1.md')
        >>> if not report.is_valid():
        ...     print(report.format_text())
    """

    def __init__(self, schema: Dict[str, Any]):
        """
        Store the schema and build the sub-validators from it.

        Args:
            schema: Schema dict carrying x-markitect-* extensions (for example
                x-markitect-sections and/or x-markitect-content-control),
                whether it was loaded from JSON or from markdown with
                embedded JSON.
        """
        self.schema = schema
        self.section_validator = SectionValidator(schema)
        self.content_validator = ContentValidator(schema)
        # TODO: add a link validator here once one exists.

    def validate(self, document_path: str | Path,
                 check_links: bool = False) -> SemanticValidationReport:
        """
        Run section and content validation on one markdown document.

        Args:
            document_path: Location of the markdown document.
            check_links: Reserved for link validation; currently ignored
                because no link validator is implemented.

        Returns:
            SemanticValidationReport holding the section and content results;
            its link_result is always None for now.

        Raises:
            FileNotFoundError: If document_path doesn't exist.

        Errors raised while parsing the document propagate unchanged.
        """
        path = Path(document_path)
        if not path.exists():
            raise FileNotFoundError(f"Document not found: {path}")

        parsed = self._parse_document(path)

        # Keyword arguments are evaluated left to right, so the section check
        # still runs before the content check.
        # TODO: fill link_result from a link validator when check_links is set.
        return SemanticValidationReport(
            section_result=self.section_validator.check(parsed),
            content_result=self.content_validator.check(parsed),
            link_result=None,
        )

    def _parse_document(self, document_path: Path) -> 'MarkdownDocument':
        """
        Turn the markdown file into a parsed document object.

        Args:
            document_path: Path to the markdown file.

        Returns:
            Whatever DocumentManager.ingest_file returns for that path.
        """
        # Imported lazily to avoid a circular dependency at module import time.
        from markitect.document_manager import DocumentManager

        return DocumentManager().ingest_file(document_path)


def load_schema_from_path(schema_path: str | Path) -> Dict[str, Any]:
    """
    Load a schema from file (supports .json and .md formats).

    Args:
        schema_path: Path to schema file

    Returns:
        Schema dict with embedded JSON

    Raises:
        FileNotFoundError: If schema file doesn't exist
        ValueError: If schema cannot be parsed
    """
    schema_path = Path(schema_path)

    if not schema_path.exists():
        raise FileNotFoundError(f"Schema not found: {schema_path}")

    if schema_path.suffix == '.json':
        # Load JSON schema directly
        with open(schema_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    elif schema_path.suffix == '.md':
        # Load markdown schema with embedded JSON
        from markitect.schema_loader import MarkdownSchemaLoader

        loader = MarkdownSchemaLoader()
        schema_data = loader.load_schema(schema_path)

        return schema_data['schema']

    else:
        raise ValueError(f"Unsupported schema format: {schema_path.suffix}")