# markitect-main/markitect/semantic_validator.py

"""
Semantic Validator for markdown documents.

Validates markdown documents against x-markitect schema extensions:
- x-markitect-sections: Section classifications (required, recommended, etc.)
- x-markitect-content-control: Content patterns and quality metrics
- Link validation: Internal and external link checking

Complements the existing SchemaValidator which handles structural AST validation.
"""

from dataclasses import dataclass
from typing import List, Dict, Any, Optional
from pathlib import Path
import json

from markitect.validators.section_validator import (
    SectionValidator,
    SectionValidationResult
)
from markitect.validators.content_validator import (
    ContentValidator,
    ContentValidationResult
)
from markitect.validators.link_validator import (
    LinkValidator,
    LinkValidationResult
)


class DocumentWrapper:
    """
    Adapter giving a parsed-document dict the interface validators expect.

    On construction the flat token list stored under the ``'ast'`` key is
    scanned once for headings. The result is cached and exposed through
    :attr:`headings` and :meth:`get_headings_by_level`. Dict-style access
    (``wrapper[key]`` and ``wrapper.get(key)``) is forwarded to the wrapped
    dict unchanged.
    """

    def __init__(self, doc_dict: Dict[str, Any]):
        """Store the document dict and eagerly build the heading cache."""
        self.doc_dict = doc_dict
        self._headings_cache: Optional[List[Dict[str, Any]]] = None
        self._extract_headings()

    def _extract_headings(self) -> None:
        """
        Scan the token list for headings and cache them.

        A heading is recognised as a ``heading_open`` token immediately
        followed by an ``inline`` token carrying the heading text. Tokens
        that are not dicts are ignored in either position, and a missing or
        ``None`` ``'ast'`` entry yields an empty heading list.
        """
        tokens = self.doc_dict.get('ast') or []
        found: List[Dict[str, Any]] = []

        # Walk (token, successor) pairs; the last token has no successor and
        # therefore can never open a complete heading.
        for opener, following in zip(tokens, tokens[1:]):
            if not isinstance(opener, dict) or opener.get('type') != 'heading_open':
                continue
            # Guard the successor as well: a malformed stream must not crash
            # validation, it simply contributes no heading here.
            if not isinstance(following, dict) or following.get('type') != 'inline':
                continue

            # 'h2' -> '2'; anything without a numeric suffix counts as level 1.
            level_text = (opener.get('tag') or 'h1')[1:]
            level = int(level_text) if level_text.isdigit() else 1

            # The first 'map' entry is shifted by one for reporting
            # (presumably a zero-based start line -- confirm against parser).
            source_map = opener.get('map')
            line_number = source_map[0] + 1 if source_map else None

            found.append({
                'content': following.get('content', ''),
                'level': level,
                'line_number': line_number,
            })

        self._headings_cache = found

    def get_headings_by_level(self, level: int) -> List[Dict[str, Any]]:
        """
        Return the cached headings whose level equals ``level``.

        Args:
            level: Heading level to select (e.g. 2 for ``h2``).

        Returns:
            List of heading dicts with 'content', 'level', 'line_number',
            in document order.
        """
        return [h for h in self.headings if h.get('level') == level]

    @property
    def headings(self) -> List[Dict[str, Any]]:
        """All cached headings, in document order."""
        if self._headings_cache is None:
            self._extract_headings()
        return self._headings_cache

    def __getitem__(self, key):
        """Forward ``wrapper[key]`` to the wrapped dict (raises KeyError)."""
        return self.doc_dict[key]

    def get(self, key, default=None):
        """Forward ``wrapper.get(key, default)`` to the wrapped dict."""
        return self.doc_dict.get(key, default)


@dataclass
class SemanticValidationReport:
    """
    Aggregated outcome of one semantic validation run.

    Bundles the mandatory section result with the optional content and link
    results, and offers combined queries plus a plain-text rendering.
    """
    # Result of the section checks; always present.
    section_result: SectionValidationResult
    # Result of the content checks, or None when they were not run.
    content_result: Optional[ContentValidationResult] = None
    # Result of the link checks, or None when they were not run.
    link_result: Optional[LinkValidationResult] = None

    def _optional_results(self) -> List[Any]:
        """Return the content and link results that are actually present."""
        return [
            result
            for result in (self.content_result, self.link_result)
            if result is not None
        ]

    @staticmethod
    def _issue_lines(issues, label_attr: str) -> List[str]:
        """Render one indented line per issue, labelled by ``label_attr``."""
        rendered = []
        for issue in issues:
            marker = "❌" if issue.severity == 'ERROR' else "⚠️"
            rendered.append(
                f"  {marker} {getattr(issue, label_attr)} - {issue.message}"
            )
        return rendered

    def has_errors(self) -> bool:
        """Return True if any sub-result reports ERROR-level issues."""
        if self.section_result.has_errors():
            return True
        return any(
            result.has_errors()
            for result in self._optional_results()
            if hasattr(result, 'has_errors')
        )

    def has_warnings(self) -> bool:
        """Return True if any sub-result reports WARNING-level issues."""
        if self.section_result.has_warnings():
            return True
        return any(
            result.has_warnings()
            for result in self._optional_results()
            if hasattr(result, 'has_warnings')
        )

    def is_valid(self) -> bool:
        """Return True when validation passed, i.e. there are no errors."""
        return not self.has_errors()

    def get_all_issues(self) -> List[Any]:
        """Return a new list with the issues of every present sub-result."""
        issues = list(self.section_result.issues)
        for result in self._optional_results():
            if hasattr(result, 'issues'):
                issues.extend(result.issues)
        return issues

    def format_text(self) -> str:
        """
        Render the report as human-readable text.

        The output has one block per validator that ran, followed by a
        summary with section counts, error/warning totals and the overall
        PASSED/FAILED status. Rendering does not modify the sub-results.

        Returns:
            The report as a single newline-joined string.
        """
        lines = ["Section Validation:"]
        if self.section_result.issues:
            lines.extend(
                self._issue_lines(self.section_result.issues, 'section_name')
            )
        else:
            lines.append("  ✅ All section requirements met")

        if self.content_result is not None:
            lines.extend(["", "Content Validation:"])
            if self.content_result.issues:
                lines.extend(
                    self._issue_lines(self.content_result.issues, 'section_name')
                )
            else:
                lines.append("  ✅ All content requirements met")

        if self.link_result is not None:
            lines.extend(["", "Link Validation:"])
            if self.link_result.issues:
                lines.extend(self._issue_lines(self.link_result.issues, 'link'))
            else:
                lines.append(
                    f"  ✅ All {self.link_result.links_checked} links valid"
                )

        lines.extend([
            "",
            "Summary:",
            f"  Sections checked: {self.section_result.sections_checked}",
            f"  Sections found: {self.section_result.sections_found}",
        ])

        # Copy before aggregating: get_errors()/get_warnings() may hand back
        # a list the result object still owns, and extending it in place
        # would corrupt that result and inflate counts on the next render.
        all_errors = list(self.section_result.get_errors())
        all_warnings = list(self.section_result.get_warnings())
        for result in self._optional_results():
            all_errors.extend(result.get_errors())
            all_warnings.extend(result.get_warnings())

        lines.append(f"  Errors: {len(all_errors)}")
        lines.append(f"  Warnings: {len(all_warnings)}")
        lines.append(
            "  Status: PASSED ✅" if self.is_valid() else "  Status: FAILED ❌"
        )

        return "\n".join(lines)


class SemanticValidator:
    """
    Validates markdown documents against x-markitect extensions.

    Complements existing SchemaValidator which handles structural AST validation.
    This validator checks semantic aspects defined in x-markitect-* extensions.

    Example:
        >>> schema = load_schema('manpage-schema-v1.0.md')
        >>> validator = SemanticValidator(schema)
        >>> report = validator.validate('my-command.1.md')
        >>> if not report.is_valid():
        ...     print(report.format_text())
    """

    def __init__(self, schema: Dict[str, Any]):
        """
        Initialize semantic validator with a schema.

        Args:
            schema: JSON schema with x-markitect-* extensions

        The schema can be either:
        - A dict loaded from JSON
        - A dict loaded from markdown with embedded JSON
        - Must contain x-markitect-sections and/or x-markitect-content-control
        """
        self.schema = schema

        # Initialize sub-validators
        self.section_validator = SectionValidator(schema)
        self.content_validator = ContentValidator(schema)
        self.link_validator = LinkValidator(schema)

    def validate(self, document_path: str | Path,
                 check_links: bool = False) -> SemanticValidationReport:
        """
        Validate a markdown document against schema extensions.

        Args:
            document_path: Path to markdown document to validate
            check_links: Whether to validate links (may be slow)

        Returns:
            SemanticValidationReport with validation results

        Raises:
            FileNotFoundError: If document_path doesn't exist
            ValueError: If document cannot be parsed
        """
        document_path = Path(document_path)

        if not document_path.exists():
            raise FileNotFoundError(f"Document not found: {document_path}")

        # Parse document
        document = self._parse_document(document_path)

        # Run section validation
        section_result = self.section_validator.check(document)

        # Run content validation
        content_result = self.content_validator.check(document)

        # Run link validation (if enabled)
        if check_links:
            link_result = self.link_validator.check(document, check_external=True)
        else:
            # Still check internal links by default (fast)
            link_result = self.link_validator.check(document, check_external=False)

        return SemanticValidationReport(
            section_result=section_result,
            content_result=content_result,
            link_result=link_result
        )

    def _parse_document(self, document_path: Path) -> 'MarkdownDocument':
        """
        Parse markdown document into AST.

        Args:
            document_path: Path to markdown file

        Returns:
            Parsed MarkdownDocument object

        This uses the existing markitect markdown parser.
        """
        # Import here to avoid circular dependency
        from markitect.document_manager import DocumentManager

        # Use DocumentManager to parse the document
        doc_manager = DocumentManager()
        doc = doc_manager.ingest_file(document_path)

        # Wrap in DocumentWrapper to provide expected interface
        return DocumentWrapper(doc)


def load_schema_from_path(schema_path: str | Path) -> Dict[str, Any]:
    """
    Load a schema from file (supports .json and .md formats).

    Args:
        schema_path: Path to schema file

    Returns:
        Schema dict with embedded JSON

    Raises:
        FileNotFoundError: If schema file doesn't exist
        ValueError: If schema cannot be parsed
    """
    schema_path = Path(schema_path)

    if not schema_path.exists():
        raise FileNotFoundError(f"Schema not found: {schema_path}")

    if schema_path.suffix == '.json':
        # Load JSON schema directly
        with open(schema_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    elif schema_path.suffix == '.md':
        # Load markdown schema with embedded JSON
        from markitect.schema_loader import MarkdownSchemaLoader

        loader = MarkdownSchemaLoader()
        schema_data = loader.load_schema(schema_path)

        return schema_data['schema']

    else:
        raise ValueError(f"Unsupported schema format: {schema_path.suffix}")