markitect-main/markitect/validators/content_validator.py

"""
Content Validator for markdown documents.

Validates content against x-markitect-content-control rules:
- Required patterns: Regex patterns that must appear in content
- Discouraged patterns: Patterns that should be avoided (warnings)
- Forbidden patterns: Patterns that must not appear (errors)
- Quality metrics: Word counts, sentence counts, readability
"""

from dataclasses import dataclass
from typing import List, Dict, Any, Optional
import re


@dataclass
class ContentIssue:
    """
    A single finding produced while validating section content.

    Acts as the base record for the more specific issue types in this
    module and is also instantiated directly, e.g. to report an invalid
    regex pattern found in the schema.
    """
    # Name of the section the issue belongs to.
    section_name: str
    severity: str  # 'ERROR', 'WARNING', 'INFO'
    # Human-readable description of the problem.
    message: str
    # Line the issue refers to, when known.
    line_number: Optional[int] = None
    # Excerpt of the text that triggered the issue, when there is one.
    matched_text: Optional[str] = None

    def __str__(self) -> str:
        """
        Render the issue as ``[SEVERITY] (line N) section - message: 'match'``.

        The ``(line N)`` and ``: 'match'`` parts are emitted only when the
        corresponding field is set. The fields are compared against None
        rather than tested for truthiness so that line 0 and an empty
        match (a pattern that matched zero characters) are still shown.
        """
        location = "" if self.line_number is None else f" (line {self.line_number})"
        match_info = "" if self.matched_text is None else f": '{self.matched_text}'"
        return f"[{self.severity}]{location} {self.section_name} - {self.message}{match_info}"


@dataclass
class PatternMissing(ContentIssue):
    """
    Issue reported when a required pattern is not found in a section's content.

    Extends ContentIssue with the pattern that had no match.
    """
    # The required regex pattern that was searched for; empty string by default.
    pattern: str = ""


@dataclass
class ForbiddenPattern(ContentIssue):
    """
    Issue reported when a forbidden pattern is found in a section's content.

    Extends ContentIssue with the pattern that matched.
    """
    # The forbidden regex pattern that matched; empty string by default.
    pattern: str = ""


@dataclass
class DiscouragedPattern(ContentIssue):
    """
    Issue reported when a discouraged pattern is found in a section's content.

    Extends ContentIssue with the pattern that matched.
    """
    # The discouraged regex pattern that matched; empty string by default.
    pattern: str = ""


@dataclass
class ContentTooShort(ContentIssue):
    """
    Issue reported when content falls below a configured minimum count.

    Used for both the minimum word count and the minimum sentence count.
    """
    # Count measured in the section (words or sentences).
    actual: int = 0
    # Configured minimum that was not reached.
    required: int = 0


@dataclass
class ContentTooLong(ContentIssue):
    """
    Issue reported when content exceeds a configured maximum count.
    """
    # Count measured in the section.
    actual: int = 0
    # Configured maximum that was exceeded.
    limit: int = 0


@dataclass
class ContentValidationResult:
    """
    Outcome of a content validation run.

    Carries every issue that was raised together with the section counter
    reported by the validator, and offers helpers to query the issues by
    severity.
    """
    # Issues collected during validation, in the order they were raised.
    issues: List[ContentIssue]
    # Section counter supplied by the validator.
    sections_checked: int

    def has_errors(self) -> bool:
        """Return True when at least one issue has severity 'ERROR'."""
        for issue in self.issues:
            if issue.severity == 'ERROR':
                return True
        return False

    def has_warnings(self) -> bool:
        """Return True when at least one issue has severity 'WARNING'."""
        for issue in self.issues:
            if issue.severity == 'WARNING':
                return True
        return False

    def is_valid(self) -> bool:
        """Return True when validation passed, i.e. no ERROR-level issue exists."""
        if self.has_errors():
            return False
        return True

    def get_errors(self) -> List[ContentIssue]:
        """Return a new list holding only the ERROR-level issues."""
        errors = []
        for issue in self.issues:
            if issue.severity == 'ERROR':
                errors.append(issue)
        return errors

    def get_warnings(self) -> List[ContentIssue]:
        """Return a new list holding only the WARNING-level issues."""
        warnings = []
        for issue in self.issues:
            if issue.severity == 'WARNING':
                warnings.append(issue)
        return warnings


class ContentValidator:
    """
    Validates content against x-markitect-content-control rules.

    Checks content patterns, quality metrics, and readability for each section.
    """

    def __init__(self, schema: Dict[str, Any]):
        """
        Initialize validator with a schema.

        Args:
            schema: JSON schema with x-markitect-content-control extension
        """
        self.schema = schema
        self.content_rules = schema.get('x-markitect-content-control', {})

    def check(self, document: 'MarkdownDocument') -> ContentValidationResult:
        """
        Validate content against schema rules.

        Args:
            document: Parsed markdown document

        Returns:
            ContentValidationResult with any issues found
        """
        issues = []
        sections_checked = 0

        # Check each section that has content rules
        for section_key, rules in self.content_rules.items():
            sections_checked += 1

            # Get section from document
            section = self._get_section(document, section_key)

            if not section:
                # Section validator handles missing sections
                continue

            section_content = section.get('content', '')
            section_name = section.get('name', section_key)

            # Check required patterns
            issues.extend(self._check_required_patterns(
                section_name, section_content, rules
            ))

            # Check forbidden patterns
            issues.extend(self._check_forbidden_patterns(
                section_name, section_content, rules
            ))

            # Check discouraged patterns
            issues.extend(self._check_discouraged_patterns(
                section_name, section_content, rules
            ))

            # Check content quality metrics
            issues.extend(self._check_quality_metrics(
                section_name, section_content, rules
            ))

        return ContentValidationResult(
            issues=issues,
            sections_checked=sections_checked
        )

    def _get_section(self, document: 'MarkdownDocument',
                     section_key: str) -> Optional[Dict[str, Any]]:
        """
        Get a section from the document.

        Args:
            document: Parsed markdown document
            section_key: Section name (lowercase in rules, uppercase in document)

        Returns:
            Section dict with name and content, or None if not found
        """
        # Convert section_key to uppercase for matching
        section_name = section_key.upper()

        # Try to get section content
        if hasattr(document, 'get_section'):
            return document.get_section(section_name)

        # Fallback: search headings
        if hasattr(document, 'get_headings_by_level'):
            headings = document.get_headings_by_level(2)
            for heading in headings:
                if isinstance(heading, dict):
                    if heading.get('content', '').strip().upper() == section_name:
                        # Found the section, need to extract content
                        return {
                            'name': section_name,
                            'content': heading.get('text_content', '')
                        }

        return None

    def _check_required_patterns(self, section_name: str, content: str,
                                rules: Dict[str, Any]) -> List[ContentIssue]:
        """Check that all required patterns appear in content."""
        issues = []
        required_patterns = rules.get('required_patterns', [])

        for pattern in required_patterns:
            try:
                if not re.search(pattern, content, re.MULTILINE):
                    issues.append(PatternMissing(
                        section_name=section_name,
                        severity='ERROR',
                        message=f'Required pattern not found',
                        pattern=pattern
                    ))
            except re.error as e:
                # Invalid regex pattern in schema
                issues.append(ContentIssue(
                    section_name=section_name,
                    severity='ERROR',
                    message=f'Invalid regex pattern in schema: {e}'
                ))

        return issues

    def _check_forbidden_patterns(self, section_name: str, content: str,
                                  rules: Dict[str, Any]) -> List[ContentIssue]:
        """Check that no forbidden patterns appear in content."""
        issues = []
        forbidden_patterns = rules.get('forbidden_patterns', [])

        for pattern in forbidden_patterns:
            try:
                match = re.search(pattern, content, re.MULTILINE)
                if match:
                    issues.append(ForbiddenPattern(
                        section_name=section_name,
                        severity='ERROR',
                        message=f'Forbidden pattern found',
                        pattern=pattern,
                        matched_text=match.group(0)[:50]  # Limit to 50 chars
                    ))
            except re.error as e:
                issues.append(ContentIssue(
                    section_name=section_name,
                    severity='ERROR',
                    message=f'Invalid regex pattern in schema: {e}'
                ))

        return issues

    def _check_discouraged_patterns(self, section_name: str, content: str,
                                   rules: Dict[str, Any]) -> List[ContentIssue]:
        """Check for discouraged patterns (warnings)."""
        issues = []
        discouraged_patterns = rules.get('discouraged_patterns', [])

        for pattern in discouraged_patterns:
            try:
                match = re.search(pattern, content, re.MULTILINE)
                if match:
                    issues.append(DiscouragedPattern(
                        section_name=section_name,
                        severity='WARNING',
                        message=f'Discouraged pattern found',
                        pattern=pattern,
                        matched_text=match.group(0)[:50]
                    ))
            except re.error as e:
                issues.append(ContentIssue(
                    section_name=section_name,
                    severity='WARNING',
                    message=f'Invalid regex pattern in schema: {e}'
                ))

        return issues

    def _check_quality_metrics(self, section_name: str, content: str,
                              rules: Dict[str, Any]) -> List[ContentIssue]:
        """Check content quality metrics (word count, sentence count)."""
        issues = []
        quality = rules.get('content_quality', {})

        if not quality:
            return issues

        # Word count validation
        word_count = len(content.split())

        min_words = quality.get('min_words')
        if min_words is not None and word_count < min_words:
            issues.append(ContentTooShort(
                section_name=section_name,
                severity='WARNING',
                message=f'Content too short ({word_count} words, minimum {min_words})',
                actual=word_count,
                required=min_words
            ))

        max_words = quality.get('max_words')
        if max_words is not None and word_count > max_words:
            issues.append(ContentTooLong(
                section_name=section_name,
                severity='WARNING',
                message=f'Content too long ({word_count} words, maximum {max_words})',
                actual=word_count,
                limit=max_words
            ))

        # Sentence count validation
        min_sentences = quality.get('min_sentences')
        if min_sentences is not None:
            # Simple sentence count (split by .!?)
            sentence_count = len(re.findall(r'[.!?]+', content))

            if sentence_count < min_sentences:
                issues.append(ContentTooShort(
                    section_name=section_name,
                    severity='WARNING',
                    message=f'Too few sentences ({sentence_count}, minimum {min_sentences})',
                    actual=sentence_count,
                    required=min_sentences
                ))

        return issues