feat: add semantic document validator for x-markitect extensions

Implements semantic validation to complement existing structural validation:

Phase 1 & 2 Complete:
- SemanticValidator: Main validator orchestrating sub-validators
- SectionValidator: Enforces section classifications (required, recommended, optional, discouraged, improper) from x-markitect-sections
- ContentValidator: Validates content patterns, forbidden patterns, and quality metrics (word counts, sentence counts) from x-markitect-content-control

Features:
- Pattern matching with regex for required/forbidden/discouraged patterns
- Word count and sentence count validation
- Detailed error reporting with severity levels (ERROR, WARNING)
- Support for section alternatives (e.g., FLAGS vs OPTIONS)
- Comprehensive test coverage (16 tests, 100% passing)

Architecture:
- Complements existing SchemaValidator (structural AST validation)
- Clean separation: validators/ package for modular validators
- Semantic validation focuses on x-markitect-* extensions
- LinkValidator planned for Phase 3 (optional --check-links)

Next: Phase 4 - CLI integration to enhance 'markitect validate' command

Workplan: roadmap/20260106-semantic-document-validation/WORKPLAN.md

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-06 03:24:32 +01:00
parent f27eea6b5b
commit a969c5de47
6 changed files with 1932 additions and 0 deletions
--- a/markitect/semantic_validator.py
+++ b/markitect/semantic_validator.py
@@ -0,0 +1,261 @@
 """
 Semantic Validator for markdown documents.
 Validates markdown documents against x-markitect schema extensions:
 - x-markitect-sections: Section classifications (required, recommended, etc.)
 - x-markitect-content-control: Content patterns and quality metrics
 - Link validation: Internal and external link checking (planned; not yet implemented)
 Complements the existing SchemaValidator which handles structural AST validation.
 """
 from dataclasses import dataclass
 from typing import List, Dict, Any, Optional
 from pathlib import Path
 import json
 from markitect.validators.section_validator import (
    SectionValidator,
    SectionValidationResult
 )
 from markitect.validators.content_validator import (
    ContentValidator,
    ContentValidationResult
 )
@dataclass
class SemanticValidationReport:
    """
    Aggregated outcome of one semantic validation run.

    Bundles the per-validator results so callers can query the overall
    status or render a text report without knowing which validators ran.

    Attributes:
        section_result: Result of the section classification checks.
        content_result: Result of the content-control checks, or None.
        link_result: Result of link checks, or None. Link validation is not
            implemented yet, so the type is intentionally left open.
    """
    section_result: SectionValidationResult
    content_result: Optional[ContentValidationResult] = None
    link_result: Optional[Any] = None  # LinkValidationResult when implemented

    def _optional_results(self) -> List[Any]:
        """Return the optional sub-results (content, link) that are present."""
        return [res for res in (self.content_result, self.link_result) if res]

    @staticmethod
    def _format_issues(issues: List[Any], all_clear: str) -> List[str]:
        """Render one line per issue, or the ``all_clear`` line if none."""
        if not issues:
            return [all_clear]
        rendered = []
        for issue in issues:
            # Anything that is not an ERROR is shown with the warning marker.
            marker = "❌" if issue.severity == 'ERROR' else "⚠️"
            rendered.append(f"  {marker} {issue.section_name} - {issue.message}")
        return rendered

    def has_errors(self) -> bool:
        """Check if there are any ERROR-level issues in any sub-result."""
        if self.section_result.has_errors():
            return True
        return any(
            res.has_errors()
            for res in self._optional_results()
            if hasattr(res, 'has_errors')
        )

    def has_warnings(self) -> bool:
        """Check if there are any WARNING-level issues in any sub-result."""
        if self.section_result.has_warnings():
            return True
        return any(
            res.has_warnings()
            for res in self._optional_results()
            if hasattr(res, 'has_warnings')
        )

    def is_valid(self) -> bool:
        """Check if validation passed (no errors)."""
        return not self.has_errors()

    def get_all_issues(self) -> List[Any]:
        """Return a new list with the issues of every present sub-result."""
        issues = list(self.section_result.issues)
        for res in self._optional_results():
            if hasattr(res, 'issues'):
                issues.extend(res.issues)
        return issues

    def format_text(self) -> str:
        """
        Format the validation report as plain text.

        Returns:
            A newline-joined report with a section block, a content block
            (only when a content result is present) and a summary.
        """
        lines = ["Section Validation:"]
        lines.extend(self._format_issues(
            self.section_result.issues, "  ✅ All section requirements met"
        ))

        if self.content_result:
            lines.extend(["", "Content Validation:"])
            lines.extend(self._format_issues(
                self.content_result.issues, "  ✅ All content requirements met"
            ))

        # Count over *all* sub-results so the totals cannot contradict the
        # PASSED/FAILED status, which also takes link_result into account.
        # NOTE(review): assumes future link issues expose `severity` like
        # section/content issues; getattr keeps unknown shapes from crashing.
        all_issues = self.get_all_issues()
        error_count = sum(
            1 for issue in all_issues if getattr(issue, 'severity', None) == 'ERROR'
        )
        warning_count = sum(
            1 for issue in all_issues if getattr(issue, 'severity', None) == 'WARNING'
        )

        lines.extend([
            "",
            "Summary:",
            f"  Sections checked: {self.section_result.sections_checked}",
            f"  Sections found: {self.section_result.sections_found}",
            f"  Errors: {error_count}",
            f"  Warnings: {warning_count}",
            "  Status: PASSED ✅" if self.is_valid() else "  Status: FAILED ❌",
        ])
        return "\n".join(lines)
 class SemanticValidator:
     """
     Entry point for semantic validation of a markdown document.

     Runs the section and content sub-validators against the x-markitect-*
     extensions of a schema and bundles their results in one report.
     Structural AST validation is left to the existing SchemaValidator.

     Example:
         >>> schema = load_schema('manpage-schema-v1.0.md')
         >>> validator = SemanticValidator(schema)
         >>> report = validator.validate('my-command.1.md')
         >>> if not report.is_valid():
         ...     print(report.format_text())
     """

     def __init__(self, schema: Dict[str, Any]):
         """
         Build the sub-validators for ``schema``.

         Args:
             schema: Schema dict (from JSON or from markdown with embedded
                 JSON) carrying x-markitect-sections and/or
                 x-markitect-content-control.
         """
         self.schema = schema
         self.section_validator = SectionValidator(schema)
         self.content_validator = ContentValidator(schema)
         # TODO: create the link validator once it exists
         # self.link_validator = LinkValidator(schema)

     def validate(self, document_path: str | Path,
                  check_links: bool = False) -> SemanticValidationReport:
         """
         Validate one markdown document against the schema extensions.

         Args:
             document_path: Location of the markdown document.
             check_links: Reserved for link validation; currently has no
                 effect because link validation is not implemented.

         Returns:
             SemanticValidationReport holding the section and content results
             (``link_result`` is always None for now).

         Raises:
             FileNotFoundError: If document_path doesn't exist.
         """
         source = Path(document_path)
         if not source.exists():
             raise FileNotFoundError(f"Document not found: {source}")

         parsed = self._parse_document(source)

         # TODO: when link validation lands, run it here if check_links is set
         return SemanticValidationReport(
             section_result=self.section_validator.check(parsed),
             content_result=self.content_validator.check(parsed),
             link_result=None,
         )

     def _parse_document(self, document_path: Path) -> 'MarkdownDocument':
         """
         Parse the markdown file via markitect's DocumentManager.

         Args:
             document_path: Path to the markdown file.

         Returns:
             Whatever ``DocumentManager.ingest_file`` returns for the path.
         """
         # Imported lazily to avoid a circular dependency
         from markitect.document_manager import DocumentManager

         return DocumentManager().ingest_file(document_path)
 def load_schema_from_path(schema_path: str | Path) -> Dict[str, Any]:
     """
     Load a schema from file (supports .json and .md formats).

     The file extension is matched case-insensitively, so ``schema.JSON``
     and ``schema.MD`` are accepted as well.

     Args:
         schema_path: Path to schema file

     Returns:
         The schema dict. For ``.json`` this is the parsed file content; for
         ``.md`` it is the ``'schema'`` entry of what MarkdownSchemaLoader
         returns.

     Raises:
         FileNotFoundError: If schema file doesn't exist
         ValueError: If the extension is unsupported or the JSON is invalid
             (json.JSONDecodeError is a ValueError subclass)
     """
     schema_path = Path(schema_path)
     if not schema_path.exists():
         raise FileNotFoundError(f"Schema not found: {schema_path}")

     # Normalise case once; the error message below still shows the
     # extension exactly as the caller wrote it.
     suffix = schema_path.suffix.lower()

     if suffix == '.json':
         # Load JSON schema directly
         with schema_path.open('r', encoding='utf-8') as f:
             return json.load(f)

     if suffix == '.md':
         # Load markdown schema with embedded JSON
         from markitect.schema_loader import MarkdownSchemaLoader

         return MarkdownSchemaLoader().load_schema(schema_path)['schema']

     raise ValueError(f"Unsupported schema format: {schema_path.suffix}")
--- a/markitect/validators/__init__.py
+++ b/markitect/validators/__init__.py
@@ -0,0 +1,50 @@
 """
 Validators package for semantic document validation.
 This package contains validators that check markdown documents against
 x-markitect schema extensions (sections, content-control, link validation).
 Validators:
    - SectionValidator: Validates section presence based on classifications
    - ContentValidator: Validates content patterns and quality metrics
    - LinkValidator: Validates internal and external links (planned; not yet implemented or exported)
 """
 from markitect.validators.section_validator import (
    SectionValidator,
    SectionValidationResult,
    SectionIssue,
    SectionMissing,
    SectionImproper,
    SectionDiscouraged,
 )
 from markitect.validators.content_validator import (
    ContentValidator,
    ContentValidationResult,
    ContentIssue,
    PatternMissing,
    ForbiddenPattern,
    DiscouragedPattern,
    ContentTooShort,
    ContentTooLong,
 )
 __all__ = [
    # Section validator
    'SectionValidator',
    'SectionValidationResult',
    'SectionIssue',
    'SectionMissing',
    'SectionImproper',
    'SectionDiscouraged',
    # Content validator
    'ContentValidator',
    'ContentValidationResult',
    'ContentIssue',
    'PatternMissing',
    'ForbiddenPattern',
    'DiscouragedPattern',
    'ContentTooShort',
    'ContentTooLong',
 ]
--- a/markitect/validators/content_validator.py
+++ b/markitect/validators/content_validator.py
@@ -0,0 +1,316 @@
 """
 Content Validator for markdown documents.
 Validates content against x-markitect-content-control rules:
 - Required patterns: Regex patterns that must appear in content
 - Discouraged patterns: Patterns that should be avoided (warnings)
 - Forbidden patterns: Patterns that must not appear (errors)
 - Quality metrics: Word counts, sentence counts (readability is not yet implemented)
 """
 from dataclasses import dataclass
 from typing import List, Dict, Any, Optional
 import re
@dataclass
class ContentIssue:
    """
    One finding produced by content validation.

    Serves as the common base for the more specific content issue types.
    """
    section_name: str
    severity: str  # 'ERROR', 'WARNING', 'INFO'
    message: str
    line_number: Optional[int] = None
    matched_text: Optional[str] = None

    def __str__(self) -> str:
        """Render as ``[SEVERITY] (line N) SECTION - message: 'match'``."""
        rendered = f"[{self.severity}]"
        # The line part is omitted when no (truthy) line number is known
        if self.line_number:
            rendered += f" (line {self.line_number})"
        rendered += f" {self.section_name} - {self.message}"
        # The matched text is appended only when there is some
        if self.matched_text:
            rendered += f": '{self.matched_text}'"
        return rendered
@dataclass
class PatternMissing(ContentIssue):
    """Issue for a required pattern that has no match in the content."""
    pattern: str = ""  # the regex that was expected to match
@dataclass
class ForbiddenPattern(ContentIssue):
    """Issue for a forbidden pattern that matched somewhere in the content."""
    pattern: str = ""  # the regex that must not match
@dataclass
class DiscouragedPattern(ContentIssue):
    """Issue for a discouraged pattern that matched in the content."""
    pattern: str = ""  # the regex that should preferably not match
@dataclass
class ContentTooShort(ContentIssue):
    """Issue for content below a minimum word or sentence count."""
    actual: int = 0    # count measured in the content
    required: int = 0  # minimum demanded by the schema
@dataclass
class ContentTooLong(ContentIssue):
    """Issue for content above a maximum word/sentence count."""
    actual: int = 0  # count measured in the content
    limit: int = 0   # maximum allowed by the schema
@dataclass
class ContentValidationResult:
    """
    Outcome of a content validation pass.

    Holds every issue found plus the number of sections that were checked,
    and offers severity-based queries over the issues.
    """
    issues: List[ContentIssue]
    sections_checked: int

    def _by_severity(self, level: str) -> List[ContentIssue]:
        """Return the issues whose severity equals ``level``, in order."""
        return [entry for entry in self.issues if entry.severity == level]

    def has_errors(self) -> bool:
        """Tell whether at least one issue is ERROR-level."""
        return bool(self._by_severity('ERROR'))

    def has_warnings(self) -> bool:
        """Tell whether at least one issue is WARNING-level."""
        return bool(self._by_severity('WARNING'))

    def is_valid(self) -> bool:
        """Validation passes when there are no ERROR-level issues."""
        return not self.has_errors()

    def get_errors(self) -> List[ContentIssue]:
        """Return a new list with the ERROR-level issues."""
        return self._by_severity('ERROR')

    def get_warnings(self) -> List[ContentIssue]:
        """Return a new list with the WARNING-level issues."""
        return self._by_severity('WARNING')
 class ContentValidator:
    """
    Validates content against x-markitect-content-control rules.
    Checks content patterns, quality metrics, and readability for each section.
    """
    def __init__(self, schema: Dict[str, Any]):
        """
        Initialize validator with a schema.
        Args:
            schema: JSON schema with x-markitect-content-control extension
        """
        self.schema = schema
        self.content_rules = schema.get('x-markitect-content-control', {})
    def check(self, document: 'MarkdownDocument') -> ContentValidationResult:
        """
        Validate content against schema rules.
        Args:
            document: Parsed markdown document
        Returns:
            ContentValidationResult with any issues found
        """
        issues = []
        sections_checked = 0
        # Check each section that has content rules
        for section_key, rules in self.content_rules.items():
            sections_checked += 1
            # Get section from document
            section = self._get_section(document, section_key)
            if not section:
                # Section validator handles missing sections
                continue
            section_content = section.get('content', '')
            section_name = section.get('name', section_key)
            # Check required patterns
            issues.extend(self._check_required_patterns(
                section_name, section_content, rules
            ))
            # Check forbidden patterns
            issues.extend(self._check_forbidden_patterns(
                section_name, section_content, rules
            ))
            # Check discouraged patterns
            issues.extend(self._check_discouraged_patterns(
                section_name, section_content, rules
            ))
            # Check content quality metrics
            issues.extend(self._check_quality_metrics(
                section_name, section_content, rules
            ))
        return ContentValidationResult(
            issues=issues,
            sections_checked=sections_checked
        )
    def _get_section(self, document: 'MarkdownDocument',
                     section_key: str) -> Optional[Dict[str, Any]]:
        """
        Get a section from the document.
        Args:
            document: Parsed markdown document
            section_key: Section name (lowercase in rules, uppercase in document)
        Returns:
            Section dict with name and content, or None if not found
        """
        # Convert section_key to uppercase for matching
        section_name = section_key.upper()
        # Try to get section content
        if hasattr(document, 'get_section'):
            return document.get_section(section_name)
        # Fallback: search headings
        if hasattr(document, 'get_headings_by_level'):
            headings = document.get_headings_by_level(2)
            for heading in headings:
                if isinstance(heading, dict):
                    if heading.get('content', '').strip().upper() == section_name:
                        # Found the section, need to extract content
                        return {
                            'name': section_name,
                            'content': heading.get('text_content', '')
                        }
        return None
    def _check_required_patterns(self, section_name: str, content: str,
                                rules: Dict[str, Any]) -> List[ContentIssue]:
        """Check that all required patterns appear in content."""
        issues = []
        required_patterns = rules.get('required_patterns', [])
        for pattern in required_patterns:
            try:
                if not re.search(pattern, content, re.MULTILINE):
                    issues.append(PatternMissing(
                        section_name=section_name,
                        severity='ERROR',
                        message=f'Required pattern not found',
                        pattern=pattern
                    ))
            except re.error as e:
                # Invalid regex pattern in schema
                issues.append(ContentIssue(
                    section_name=section_name,
                    severity='ERROR',
                    message=f'Invalid regex pattern in schema: {e}'
                ))
        return issues
    def _check_forbidden_patterns(self, section_name: str, content: str,
                                  rules: Dict[str, Any]) -> List[ContentIssue]:
        """Check that no forbidden patterns appear in content."""
        issues = []
        forbidden_patterns = rules.get('forbidden_patterns', [])
        for pattern in forbidden_patterns:
            try:
                match = re.search(pattern, content, re.MULTILINE)
                if match:
                    issues.append(ForbiddenPattern(
                        section_name=section_name,
                        severity='ERROR',
                        message=f'Forbidden pattern found',
                        pattern=pattern,
                        matched_text=match.group(0)[:50]  # Limit to 50 chars
                    ))
            except re.error as e:
                issues.append(ContentIssue(
                    section_name=section_name,
                    severity='ERROR',
                    message=f'Invalid regex pattern in schema: {e}'
                ))
        return issues
    def _check_discouraged_patterns(self, section_name: str, content: str,
                                   rules: Dict[str, Any]) -> List[ContentIssue]:
        """Check for discouraged patterns (warnings)."""
        issues = []
        discouraged_patterns = rules.get('discouraged_patterns', [])
        for pattern in discouraged_patterns:
            try:
                match = re.search(pattern, content, re.MULTILINE)
                if match:
                    issues.append(DiscouragedPattern(
                        section_name=section_name,
                        severity='WARNING',
                        message=f'Discouraged pattern found',
                        pattern=pattern,
                        matched_text=match.group(0)[:50]
                    ))
            except re.error as e:
                issues.append(ContentIssue(
                    section_name=section_name,
                    severity='WARNING',
                    message=f'Invalid regex pattern in schema: {e}'
                ))
        return issues
    def _check_quality_metrics(self, section_name: str, content: str,
                              rules: Dict[str, Any]) -> List[ContentIssue]:
        """Check content quality metrics (word count, sentence count)."""
        issues = []
        quality = rules.get('content_quality', {})
        if not quality:
            return issues
        # Word count validation
        word_count = len(content.split())
        min_words = quality.get('min_words')
        if min_words is not None and word_count < min_words:
            issues.append(ContentTooShort(
                section_name=section_name,
                severity='WARNING',
                message=f'Content too short ({word_count} words, minimum {min_words})',
                actual=word_count,
                required=min_words
            ))
        max_words = quality.get('max_words')
        if max_words is not None and word_count > max_words:
            issues.append(ContentTooLong(
                section_name=section_name,
                severity='WARNING',
                message=f'Content too long ({word_count} words, maximum {max_words})',
                actual=word_count,
                limit=max_words
            ))
        # Sentence count validation
        min_sentences = quality.get('min_sentences')
        if min_sentences is not None:
            # Simple sentence count (split by .!?)
            sentence_count = len(re.findall(r'[.!?]+', content))
            if sentence_count < min_sentences:
                issues.append(ContentTooShort(
                    section_name=section_name,
                    severity='WARNING',
                    message=f'Too few sentences ({sentence_count}, minimum {min_sentences})',
                    actual=sentence_count,
                    required=min_sentences
                ))
        return issues
--- a/markitect/validators/section_validator.py
+++ b/markitect/validators/section_validator.py
@@ -0,0 +1,226 @@
 """
 Section Validator for markdown documents.
 Validates that document sections comply with x-markitect-sections classifications:
 - REQUIRED: Section must be present (ERROR if missing)
 - RECOMMENDED: Section should be present (WARNING if missing)
 - OPTIONAL: Section may be present (no check)
 - DISCOURAGED: Section should not be present (WARNING if present)
 - IMPROPER: Section must not be present (ERROR if present)
 """
 from dataclasses import dataclass
 from typing import List, Dict, Any, Optional
 from pathlib import Path
@dataclass
class SectionIssue:
    """
    One finding produced by section validation.

    Serves as the common base for the more specific section issue types.
    """
    section_name: str
    severity: str  # 'ERROR', 'WARNING', 'INFO'
    message: str
    classification: str  # 'required', 'recommended', etc.
    line_number: Optional[int] = None

    def __str__(self) -> str:
        """Render as ``[SEVERITY] (line N) SECTION: message``."""
        prefix = f"[{self.severity}]"
        # The line part is omitted when no (truthy) line number is known
        if self.line_number:
            prefix = f"{prefix} (line {self.line_number})"
        return f"{prefix} {self.section_name}: {self.message}"
@dataclass
class SectionMissing(SectionIssue):
    """Issue for a required or recommended section absent from the document."""
@dataclass
class SectionImproper(SectionIssue):
    """Issue for a section classified as improper that is present anyway."""
@dataclass
class SectionDiscouraged(SectionIssue):
    """Issue for a section classified as discouraged that is present."""
@dataclass
class SectionValidationResult:
    """
    Outcome of a section validation pass.

    Holds every issue found together with the number of section specs
    checked and the number of sections found in the document.
    """
    issues: List[SectionIssue]
    sections_checked: int
    sections_found: int

    def _by_severity(self, level: str) -> List[SectionIssue]:
        """Return the issues whose severity equals ``level``, in order."""
        return [entry for entry in self.issues if entry.severity == level]

    def has_errors(self) -> bool:
        """Tell whether at least one issue is ERROR-level."""
        return bool(self._by_severity('ERROR'))

    def has_warnings(self) -> bool:
        """Tell whether at least one issue is WARNING-level."""
        return bool(self._by_severity('WARNING'))

    def is_valid(self) -> bool:
        """Validation passes when there are no ERROR-level issues."""
        return not self.has_errors()

    def get_errors(self) -> List[SectionIssue]:
        """Return a new list with the ERROR-level issues."""
        return self._by_severity('ERROR')

    def get_warnings(self) -> List[SectionIssue]:
        """Return a new list with the WARNING-level issues."""
        return self._by_severity('WARNING')
 class SectionValidator:
     """
     Checks section presence against x-markitect-sections classifications.

     A required or recommended section that is missing, or an improper or
     discouraged section that is present, produces an issue; any other
     classification produces none.
     """

     def __init__(self, schema: Dict[str, Any]):
         """
         Store the schema and pull out its section specifications.

         Args:
             schema: JSON schema with x-markitect-sections extension; a
                 schema without that key yields no specifications.
         """
         self.schema = schema
         self.sections_spec = schema.get('x-markitect-sections', {})

     def check(self, document: 'MarkdownDocument') -> SectionValidationResult:
         """
         Compare the document's level-2 sections with the schema.

         Args:
             document: Parsed markdown document

         Returns:
             SectionValidationResult listing the issues, the number of specs
             checked and the number of level-2 sections in the document.
         """
         present = self._get_document_sections(document)
         found_issues = []

         for section_name, spec in self.sections_spec.items():
             classification = spec.get('classification')
             located = self._find_section(section_name, present, spec)

             if classification == 'required' and not located:
                 found_issues.append(SectionMissing(
                     section_name=section_name,
                     severity='ERROR',
                     message=spec.get('error_message', f'{section_name} section is required'),
                     classification='required'
                 ))
             elif classification == 'improper' and located:
                 found_issues.append(SectionImproper(
                     section_name=section_name,
                     severity='ERROR',
                     message=spec.get('error_message', f'{section_name} section must not appear'),
                     classification='improper',
                     line_number=located.get('line_number')
                 ))
             elif classification == 'recommended' and not located:
                 found_issues.append(SectionMissing(
                     section_name=section_name,
                     severity='WARNING',
                     message=spec.get('warning_if_missing', f'{section_name} section is recommended'),
                     classification='recommended'
                 ))
             elif classification == 'discouraged' and located:
                 found_issues.append(SectionDiscouraged(
                     section_name=section_name,
                     severity='WARNING',
                     message=spec.get('warning_if_present', f'{section_name} section is discouraged'),
                     classification='discouraged',
                     line_number=located.get('line_number')
                 ))

         return SectionValidationResult(
             issues=found_issues,
             sections_checked=len(self.sections_spec),
             sections_found=len(present)
         )

     def _get_document_sections(self, document: 'MarkdownDocument') -> List[Dict[str, Any]]:
         """
         Collect the document's level-2 headings as section records.

         Args:
             document: Parsed markdown document

         Returns:
             One dict per heading with an upper-cased, stripped ``name`` and
             a ``line_number`` (None for plain-string headings). Headings
             that are neither dict nor str are skipped.
         """
         # Prefer the query method, then a raw `headings` list, else nothing
         if hasattr(document, 'get_headings_by_level'):
             headings = document.get_headings_by_level(2)
         elif hasattr(document, 'headings'):
             headings = [h for h in document.headings if h.get('level') == 2]
         else:
             headings = []

         collected = []
         for heading in headings:
             if isinstance(heading, dict):
                 title = heading.get('content', '')
                 line = heading.get('line_number')
             elif isinstance(heading, str):
                 title = heading
                 line = None
             else:
                 continue
             collected.append({
                 'name': title.strip().upper(),
                 'line_number': line
             })
         return collected

     def _find_section(self, section_name: str, doc_sections: List[Dict[str, Any]],
                       spec: Dict[str, Any]) -> Optional[Dict[str, Any]]:
         """
         Locate a section by its primary name or one of its alternatives.

         Args:
             section_name: Primary section name to find
             doc_sections: Section records taken from the document
             spec: Section specification; may list ``alternatives``

         Returns:
             The first matching section record, trying the primary name
             before the alternatives in order, or None if nothing matches.
         """
         def candidate_names():
             # Lazy, so alternatives are only read after the primary name misses
             yield section_name
             yield from spec.get('alternatives', [])

         for candidate in candidate_names():
             wanted = candidate.upper().strip()
             for section in doc_sections:
                 if section['name'] == wanted:
                     return section
         return None
--- a/roadmap/20260106-semantic-document-validation/WORKPLAN.md
+++ b/roadmap/20260106-semantic-document-validation/WORKPLAN.md
@@ -0,0 +1,573 @@
 # Plan: Schema System Enhancement - Semantic Document Validation
 ## Overview
 The schema management system has **complete schema structure analysis tools** (schema-analyze, schema-refine) and **structural AST validation** (markitect validate), but is missing **semantic validation capabilities**. This plan enhances validation to check sections, content patterns, and quality metrics defined in x-markitect extensions.
 ## Current State Assessment
 ### ✅ Already Implemented
 - **schema-analyze**: Detects rigid constraints, calculates rigidity score (markitect/schema_analyzer.py)
 - **schema-refine**: Automatically loosens rigid constraints (markitect/schema_refiner.py)
 - **markitect validate**: Validates AST structure against JSON schemas (cli.py:1493-1600)
  - Checks headings, paragraphs, code_blocks counts match schema
  - Validates document structure against JSON Schema properties
  - Does NOT check x-markitect-sections classifications
  - Does NOT validate x-markitect-content-control patterns
 - **X-Markitect Extensions**: Full system with sections, content-control, metadata
 - **Metaschema Validation**: Validates schema structure and extensions
 - **4 Production Schemas**: manpage, API docs, terminology, schema-schema
 - **Comprehensive Documentation**: User guides, specifications, tests (97 tests passing)
 ### ❌ Missing Capabilities (Semantic Validation)
 1. **Section Classification Enforcement**: required/recommended/optional/discouraged/improper not checked
 2. **Content Pattern Validation**: required_patterns, forbidden_patterns not matched
 3. **Quality Metrics Validation**: min_words, max_words, min_sentences not enforced
 4. **Link Validation**: Internal/external link checking not implemented
 5. **Content Instructions**: content_instruction fields defined but not validated
 ## What We Have vs What We Need
 **Current `markitect validate`** (Structural):
 ```bash
 markitect validate doc.md --schema schema.json
 # ✅ Checks: headings.level_2 has 5-30 items
 # ✅ Checks: paragraphs has 10-500 items
 # ✅ Checks: code_blocks has 1-50 items
 # ❌ Does NOT check: SYNOPSIS section present (required)
 # ❌ Does NOT check: INTERNAL_NOTES absent (improper)
 # ❌ Does NOT check: Synopsis contains bold command name
 # ❌ Does NOT check: Description has min 50 words
 ```
 **Enhanced `markitect validate`** (Structural + Semantic):
 ```bash
 markitect validate doc.md --schema manpage-schema-v1.0.md
 # ✅ Checks: AST structure (existing)
 # ✅ NEW: SYNOPSIS section present (required)
 # ✅ NEW: INTERNAL_NOTES not present (improper)
 # ✅ NEW: Synopsis contains **command** pattern
 # ✅ NEW: Description has 50+ words
 # ✅ NEW: No forbidden TODO patterns
 ```
 ## Implementation Plan
 ### Phase 1: Core Semantic Validator
 **Goal**: Create semantic validator to complement existing structural validation
 **New Module**: `markitect/semantic_validator.py`
 **Key Components**:
 ```python
 class SemanticValidator:
    """Validates markdown documents against x-markitect extensions.
    Complements existing SchemaValidator which handles structural AST validation.
    This validator checks semantic aspects defined in x-markitect-* extensions.
    """
    def __init__(self, schema_path: str):
        # Load schema (supports .md schemas with embedded JSON)
        self.schema = load_schema_with_extensions(schema_path)
        # Initialize sub-validators
        self.section_validator = SectionValidator(self.schema)
        self.content_validator = ContentValidator(self.schema)
        self.link_validator = LinkValidator(self.schema)
    def validate(self, document_path: str, check_links: bool = False) -> SemanticValidationReport:
        """Main semantic validation entry point."""
        doc = parse_markdown_document(document_path)
        results = {
            'sections': self.section_validator.check(doc),
            'content': self.content_validator.check(doc)
        }
        if check_links:
            results['links'] = self.link_validator.check(doc)
        return SemanticValidationReport(results)
 ```
 **Features**:
 - Load schema from registry or filesystem
 - Parse markdown document into AST
 - Validate sections against x-markitect-sections classifications
 - Check content against x-markitect-content-control patterns
 - Validate links if enabled
 - Generate detailed report with line numbers
 ### Phase 2: Section Presence Validator
 **New Module**: `markitect/validators/section_validator.py`
 **Validation Rules**:
 ```python
 class SectionValidator:
    """Validates section presence and classification compliance."""
    def check(self, document: MarkdownDocument) -> SectionValidationResult:
        sections_spec = self.schema.get('x-markitect-sections', {})
        doc_sections = document.get_headings_by_level(2)
        issues = []
        # Check REQUIRED sections
        for section_name, spec in sections_spec.items():
            if spec['classification'] == 'required':
                if section_name not in doc_sections:
                    issues.append(SectionMissing(
                        section=section_name,
                        severity='ERROR',
                        message=spec.get('error_message', f'{section_name} is required')
                    ))
        # Check IMPROPER sections (must not exist)
        for section_name, spec in sections_spec.items():
            if spec['classification'] == 'improper':
                if section_name in doc_sections:
                    issues.append(SectionImproper(
                        section=section_name,
                        severity='ERROR',
                        message=spec.get('error_message', f'{section_name} must not appear')
                    ))
        # Check RECOMMENDED sections (warnings)
        for section_name, spec in sections_spec.items():
            if spec['classification'] == 'recommended':
                if section_name not in doc_sections:
                    issues.append(SectionMissing(
                        section=section_name,
                        severity='WARNING',
                        message=spec.get('warning_if_missing', f'{section_name} is recommended')
                    ))
        return SectionValidationResult(issues)
 ```
 **Section Classification Enforcement**:
 - REQUIRED → ERROR if missing
 - RECOMMENDED → WARNING if missing
 - OPTIONAL → No check
 - DISCOURAGED → WARNING if present
 - IMPROPER → ERROR if present
 ### Phase 3: Content Pattern Validator
 **New Module**: `markitect/validators/content_validator.py`
 **Pattern Matching**:
 ```python
 class ContentValidator:
    """Validates content against x-markitect-content-control rules."""
    def check(self, document: MarkdownDocument) -> ContentValidationResult:
        content_rules = self.schema.get('x-markitect-content-control', {})
        issues = []
        for section_key, rules in content_rules.items():
            section = document.get_section(section_key.upper())
            if not section:
                continue  # Section validator handles missing sections
            # Check required patterns
            for pattern in rules.get('required_patterns', []):
                if not re.search(pattern, section.content):
                    issues.append(PatternMissing(
                        section=section.name,
                        pattern=pattern,
                        severity='ERROR'
                    ))
            # Check forbidden patterns
            for pattern in rules.get('forbidden_patterns', []):
                match = re.search(pattern, section.content)
                if match:
                    issues.append(ForbiddenPattern(
                        section=section.name,
                        pattern=pattern,
                        severity='ERROR',
                        matched_text=match.group(0)
                    ))
            # Check content quality
            quality = rules.get('content_quality', {})
            word_count = len(section.content.split())
            if 'min_words' in quality and word_count < quality['min_words']:
                issues.append(ContentTooShort(
                    section=section.name,
                    actual=word_count,
                    required=quality['min_words'],
                    severity='WARNING'
                ))
            if 'max_words' in quality and word_count > quality['max_words']:
                issues.append(ContentTooLong(
                    section=section.name,
                    actual=word_count,
                    limit=quality['max_words'],
                    severity='WARNING'
                ))
        return ContentValidationResult(issues)
 ```
 **Content Rules Checked**:
 - Required patterns (regex matches)
 - Discouraged patterns (warnings)
 - Forbidden patterns (errors)
 - Word count ranges (min/max)
 - Sentence counts (if specified)
 ### Phase 4: Link Validator
 **New Module**: `markitect/validators/link_validator.py`
 **Link Checking**:
 ```python
 class LinkValidator:
    """Validates links according to x-markitect-content-control.link_validation."""
    def check(self, document: MarkdownDocument) -> LinkValidationResult:
        link_config = self.schema.get('x-markitect-content-control', {}).get('link_validation', {})
        if not any(link_config.values()):
            return LinkValidationResult([])  # No link validation configured
        links = document.extract_links()
        issues = []
        for link in links:
            # Check internal links
            if link.is_internal() and link_config.get('check_internal', False):
                target = document.resolve_internal_link(link.target)
                if not target:
                    issues.append(BrokenInternalLink(
                        link=link.target,
                        line=link.line_number,
                        severity='ERROR'
                    ))
            # Check external links
            if link.is_external() and link_config.get('check_external', False):
                # HTTP HEAD request with timeout
                if not self._check_url_exists(link.target):
                    issues.append(BrokenExternalLink(
                        link=link.target,
                        line=link.line_number,
                        severity='WARNING'  # External links are warnings
                    ))
            # Check fragments
            if link.has_fragment() and not link_config.get('allow_fragments', True):
                issues.append(FragmentNotAllowed(
                    link=link.target,
                    line=link.line_number,
                    severity='WARNING'
                ))
        return LinkValidationResult(issues)
 ```
 **Link Types Validated**:
 - Internal links (to other sections/documents)
 - External links (HTTP/HTTPS URLs)
 - Fragment identifiers (#section-name)
 - Email links (mailto:)
 ### Phase 5: CLI Integration
 **Enhance Existing Command**: `markitect validate` (cli.py:1493-1600)
 **New Options to Add**:
 ```python
@cli.command('validate')
@click.argument('file_path', type=click.Path(exists=True, path_type=Path))
@click.option('--schema', '-s', type=click.Path(exists=True, path_type=Path),
              help='Path to JSON schema file')
@click.option('--schema-json', type=str,
              help='JSON schema provided as a string')
@click.option('--quiet', '-q', is_flag=True,
              help='Only output validation result (true/false)')
@click.option('--detailed-errors', '--errors', is_flag=True,
              help='Show detailed validation errors (Issue #8)')
@click.option('--error-format', type=click.Choice(['text', 'json', 'markdown']), default='text',
              help='Format for detailed error output')
 # NEW OPTIONS:
@click.option('--semantic/--no-semantic', default=True,
              help='Enable/disable semantic validation (sections, patterns, quality)')
@click.option('--check-links', is_flag=True,
              help='Enable link validation (may be slow)')
@click.option('--strict', is_flag=True,
              help='Treat warnings as errors')
@pass_config
 def validate(config, file_path, schema, schema_json, quiet, detailed_errors, error_format,
             semantic, check_links, strict):
    """
    Validate a markdown file against a JSON schema.
    ENHANCED: Now includes semantic validation of x-markitect extensions:
    - Section classifications (required, recommended, optional, discouraged, improper)
    - Content patterns (required_patterns, forbidden_patterns)
    - Quality metrics (min_words, max_words, min_sentences)
    - Link validation (internal/external)
    Examples:
        # Structural + semantic validation (default)
        markitect validate doc.md --schema manpage-schema-v1.0.md
        # Only structural validation (classic mode)
        markitect validate doc.md --schema schema.json --no-semantic
        # With link checking
        markitect validate doc.md --schema 1 --check-links
        # Strict mode (warnings become errors)
        markitect validate doc.md --schema manpage-schema-v1.0.md --strict
    """
    # Existing structural validation code...
    # (Keep all existing logic for SchemaValidator)
    # NEW: Add semantic validation if enabled and schema has x-markitect extensions
    if semantic:
        semantic_validator = SemanticValidator(schema)
        semantic_report = semantic_validator.validate(file_path, check_links=check_links)
        # Combine structural and semantic results
        combined_report = CombinedValidationReport(structural_result, semantic_report)
        # Output combined results
        if not quiet:
            click.echo(combined_report.format(error_format))
        # Exit codes
        if combined_report.has_errors():
            sys.exit(1)
        elif strict and combined_report.has_warnings():
            sys.exit(1)
 ```
 **Integration Strategy**:
 1. Keep existing structural validation (SchemaValidator) unchanged
 2. Add new semantic validation layer on top
 3. Use --no-semantic flag to disable new validation (backward compatibility)
 4. Combine structural + semantic results in unified report
 5. Default to semantic=True for new markdown schemas with extensions
 **Output Format** (text):
 ```
 Validating: my-command.1.md
 Schema: manpage-schema-v1.0.md (v1.0.0)
 Section Validation:
  ✅ SYNOPSIS - Present (required)
  ✅ DESCRIPTION - Present (required)
  ⚠️  EXAMPLES - Missing (recommended)
  ❌ INTERNAL_NOTES - Must not appear (improper)
 Content Validation:
  ✅ SYNOPSIS - Patterns matched
  ⚠️  DESCRIPTION - Too short (35 words, minimum 50)
  ❌ SYNOPSIS - Forbidden pattern found: "TODO"
 Link Validation: (skipped - use --check-links)
 Summary:
  Errors: 2
  Warnings: 2
  Status: FAILED ❌
 Failed validations:
  Line 12: INTERNAL_NOTES section must not appear in published manpages
  Line 5: SYNOPSIS contains forbidden pattern "TODO"
 ```
 ### Phase 6: Batch Document Validation
 **New Command**: `markitect validate-batch`
 ```python
@cli.command('validate-batch')
@click.argument('directory', type=click.Path(exists=True, file_okay=False))
@click.option('--schema', '-s', type=str, required=True)
@click.option('--pattern', default='*.md', help='File pattern to match')
@click.option('--strict', is_flag=True)
@click.option('--summary-only', is_flag=True, help='Show only summary table')
@pass_config
 def validate_batch_cmd(config, directory, schema, pattern, strict, summary_only):
    """Validate multiple documents in a directory.
    Example:
        markitect validate-batch docs/manpages/ --schema manpage-schema-v1.0.md
    """
    # Find all matching documents
    docs = list(Path(directory).glob(pattern))
    # Validate each
    results = []
    for doc in docs:
        validator = SemanticValidator(schema)
        report = validator.validate(doc)
        results.append((doc.name, report))
    # Show summary table
    display_batch_results(results)
 ```
 ## Implementation Phases
 ### Phase 1 (Core - 1 session)
 - SemanticValidator class
 - Basic section validation
 - CLI validate command
 - Simple text output format
 ### Phase 2 (Content - 1 session)
 - ContentValidator with pattern matching
 - Word count validation
 - Quality metrics checking
 - Enhanced reporting
 ### Phase 3 (Links - 1 session)
 - LinkValidator with internal link checking
 - Optional external link validation
 - Fragment validation
 - Performance optimization (caching)
 ### Phase 4 (Polish - 1 session)
 - Batch validation support
 - JSON/table output formats
 - Integration tests
 - Documentation updates
 ## Critical Files
 **New Files**:
 - `markitect/semantic_validator.py` - Main semantic validator (complements existing SchemaValidator)
 - `markitect/validators/section_validator.py` - Section classification enforcement
 - `markitect/validators/content_validator.py` - Content pattern matching and quality
 - `markitect/validators/link_validator.py` - Link validation
 - `markitect/validators/__init__.py` - Validators package
 - `tests/test_semantic_validator.py` - Semantic validator tests
 - `tests/validators/test_section_validator.py` - Section validator tests
 - `tests/validators/test_content_validator.py` - Content validator tests
 - `tests/validators/test_link_validator.py` - Link validator tests
 **Modified Files**:
 - `markitect/cli.py` (lines 1493-1600) - Enhance validate command with semantic validation
 - `markitect/schema_loader.py` - May need utility to extract x-markitect extensions
 - `docs/SCHEMA_MANAGEMENT_GUIDE.md` - Add semantic validation section
 - `examples/manpages/README.md` - Add validation examples
 - `examples/terminology/README.md` - Add validation examples
 **Reference Files** (unchanged, used for integration):
 - `markitect/validator.py` - Existing SchemaValidator for structural validation
 - `markitect/schema_analyzer.py` - Reference for schema extension parsing
 ## Design Decisions
 ### 1. Markdown Parsing
 **Decision**: Use existing markdown parser from markitect core
 **Rationale**: Already handles frontmatter, sections, AST generation
 ### 2. Link Validation Default
 **Decision**: Internal links checked by default, external links opt-in
 **Rationale**: External link checking is slow (network requests), internal is fast
 ### 3. Severity Levels
 **Decision**: ERROR (required violations), WARNING (recommended violations), INFO (suggestions)
 **Rationale**: Matches schema classification system semantics
 ### 4. Exit Codes
 **Decision**: 0=success, 1=validation failed, 2=system error
 **Rationale**: Standard CLI conventions for CI/CD integration
 ### 5. Pattern Syntax
 **Decision**: Use Python regex patterns directly
 **Rationale**: Schemas already use regex strings, no need for new syntax
 ## Testing Strategy
 ### Unit Tests
 - SectionValidator: Test all classification types
 - ContentValidator: Test pattern matching, word counts
 - LinkValidator: Test internal/external link checking
 - ValidationReport: Test formatting and aggregation
 ### Integration Tests
 - Validate real manpage documents against manpage schema
 - Validate terminology documents against terminology schema
 - Test batch validation across multiple documents
 - Test CLI output formats
 ### Edge Cases
 - Documents with no schema sections defined
 - Schemas with no content-control rules
 - Empty documents
 - Documents with malformed links
 - Unicode in patterns and content
 ## User Workflows
 ### Workflow 1: Validate Single Document
 ```bash
 # Validate a manpage
 markitect validate my-command.1.md --schema manpage-schema-v1.0.md
 # With link checking
 markitect validate my-command.1.md --schema 1 --check-links
 ```
 ### Workflow 2: CI/CD Integration
 ```bash
 #!/bin/bash
 # Validate all manpages in CI
 if ! markitect validate-batch docs/man/ --schema 1 --strict; then
    echo "Manpage validation failed!"
    exit 1
 fi
 ```
 ### Workflow 3: Pre-commit Hook
 ```bash
 # .git/hooks/pre-commit
 files=$(git diff --cached --name-only --diff-filter=ACM | grep '\.1\.md$')
 for file in $files; do
    if ! markitect validate "$file" --schema manpage-schema-v1.0.md; then
        echo "Fix validation errors before committing"
        exit 1
    fi
 done
 ```
 ### Workflow 4: Interactive Editing
 ```bash
 # Validate while editing
 watch -n 2 'markitect validate draft.md --schema api-documentation-schema-v1.0.md'
 ```
 ## Success Metrics
 1. **Core Functionality**: Can validate documents against all 4 production schemas
 2. **Classification Enforcement**: Required/improper sections properly checked
 3. **Pattern Matching**: Content patterns validated with regex
 4. **Performance**: Validate 100 documents in < 5 seconds (without link checking)
 5. **Test Coverage**: > 90% coverage for new validator modules
 6. **Documentation**: Complete examples for each schema type
 ## Future Enhancements (Out of Scope)
 - Auto-fixing document validation errors
 - Suggestion engine for missing content
 - Readability scoring with specific algorithms
 - Image validation (size, format, accessibility)
 - Schema evolution analysis (breaking changes between versions)
 - Document-to-schema generation (inverse of current flow)
--- a/tests/test_semantic_validator.py
+++ b/tests/test_semantic_validator.py
@@ -0,0 +1,506 @@
 """
 Tests for SemanticValidator.
 Tests semantic validation of markdown documents against x-markitect extensions.
 """
 import pytest
 from pathlib import Path
 import tempfile
 import json
 from markitect.semantic_validator import (
    SemanticValidator,
    SemanticValidationReport,
    load_schema_from_path
 )
 from markitect.validators.section_validator import (
    SectionValidator,
    SectionMissing,
    SectionImproper
 )
 from markitect.validators.content_validator import (
    ContentValidator,
    PatternMissing,
    ForbiddenPattern,
    DiscouragedPattern,
    ContentTooShort,
    ContentTooLong
 )
 class TestSectionValidator:
    """Test section validation functionality.
    Each test builds a minimal schema dict with an ``x-markitect-sections``
    entry, a mock document exposing ``get_headings_by_level``, and then
    inspects the result object returned by ``SectionValidator.check``.
    """
    def test_required_section_missing(self):
        """Test that missing required sections are detected as errors."""
        schema = {
            'x-markitect-sections': {
                'SYNOPSIS': {
                    'classification': 'required',
                    'heading_level': 2,
                    'error_message': 'SYNOPSIS section is mandatory'
                }
            }
        }
        validator = SectionValidator(schema)
        # Create a mock document without SYNOPSIS
        # (headings are supplied as plain strings here, not dicts)
        class MockDocument:
            def get_headings_by_level(self, level):
                return ['DESCRIPTION', 'EXAMPLES']
        doc = MockDocument()
        result = validator.check(doc)
        # Should have one error
        assert not result.is_valid()
        assert result.has_errors()
        assert len(result.get_errors()) == 1
        error = result.get_errors()[0]
        assert isinstance(error, SectionMissing)
        assert error.section_name == 'SYNOPSIS'
        assert error.severity == 'ERROR'
        # The schema's custom error_message must be carried into the issue
        assert 'mandatory' in error.message
    def test_improper_section_present(self):
        """Test that improper sections are detected as errors."""
        schema = {
            'x-markitect-sections': {
                'INTERNAL_NOTES': {
                    'classification': 'improper',
                    'heading_level': 2,
                    'error_message': 'Internal notes must not appear in published docs'
                }
            }
        }
        validator = SectionValidator(schema)
        # Create a mock document with INTERNAL_NOTES
        # (dict-shaped heading so a line number is available to report)
        class MockDocument:
            def get_headings_by_level(self, level):
                return [
                    {
                        'content': 'INTERNAL_NOTES',
                        'level': 2,
                        'line_number': 25
                    }
                ]
        doc = MockDocument()
        result = validator.check(doc)
        # Should have one error
        assert not result.is_valid()
        assert result.has_errors()
        assert len(result.get_errors()) == 1
        error = result.get_errors()[0]
        assert isinstance(error, SectionImproper)
        assert error.section_name == 'INTERNAL_NOTES'
        assert error.severity == 'ERROR'
        # Line number comes from the heading dict returned by the mock
        assert error.line_number == 25
    def test_recommended_section_missing(self):
        """Test that missing recommended sections generate warnings."""
        schema = {
            'x-markitect-sections': {
                'EXAMPLES': {
                    'classification': 'recommended',
                    'heading_level': 2,
                    'warning_if_missing': 'Examples improve documentation quality'
                }
            }
        }
        validator = SectionValidator(schema)
        # Create a mock document without EXAMPLES
        class MockDocument:
            def get_headings_by_level(self, level):
                return ['SYNOPSIS', 'DESCRIPTION']
        doc = MockDocument()
        result = validator.check(doc)
        # Should pass validation (warnings don't fail)
        assert result.is_valid()
        assert not result.has_errors()
        assert result.has_warnings()
        assert len(result.get_warnings()) == 1
        warning = result.get_warnings()[0]
        assert warning.section_name == 'EXAMPLES'
        assert warning.severity == 'WARNING'
    def test_all_required_sections_present(self):
        """Test that validation passes when all required sections present."""
        schema = {
            'x-markitect-sections': {
                'SYNOPSIS': {
                    'classification': 'required',
                    'heading_level': 2
                },
                'DESCRIPTION': {
                    'classification': 'required',
                    'heading_level': 2
                }
            }
        }
        validator = SectionValidator(schema)
        # Create a mock document with all required sections
        # (EXAMPLES is extra: it is not mentioned in the schema at all)
        class MockDocument:
            def get_headings_by_level(self, level):
                return [
                    {'content': 'SYNOPSIS', 'level': 2},
                    {'content': 'DESCRIPTION', 'level': 2},
                    {'content': 'EXAMPLES', 'level': 2}
                ]
        doc = MockDocument()
        result = validator.check(doc)
        # Should pass
        assert result.is_valid()
        assert not result.has_errors()
        assert not result.has_warnings()
        # The unlisted EXAMPLES heading must not produce any issue either
        assert len(result.issues) == 0
    def test_section_alternatives(self):
        """Test that alternative section names are recognized."""
        schema = {
            'x-markitect-sections': {
                'OPTIONS': {
                    'classification': 'required',
                    'heading_level': 2,
                    'alternatives': ['FLAGS', 'COMMAND OPTIONS']
                }
            }
        }
        validator = SectionValidator(schema)
        # Document uses alternative name 'FLAGS'
        class MockDocument:
            def get_headings_by_level(self, level):
                return [{'content': 'FLAGS', 'level': 2}]
        doc = MockDocument()
        result = validator.check(doc)
        # Should pass (alternative is accepted)
        assert result.is_valid()
        assert not result.has_errors()
 class TestSemanticValidator:
    """Test complete semantic validation.
    Covers validator construction from an in-memory schema dict, text
    formatting of ``SemanticValidationReport``, and the
    ``load_schema_from_path`` helper (success and failure paths).
    """
    def test_validator_initialization(self):
        """Test that validator initializes correctly."""
        schema = {
            '$schema': 'http://json-schema.org/draft-07/schema#',
            'x-markitect-sections': {
                'SYNOPSIS': {'classification': 'required', 'heading_level': 2}
            }
        }
        # A schema dict (not a file path) is handed to the constructor here
        validator = SemanticValidator(schema)
        assert validator.schema == schema
        assert validator.section_validator is not None
    def test_validation_report_formatting(self):
        """Test that validation reports format correctly."""
        # SectionValidationResult is not imported at module level, hence the
        # local import.
        from markitect.validators.section_validator import (
            SectionValidationResult,
            SectionMissing
        )
        # Build a section result by hand so no document parsing is involved
        section_result = SectionValidationResult(
            issues=[
                SectionMissing(
                    section_name='SYNOPSIS',
                    severity='ERROR',
                    message='SYNOPSIS is required',
                    classification='required'
                )
            ],
            sections_checked=2,
            sections_found=1
        )
        report = SemanticValidationReport(section_result=section_result)
        # Check report properties
        assert report.has_errors()
        assert not report.is_valid()
        # Check text formatting
        text = report.format_text()
        assert 'Section Validation:' in text
        assert 'SYNOPSIS' in text
        assert 'Errors: 1' in text
        assert 'FAILED' in text
    def test_load_json_schema(self, tmp_path):
        """Test loading a JSON schema file."""
        # tmp_path is pytest's per-test temporary directory fixture
        schema_file = tmp_path / "test-schema.json"
        schema_data = {
            '$schema': 'http://json-schema.org/draft-07/schema#',
            'title': 'Test Schema',
            'x-markitect-sections': {
                'SYNOPSIS': {'classification': 'required', 'heading_level': 2}
            }
        }
        schema_file.write_text(json.dumps(schema_data, indent=2))
        loaded_schema = load_schema_from_path(schema_file)
        # Round trip: the loaded dict must equal what was serialized
        assert loaded_schema == schema_data
        assert 'x-markitect-sections' in loaded_schema
    def test_schema_not_found(self):
        """Test that missing schema file raises error."""
        with pytest.raises(FileNotFoundError):
            load_schema_from_path('/nonexistent/schema.json')
    def test_unsupported_schema_format(self, tmp_path):
        """Test that unsupported format raises error."""
        # The file exists, so the failure must come from the .xml extension
        # or content rather than from a missing path.
        schema_file = tmp_path / "schema.xml"
        schema_file.write_text('<schema></schema>')
        with pytest.raises(ValueError, match="Unsupported schema format"):
            load_schema_from_path(schema_file)
 class TestContentValidator:
    """Test content validation functionality."""
    def test_required_pattern_missing(self):
        """Test that missing required patterns are detected.
        The schema key is lower-case ('synopsis') while the mock only answers
        to 'SYNOPSIS', so the test also relies on the validator upper-casing
        the key before calling ``get_section``.
        """
        schema = {
            'x-markitect-content-control': {
                'synopsis': {
                    'required_patterns': [
                        r'\*\*[a-z][a-z0-9-]*\*\*'  # Bold command name
                    ]
                }
            }
        }
        validator = ContentValidator(schema)
        # Create mock document without bold command
        class MockDocument:
            def get_section(self, name):
                if name == 'SYNOPSIS':
                    return {
                        'name': 'SYNOPSIS',
                        'content': 'command [options] arguments'  # No bold
                    }
                return None
        doc = MockDocument()
        result = validator.check(doc)
        # Should have one error
        assert not result.is_valid()
        assert result.has_errors()
        assert len(result.get_errors()) == 1
        error = result.get_errors()[0]
        assert isinstance(error, PatternMissing)
        assert error.section_name == 'SYNOPSIS'
        assert error.severity == 'ERROR'
    def test_forbidden_pattern_found(self):
        """Test that forbidden patterns are detected.
        Two patterns are forbidden (TODO, FIXME) but the content contains
        only TODO, so exactly one error is expected.
        """
        schema = {
            'x-markitect-content-control': {
                'description': {
                    'forbidden_patterns': [
                        r'\bTODO\b',
                        r'\bFIXME\b'
                    ]
                }
            }
        }
        validator = ContentValidator(schema)
        # Create mock document with forbidden pattern
        class MockDocument:
            def get_section(self, name):
                if name == 'DESCRIPTION':
                    return {
                        'name': 'DESCRIPTION',
                        'content': 'This is a description. TODO: Add more details.'
                    }
                return None
        doc = MockDocument()
        result = validator.check(doc)
        # Should have one error
        assert not result.is_valid()
        assert result.has_errors()
        assert len(result.get_errors()) == 1
        error = result.get_errors()[0]
        assert isinstance(error, ForbiddenPattern)
        assert error.section_name == 'DESCRIPTION'
        # The issue must expose the text that actually matched the pattern
        assert 'TODO' in error.matched_text
    def test_discouraged_pattern_warning(self):
        """A discouraged pattern produces a WARNING without failing validation.

        The ``description`` entry discourages the whole word ``WIP``, which
        the stub DESCRIPTION content contains.
        """
        content_rules = {
            'x-markitect-content-control': {
                'description': {'discouraged_patterns': [r'\bWIP\b']},
            },
        }

        # Stub document: DESCRIPTION mentions WIP.
        class StubDocument:
            def get_section(self, name):
                if name != 'DESCRIPTION':
                    return None
                return {
                    'name': 'DESCRIPTION',
                    'content': 'This is WIP content.',
                }

        outcome = ContentValidator(content_rules).check(StubDocument())

        # Warnings alone must not invalidate the document.
        assert outcome.is_valid()
        assert not outcome.has_errors()
        assert outcome.has_warnings()

        first_warning = outcome.get_warnings()[0]
        assert isinstance(first_warning, DiscouragedPattern)
        assert first_warning.severity == 'WARNING'
    def test_content_too_short(self):
        """Content below ``min_words`` is reported as a ContentTooShort warning.

        The ``description`` entry asks for 50 to 1000 words; the stub
        DESCRIPTION content has only two.
        """
        word_limits = {'min_words': 50, 'max_words': 1000}
        content_rules = {
            'x-markitect-content-control': {
                'description': {'content_quality': word_limits},
            },
        }

        # Stub document: DESCRIPTION is far shorter than the minimum.
        class StubDocument:
            def get_section(self, name):
                if name != 'DESCRIPTION':
                    return None
                return {
                    'name': 'DESCRIPTION',
                    'content': 'Short description.',  # Only 2 words
                }

        outcome = ContentValidator(content_rules).check(StubDocument())

        # A length problem is a warning, so the document stays valid.
        assert outcome.is_valid()
        assert outcome.has_warnings()

        first_warning = outcome.get_warnings()[0]
        assert isinstance(first_warning, ContentTooShort)
        assert first_warning.actual == 2
        assert first_warning.required == 50
    def test_content_too_long(self):
        """Content above ``max_words`` is reported as a ContentTooLong warning.

        The ``synopsis`` entry allows 5 to 20 words; the stub SYNOPSIS
        content is made of 50 words.
        """
        word_limits = {'min_words': 5, 'max_words': 20}
        content_rules = {
            'x-markitect-content-control': {
                'synopsis': {'content_quality': word_limits},
            },
        }
        fifty_words = ' '.join(['word'] * 50)  # 50 words

        # Stub document: SYNOPSIS exceeds the maximum word count.
        class StubDocument:
            def get_section(self, name):
                if name != 'SYNOPSIS':
                    return None
                return {
                    'name': 'SYNOPSIS',
                    'content': fifty_words,
                }

        outcome = ContentValidator(content_rules).check(StubDocument())

        # A length problem is a warning, so the document stays valid.
        assert outcome.is_valid()
        assert outcome.has_warnings()

        first_warning = outcome.get_warnings()[0]
        assert isinstance(first_warning, ContentTooLong)
        assert first_warning.actual == 50
        assert first_warning.limit == 20
    def test_all_content_requirements_met(self):
        """A section meeting every content rule produces no issues at all.

        The ``synopsis`` entry requires a bold lowercase word and 5 to 50
        words; the stub SYNOPSIS content provides both.
        """
        synopsis_rules = {
            'required_patterns': [r'\*\*[a-z]+\*\*'],
            'content_quality': {'min_words': 5, 'max_words': 50},
        }
        content_rules = {
            'x-markitect-content-control': {'synopsis': synopsis_rules},
        }

        # Stub document: SYNOPSIS has a bold command and enough words.
        class StubDocument:
            def get_section(self, name):
                if name != 'SYNOPSIS':
                    return None
                return {
                    'name': 'SYNOPSIS',
                    'content': '**command** [options] arguments and more words here',
                }

        outcome = ContentValidator(content_rules).check(StubDocument())

        # Nothing should be reported: no errors, no warnings, no issues.
        assert outcome.is_valid()
        assert not outcome.has_errors()
        assert not outcome.has_warnings()
        assert len(outcome.issues) == 0