feat: add semantic document validator for x-markitect extensions

Implements semantic validation to complement existing structural validation: Phase 1 & 2 Complete: - SemanticValidator: Main validator orchestrating sub-validators - SectionValidator: Enforces section classifications (required, recommended, optional, discouraged, improper) from x-markitect-sections - ContentValidator: Validates content patterns, forbidden patterns, and quality metrics (word counts, sentence counts) from x-markitect-content-control Features: - Pattern matching with regex for required/forbidden/discouraged patterns - Word count and sentence count validation - Detailed error reporting with severity levels (ERROR, WARNING) - Support for section alternatives (e.g., FLAGS vs OPTIONS) - Comprehensive test coverage (16 tests, 100% passing) Architecture: - Complements existing SchemaValidator (structural AST validation) - Clean separation: validators/ package for modular validators - Semantic validation focuses on x-markitect-* extensions - LinkValidator planned for Phase 3 (optional --check-links) Next: Phase 4 - CLI integration to enhance 'markitect validate' command Workplan: roadmap/20260106-semantic-document-validation/WORKPLAN.md 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-06 03:24:32 +01:00
parent f27eea6b5b
commit a969c5de47
6 changed files with 1932 additions and 0 deletions
--- a/markitect/validators/__init__.py
+++ b/markitect/validators/__init__.py
@@ -0,0 +1,50 @@
+"""
+Validators package for semantic document validation.
+
+This package contains validators that check markdown documents against
+x-markitect schema extensions (sections, content-control, link validation).
+
+Validators:
+    - SectionValidator: Validates section presence based on classifications
+    - ContentValidator: Validates content patterns and quality metrics
+    - LinkValidator: Validates internal and external links (planned; not yet part of this package)
+"""
+
+from markitect.validators.section_validator import (
+    SectionValidator,
+    SectionValidationResult,
+    SectionIssue,
+    SectionMissing,
+    SectionImproper,
+    SectionDiscouraged,
+)
+
+from markitect.validators.content_validator import (
+    ContentValidator,
+    ContentValidationResult,
+    ContentIssue,
+    PatternMissing,
+    ForbiddenPattern,
+    DiscouragedPattern,
+    ContentTooShort,
+    ContentTooLong,
+)
+
+# Public API of the package: both validators together with their result and
+# issue types, re-exported from the submodules imported above.
+__all__ = [
+    # Section validator
+    'SectionValidator',
+    'SectionValidationResult',
+    'SectionIssue',
+    'SectionMissing',
+    'SectionImproper',
+    'SectionDiscouraged',
+    # Content validator
+    'ContentValidator',
+    'ContentValidationResult',
+    'ContentIssue',
+    'PatternMissing',
+    'ForbiddenPattern',
+    'DiscouragedPattern',
+    'ContentTooShort',
+    'ContentTooLong',
+]
--- a/markitect/validators/content_validator.py
+++ b/markitect/validators/content_validator.py
@@ -0,0 +1,316 @@
+"""
+Content Validator for markdown documents.
+
+Validates content against x-markitect-content-control rules:
+- Required patterns: Regex patterns that must appear in content
+- Discouraged patterns: Patterns that should be avoided (warnings)
+- Forbidden patterns: Patterns that must not appear (errors)
+- Quality metrics: Word counts and sentence counts (readability checks are not implemented yet)
+"""
+
+from dataclasses import dataclass
+from typing import List, Dict, Any, Optional
+import re
+
+
+@dataclass
+class ContentIssue:
+    """A single finding produced while validating a section's content.
+
+    Acts as the base record for the more specific issue types in this module.
+    """
+    # Name of the section the finding belongs to.
+    section_name: str
+    # One of 'ERROR', 'WARNING' or 'INFO'.
+    severity: str
+    # Human-readable description of the problem.
+    message: str
+    # Source line of the finding, when known.
+    line_number: Optional[int] = None
+    # Excerpt of the text that triggered the finding, when there is one.
+    matched_text: Optional[str] = None
+
+    def __str__(self) -> str:
+        """Render as ``[SEVERITY] (line N) SECTION - message: 'match'``.
+
+        The line and match parts are left out when the corresponding field
+        is unset (or otherwise falsy).
+        """
+        rendered = f"[{self.severity}]"
+        if self.line_number:
+            rendered += f" (line {self.line_number})"
+        rendered += f" {self.section_name} - {self.message}"
+        if self.matched_text:
+            rendered += f": '{self.matched_text}'"
+        return rendered
+
+
+@dataclass
+class PatternMissing(ContentIssue):
+    """Required pattern not found in content."""
+    # The required regex, as given in the schema, that had no match.
+    pattern: str = ""
+
+
+@dataclass
+class ForbiddenPattern(ContentIssue):
+    """Forbidden pattern found in content."""
+    # The forbidden regex, as given in the schema, that matched.
+    pattern: str = ""
+
+
+@dataclass
+class DiscouragedPattern(ContentIssue):
+    """Discouraged pattern found in content."""
+    # The discouraged regex, as given in the schema, that matched.
+    pattern: str = ""
+
+
+@dataclass
+class ContentTooShort(ContentIssue):
+    """Content does not meet minimum word/sentence count."""
+    # Count measured in the section (words or sentences, depending on the check).
+    actual: int = 0
+    # Minimum count demanded by the schema.
+    required: int = 0
+
+
+@dataclass
+class ContentTooLong(ContentIssue):
+    """Content exceeds maximum word/sentence count."""
+    # Count measured in the section.
+    actual: int = 0
+    # Maximum count allowed by the schema.
+    limit: int = 0
+
+
+@dataclass
+class ContentValidationResult:
+    """Outcome of one content-validation run."""
+    # Issues collected by the validator.
+    issues: List[ContentIssue]
+    # Counter reported by the validator for this run.
+    sections_checked: int
+
+    def _with_severity(self, level: str) -> List[ContentIssue]:
+        """Return the issues whose severity equals ``level``."""
+        return [entry for entry in self.issues if entry.severity == level]
+
+    def has_errors(self) -> bool:
+        """Tell whether at least one issue is ERROR-level."""
+        for entry in self.issues:
+            if entry.severity == 'ERROR':
+                return True
+        return False
+
+    def has_warnings(self) -> bool:
+        """Tell whether at least one issue is WARNING-level."""
+        for entry in self.issues:
+            if entry.severity == 'WARNING':
+                return True
+        return False
+
+    def is_valid(self) -> bool:
+        """Tell whether validation passed; warnings alone do not fail it."""
+        if self.has_errors():
+            return False
+        return True
+
+    def get_errors(self) -> List[ContentIssue]:
+        """Return the ERROR-level issues, in their original order."""
+        return self._with_severity('ERROR')
+
+    def get_warnings(self) -> List[ContentIssue]:
+        """Return the WARNING-level issues, in their original order."""
+        return self._with_severity('WARNING')
+
+
+class ContentValidator:
+    """
+    Validates content against x-markitect-content-control rules.
+
+    For every section named in the schema extension this checks required,
+    forbidden and discouraged regex patterns, plus the word- and
+    sentence-count limits given under ``content_quality``.
+    """
+
+    def __init__(self, schema: Dict[str, Any]):
+        """
+        Initialize validator with a schema.
+
+        Args:
+            schema: JSON schema with x-markitect-content-control extension
+        """
+        self.schema = schema
+        # ``or {}`` also covers a schema that sets the extension to null.
+        self.content_rules = schema.get('x-markitect-content-control') or {}
+
+    def check(self, document: 'MarkdownDocument') -> ContentValidationResult:
+        """
+        Validate content against schema rules.
+
+        Args:
+            document: Parsed markdown document
+
+        Returns:
+            ContentValidationResult with any issues found. Its
+            ``sections_checked`` counts every rule entry in the schema,
+            including entries whose section is absent from the document.
+        """
+        issues: List[ContentIssue] = []
+        sections_checked = 0
+
+        for section_key, rules in self.content_rules.items():
+            sections_checked += 1
+
+            section = self._get_section(document, section_key)
+            if not section:
+                # Section validator handles missing sections
+                continue
+
+            # A section that exists but carries a null body is validated as
+            # empty text instead of crashing the regex and word-count checks.
+            section_content = section.get('content') or ''
+            section_name = section.get('name', section_key)
+
+            issues.extend(self._check_required_patterns(
+                section_name, section_content, rules
+            ))
+            issues.extend(self._check_forbidden_patterns(
+                section_name, section_content, rules
+            ))
+            issues.extend(self._check_discouraged_patterns(
+                section_name, section_content, rules
+            ))
+            issues.extend(self._check_quality_metrics(
+                section_name, section_content, rules
+            ))
+
+        return ContentValidationResult(
+            issues=issues,
+            sections_checked=sections_checked
+        )
+
+    def _get_section(self, document: 'MarkdownDocument',
+                     section_key: str) -> Optional[Dict[str, Any]]:
+        """
+        Get a section from the document.
+
+        Args:
+            document: Parsed markdown document
+            section_key: Section name as written in the rules; it is
+                upper-cased before being looked up in the document
+
+        Returns:
+            Section dict with name and content, or None if not found
+        """
+        section_name = section_key.upper()
+
+        # Preferred path: the document resolves sections itself. Its answer
+        # is returned as-is, even when it is None.
+        if hasattr(document, 'get_section'):
+            return document.get_section(section_name)
+
+        # Fallback: match the name against the level-2 headings.
+        if hasattr(document, 'get_headings_by_level'):
+            for heading in document.get_headings_by_level(2) or []:
+                if not isinstance(heading, dict):
+                    continue
+                # ``or ''`` tolerates a heading whose content is null.
+                title = (heading.get('content') or '').strip().upper()
+                if title == section_name:
+                    return {
+                        'name': section_name,
+                        'content': heading.get('text_content', '')
+                    }
+
+        return None
+
+    @staticmethod
+    def _invalid_pattern_issue(section_name: str, severity: str,
+                               pattern: Any, error: Exception) -> ContentIssue:
+        """Build the issue reported for a schema pattern that cannot be used."""
+        # The offending pattern is appended so the schema author can tell
+        # which entry of the list is broken.
+        return ContentIssue(
+            section_name=section_name,
+            severity=severity,
+            message=(
+                f'Invalid regex pattern in schema: {error} '
+                f'(pattern: {pattern!r})'
+            )
+        )
+
+    def _check_required_patterns(self, section_name: str, content: str,
+                                rules: Dict[str, Any]) -> List[ContentIssue]:
+        """
+        Check that all required patterns appear in content.
+
+        Returns:
+            One ERROR-level PatternMissing per pattern without a match, and
+            one ERROR-level ContentIssue per pattern that is not a usable
+            regular expression.
+        """
+        issues: List[ContentIssue] = []
+
+        for pattern in rules.get('required_patterns') or []:
+            try:
+                found = re.search(pattern, content, re.MULTILINE)
+            except (re.error, TypeError) as exc:
+                # re.error: malformed regex; TypeError: non-string entry.
+                issues.append(self._invalid_pattern_issue(
+                    section_name, 'ERROR', pattern, exc
+                ))
+                continue
+
+            if not found:
+                issues.append(PatternMissing(
+                    section_name=section_name,
+                    severity='ERROR',
+                    message='Required pattern not found',
+                    pattern=pattern
+                ))
+
+        return issues
+
+    def _check_unwanted_patterns(self, section_name: str, content: str,
+                                 patterns: Any, issue_type: type,
+                                 severity: str, message: str) -> List[ContentIssue]:
+        """Report every pattern in ``patterns`` that matches ``content``."""
+        issues: List[ContentIssue] = []
+
+        for pattern in patterns or []:
+            try:
+                match = re.search(pattern, content, re.MULTILINE)
+            except (re.error, TypeError) as exc:
+                # re.error: malformed regex; TypeError: non-string entry.
+                issues.append(self._invalid_pattern_issue(
+                    section_name, severity, pattern, exc
+                ))
+                continue
+
+            if match:
+                issues.append(issue_type(
+                    section_name=section_name,
+                    severity=severity,
+                    message=message,
+                    pattern=pattern,
+                    # Keep reports short: only the first 50 characters.
+                    matched_text=match.group(0)[:50]
+                ))
+
+        return issues
+
+    def _check_forbidden_patterns(self, section_name: str, content: str,
+                                  rules: Dict[str, Any]) -> List[ContentIssue]:
+        """Check that no forbidden patterns appear in content (ERROR-level)."""
+        return self._check_unwanted_patterns(
+            section_name, content, rules.get('forbidden_patterns'),
+            ForbiddenPattern, 'ERROR', 'Forbidden pattern found'
+        )
+
+    def _check_discouraged_patterns(self, section_name: str, content: str,
+                                   rules: Dict[str, Any]) -> List[ContentIssue]:
+        """Check for discouraged patterns (WARNING-level)."""
+        return self._check_unwanted_patterns(
+            section_name, content, rules.get('discouraged_patterns'),
+            DiscouragedPattern, 'WARNING', 'Discouraged pattern found'
+        )
+
+    def _check_quality_metrics(self, section_name: str, content: str,
+                              rules: Dict[str, Any]) -> List[ContentIssue]:
+        """
+        Check content quality metrics (word count, sentence count).
+
+        Reads ``min_words``, ``max_words`` and ``min_sentences`` from the
+        rule's ``content_quality`` mapping; every violation is a WARNING.
+        """
+        issues: List[ContentIssue] = []
+        quality = rules.get('content_quality') or {}
+
+        if not quality:
+            return issues
+
+        # Words are whitespace-separated tokens.
+        word_count = len(content.split())
+
+        min_words = quality.get('min_words')
+        if min_words is not None and word_count < min_words:
+            issues.append(ContentTooShort(
+                section_name=section_name,
+                severity='WARNING',
+                message=f'Content too short ({word_count} words, minimum {min_words})',
+                actual=word_count,
+                required=min_words
+            ))
+
+        max_words = quality.get('max_words')
+        if max_words is not None and word_count > max_words:
+            issues.append(ContentTooLong(
+                section_name=section_name,
+                severity='WARNING',
+                message=f'Content too long ({word_count} words, maximum {max_words})',
+                actual=word_count,
+                limit=max_words
+            ))
+
+        min_sentences = quality.get('min_sentences')
+        if min_sentences is not None:
+            # Heuristic: every run of '.', '!' or '?' counts as one sentence
+            # end, so abbreviations and decimals are counted as well.
+            sentence_count = len(re.findall(r'[.!?]+', content))
+
+            if sentence_count < min_sentences:
+                issues.append(ContentTooShort(
+                    section_name=section_name,
+                    severity='WARNING',
+                    message=f'Too few sentences ({sentence_count}, minimum {min_sentences})',
+                    actual=sentence_count,
+                    required=min_sentences
+                ))
+
+        return issues
--- a/markitect/validators/section_validator.py
+++ b/markitect/validators/section_validator.py
@@ -0,0 +1,226 @@
+"""
+Section Validator for markdown documents.
+
+Validates that document sections comply with x-markitect-sections classifications
+(written in lowercase in the schema, e.g. 'required'; matching is case-sensitive):
+- REQUIRED: Section must be present (ERROR if missing)
+- RECOMMENDED: Section should be present (WARNING if missing)
+- OPTIONAL: Section may be present (no check)
+- DISCOURAGED: Section should not be present (WARNING if present)
+- IMPROPER: Section must not be present (ERROR if present)
+"""
+
+from dataclasses import dataclass
+from typing import List, Dict, Any, Optional
+from pathlib import Path
+
+
+@dataclass
+class SectionIssue:
+    """A single finding about the presence or absence of a section.
+
+    Acts as the base record for the more specific issue types in this module.
+    """
+    # Name of the section the finding is about.
+    section_name: str
+    # One of 'ERROR', 'WARNING' or 'INFO'.
+    severity: str
+    # Human-readable description of the problem.
+    message: str
+    # Schema classification that triggered the finding ('required', ...).
+    classification: str
+    # Line of the offending heading, when known.
+    line_number: Optional[int] = None
+
+    def __str__(self) -> str:
+        """Render as ``[SEVERITY] (line N) SECTION: message``.
+
+        The line part is left out when ``line_number`` is unset (or falsy).
+        """
+        parts = [f"[{self.severity}]"]
+        if self.line_number:
+            parts.append(f"(line {self.line_number})")
+        parts.append(f"{self.section_name}: {self.message}")
+        return " ".join(parts)
+
+
+@dataclass
+class SectionMissing(SectionIssue):
+    """Section is missing from document.
+
+    Adds no fields of its own; the type alone tells the issue kinds apart.
+    """
+    pass
+
+
+@dataclass
+class SectionImproper(SectionIssue):
+    """Improper section found in document.
+
+    Adds no fields of its own; the type alone tells the issue kinds apart.
+    """
+    pass
+
+
+@dataclass
+class SectionDiscouraged(SectionIssue):
+    """Discouraged section found in document.
+
+    Adds no fields of its own; the type alone tells the issue kinds apart.
+    """
+    pass
+
+
+@dataclass
+class SectionValidationResult:
+    """Outcome of one section-validation run."""
+    # Issues collected by the validator.
+    issues: List[SectionIssue]
+    # Number of section specifications evaluated.
+    sections_checked: int
+    # Number of sections detected in the document.
+    sections_found: int
+
+    def _with_severity(self, level: str) -> List[SectionIssue]:
+        """Return the issues whose severity equals ``level``."""
+        return [entry for entry in self.issues if entry.severity == level]
+
+    def has_errors(self) -> bool:
+        """Tell whether at least one issue is ERROR-level."""
+        for entry in self.issues:
+            if entry.severity == 'ERROR':
+                return True
+        return False
+
+    def has_warnings(self) -> bool:
+        """Tell whether at least one issue is WARNING-level."""
+        for entry in self.issues:
+            if entry.severity == 'WARNING':
+                return True
+        return False
+
+    def is_valid(self) -> bool:
+        """Tell whether validation passed; warnings alone do not fail it."""
+        if self.has_errors():
+            return False
+        return True
+
+    def get_errors(self) -> List[SectionIssue]:
+        """Return the ERROR-level issues, in their original order."""
+        return self._with_severity('ERROR')
+
+    def get_warnings(self) -> List[SectionIssue]:
+        """Return the WARNING-level issues, in their original order."""
+        return self._with_severity('WARNING')
+
+
+class SectionValidator:
+    """
+    Validates section presence and classification compliance.
+
+    Checks that markdown documents have the correct sections based on
+    x-markitect-sections classifications in the schema. Only level-2
+    headings are treated as sections, and names are compared
+    case-insensitively after trimming whitespace.
+    """
+
+    def __init__(self, schema: Dict[str, Any]):
+        """
+        Initialize validator with a schema.
+
+        Args:
+            schema: JSON schema with x-markitect-sections extension
+        """
+        self.schema = schema
+        # ``or {}`` also covers a schema that sets the extension to null.
+        self.sections_spec = schema.get('x-markitect-sections') or {}
+
+    def check(self, document: 'MarkdownDocument') -> SectionValidationResult:
+        """
+        Validate section presence against schema classifications.
+
+        Args:
+            document: Parsed markdown document
+
+        Returns:
+            SectionValidationResult with any issues found. Specifications
+            whose classification is none of 'required', 'improper',
+            'recommended' or 'discouraged' produce no issue.
+        """
+        issues: List[SectionIssue] = []
+
+        # Get level-2 headings (main sections) from document
+        doc_sections = self._get_document_sections(document)
+
+        for section_name, spec in self.sections_spec.items():
+            classification = spec.get('classification')
+            section_in_doc = self._find_section(section_name, doc_sections, spec)
+
+            if classification == 'required':
+                if not section_in_doc:
+                    issues.append(SectionMissing(
+                        section_name=section_name,
+                        severity='ERROR',
+                        message=spec.get('error_message', f'{section_name} section is required'),
+                        classification='required'
+                    ))
+
+            elif classification == 'improper':
+                if section_in_doc:
+                    issues.append(SectionImproper(
+                        section_name=section_name,
+                        severity='ERROR',
+                        message=spec.get('error_message', f'{section_name} section must not appear'),
+                        classification='improper',
+                        line_number=section_in_doc.get('line_number')
+                    ))
+
+            elif classification == 'recommended':
+                if not section_in_doc:
+                    issues.append(SectionMissing(
+                        section_name=section_name,
+                        severity='WARNING',
+                        message=spec.get('warning_if_missing', f'{section_name} section is recommended'),
+                        classification='recommended'
+                    ))
+
+            elif classification == 'discouraged':
+                if section_in_doc:
+                    issues.append(SectionDiscouraged(
+                        section_name=section_name,
+                        severity='WARNING',
+                        message=spec.get('warning_if_present', f'{section_name} section is discouraged'),
+                        classification='discouraged',
+                        line_number=section_in_doc.get('line_number')
+                    ))
+
+        return SectionValidationResult(
+            issues=issues,
+            sections_checked=len(self.sections_spec),
+            sections_found=len(doc_sections)
+        )
+
+    def _get_document_sections(self, document: 'MarkdownDocument') -> List[Dict[str, Any]]:
+        """
+        Extract level-2 headings from document.
+
+        Args:
+            document: Parsed markdown document
+
+        Returns:
+            List of section dicts with an upper-cased ``name`` and a
+            ``line_number`` (None when the heading carries none)
+        """
+        if hasattr(document, 'get_headings_by_level'):
+            level_2_headings = document.get_headings_by_level(2)
+        elif hasattr(document, 'headings'):
+            # Only dict headings carry a level; other entries are skipped
+            # here instead of raising AttributeError on ``.get``.
+            level_2_headings = [
+                h for h in document.headings
+                if isinstance(h, dict) and h.get('level') == 2
+            ]
+        else:
+            # No known heading accessor: treat the document as sectionless.
+            level_2_headings = []
+
+        sections = []
+        for heading in level_2_headings or []:
+            if isinstance(heading, dict):
+                sections.append({
+                    # ``or ''`` tolerates a heading whose content is null.
+                    'name': (heading.get('content') or '').strip().upper(),
+                    'line_number': heading.get('line_number')
+                })
+            elif isinstance(heading, str):
+                sections.append({
+                    'name': heading.strip().upper(),
+                    'line_number': None
+                })
+
+        return sections
+
+    def _find_section(self, section_name: str, doc_sections: List[Dict[str, Any]],
+                     spec: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+        """
+        Find a section in document, checking alternatives.
+
+        Args:
+            section_name: Primary section name to find
+            doc_sections: List of sections in document
+            spec: Section specification with potential alternatives
+
+        Returns:
+            Section dict if found, None otherwise
+        """
+        # The primary name is tried first, then each alternative in order.
+        candidates = [section_name, *(spec.get('alternatives') or [])]
+
+        for candidate in candidates:
+            wanted = candidate.upper().strip()
+            for section in doc_sections:
+                if section['name'] == wanted:
+                    return section
+
+        return None