feat: add semantic document validator for x-markitect extensions

Implements semantic validation to complement existing structural validation: Phase 1 & 2 Complete: - SemanticValidator: Main validator orchestrating sub-validators - SectionValidator: Enforces section classifications (required, recommended, optional, discouraged, improper) from x-markitect-sections - ContentValidator: Validates content patterns, forbidden patterns, and quality metrics (word counts, sentence counts) from x-markitect-content-control Features: - Pattern matching with regex for required/forbidden/discouraged patterns - Word count and sentence count validation - Detailed error reporting with severity levels (ERROR, WARNING) - Support for section alternatives (e.g., FLAGS vs OPTIONS) - Comprehensive test coverage (16 tests, 100% passing) Architecture: - Complements existing SchemaValidator (structural AST validation) - Clean separation: validators/ package for modular validators - Semantic validation focuses on x-markitect-* extensions - LinkValidator planned for Phase 3 (optional --check-links) Next: Phase 4 - CLI integration to enhance 'markitect validate' command Workplan: roadmap/20260106-semantic-document-validation/WORKPLAN.md 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-06 03:24:32 +01:00
parent f27eea6b5b
commit a969c5de47
6 changed files with 1932 additions and 0 deletions
--- a/markitect/validators/section_validator.py
+++ b/markitect/validators/section_validator.py
@@ -0,0 +1,226 @@
+"""
+Section Validator for markdown documents.
+
+Validates that document sections comply with x-markitect-sections classifications:
+- REQUIRED: Section must be present (ERROR if missing)
+- RECOMMENDED: Section should be present (WARNING if missing)
+- OPTIONAL: Section may be present (no check)
+- DISCOURAGED: Section should not be present (WARNING if present)
+- IMPROPER: Section must not be present (ERROR if present)
+"""
+
+from dataclasses import dataclass
+from typing import List, Dict, Any, Optional
+from pathlib import Path
+
+
+@dataclass
+class SectionIssue:
+    """Base class for section validation issues; str() renders one report line."""
+    section_name: str
+    severity: str  # 'ERROR', 'WARNING', 'INFO'
+    message: str
+    classification: str  # 'required', 'recommended', etc.
+    line_number: Optional[int] = None  # None means the location is unknown
+
+    def __str__(self) -> str:  # format: "[SEVERITY] (line N) name: message"
+        where = "" if self.line_number is None else f" (line {self.line_number})"  # keep line 0
+        return f"[{self.severity}]{where} {self.section_name}: {self.message}"
+
+
+@dataclass
+class SectionMissing(SectionIssue):
+    """Issue for a section that is absent from the document."""
+    # Adds no fields; the subclass only tags the kind of finding.
+
+
+@dataclass
+class SectionImproper(SectionIssue):
+    """Issue for an improper section that is present in the document."""
+    # Adds no fields; the subclass only tags the kind of finding.
+
+
+@dataclass
+class SectionDiscouraged(SectionIssue):
+    """Issue for a discouraged section that is present in the document."""
+    # Adds no fields; the subclass only tags the kind of finding.
+
+
+@dataclass
+class SectionValidationResult:
+    """Outcome of a section check: the issues found plus two counters."""
+    issues: List[SectionIssue]
+    sections_checked: int
+    sections_found: int
+
+    def has_errors(self) -> bool:
+        """Return True when at least one issue has severity 'ERROR'."""
+        return 'ERROR' in (found.severity for found in self.issues)
+
+    def has_warnings(self) -> bool:
+        """Return True when at least one issue has severity 'WARNING'."""
+        return 'WARNING' in (found.severity for found in self.issues)
+
+    def is_valid(self) -> bool:
+        """Return True when has_errors() is False; warnings do not count."""
+        return not self.has_errors()
+
+    def get_errors(self) -> List[SectionIssue]:
+        """Return the issues with severity 'ERROR', in their stored order."""
+        return list(filter(lambda found: found.severity == 'ERROR', self.issues))
+
+    def get_warnings(self) -> List[SectionIssue]:
+        """Return the issues with severity 'WARNING', in their stored order."""
+        return list(filter(lambda found: found.severity == 'WARNING', self.issues))
+
+
+class SectionValidator:
+    """
+    Validates section presence and classification compliance.
+
+    Compares the level-2 headings of a markdown document with the
+    x-markitect-sections classifications in the schema. Section names and
+    classification values are matched case-insensitively.
+
+    Classification handling:
+    - required / recommended: an issue is raised when the section is missing
+    - improper / discouraged: an issue is raised when the section is present
+    - optional or unrecognized: never reported
+    """
+
+    def __init__(self, schema: Dict[str, Any]):
+        """
+        Initialize validator with a schema.
+
+        Args:
+            schema: JSON schema with x-markitect-sections extension; a
+                missing or null extension means there is nothing to check
+        """
+        self.schema = schema
+        # `or {}` also covers an extension key that is present but null.
+        self.sections_spec = schema.get('x-markitect-sections') or {}
+
+    def check(self, document: 'MarkdownDocument') -> SectionValidationResult:
+        """
+        Validate section presence against schema classifications.
+
+        Each issue message is taken from the spec ('error_message',
+        'warning_if_missing' or 'warning_if_present') when provided;
+        otherwise a default message naming the section is used.
+
+        Args:
+            document: Parsed markdown document
+
+        Returns:
+            SectionValidationResult with any issues found
+        """
+        # classification -> (fires when the section is present?, issue type,
+        #                    severity, spec key holding the message, default text)
+        rules = {
+            'required': (False, SectionMissing, 'ERROR', 'error_message', 'is required'),
+            'improper': (True, SectionImproper, 'ERROR', 'error_message', 'must not appear'),
+            'recommended': (False, SectionMissing, 'WARNING', 'warning_if_missing', 'is recommended'),
+            'discouraged': (True, SectionDiscouraged, 'WARNING', 'warning_if_present', 'is discouraged'),
+        }
+
+        issues = []
+        # Level-2 headings (main sections) found in the document
+        doc_sections = self._get_document_sections(document)
+
+        for section_name, spec in self.sections_spec.items():
+            # Normalized so that 'REQUIRED' and 'required' are treated alike.
+            classification = str(spec.get('classification') or '').strip().lower()
+            rule = rules.get(classification)
+            if rule is None:
+                continue  # 'optional' and unknown classifications are not checked
+            when_present, issue_type, severity, message_key, default_text = rule
+            section_in_doc = self._find_section(section_name, doc_sections, spec)
+            # A rule fires in exactly one presence state; skip the other one.
+            if (section_in_doc is not None) != when_present:
+                continue
+            issues.append(issue_type(
+                section_name=section_name,
+                severity=severity,
+                message=spec.get(message_key, f'{section_name} section {default_text}'),
+                classification=classification,
+                line_number=section_in_doc.get('line_number') if when_present else None
+            ))
+
+        return SectionValidationResult(
+            issues=issues,
+            sections_checked=len(self.sections_spec),
+            sections_found=len(doc_sections)
+        )
+
+    def _get_document_sections(self, document: 'MarkdownDocument') -> List[Dict[str, Any]]:
+        """
+        Extract level-2 headings from document.
+
+        Uses document.get_headings_by_level(2) when available, otherwise
+        filters document.headings; a document offering neither yields no
+        sections. Only dict and str headings are converted.
+
+        Args:
+            document: Parsed markdown document
+
+        Returns:
+            List of section dicts with upper-cased name and line_number
+        """
+        if hasattr(document, 'get_headings_by_level'):
+            level_2_headings = document.get_headings_by_level(2)
+        elif hasattr(document, 'headings'):
+            # Non-dict entries carry no level information, so they are skipped.
+            level_2_headings = [
+                h for h in (document.headings or [])
+                if isinstance(h, dict) and h.get('level') == 2
+            ]
+        else:
+            # No known heading accessor: report no sections.
+            level_2_headings = []
+
+        sections = []
+        for heading in level_2_headings or []:
+            if isinstance(heading, dict):
+                # `or ''` guards against a heading whose content is None.
+                sections.append({
+                    'name': str(heading.get('content') or '').strip().upper(),
+                    'line_number': heading.get('line_number')
+                })
+            elif isinstance(heading, str):
+                # Plain string headings carry no line information.
+                sections.append({
+                    'name': heading.strip().upper(),
+                    'line_number': None
+                })
+
+        return sections
+
+    def _find_section(self, section_name: str, doc_sections: List[Dict[str, Any]],
+                     spec: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+        """
+        Find a section in document, checking alternatives.
+
+        Names are compared upper-cased and stripped. 'alternatives' may be a
+        list, a single string, or absent/null.
+
+        Args:
+            section_name: Primary section name to find
+            doc_sections: List of sections in document
+            spec: Section specification with potential alternatives
+
+        Returns:
+            Section dict if found, None otherwise
+        """
+        alternatives = spec.get('alternatives') or []
+        if isinstance(alternatives, str):
+            # A bare string is one alternative, not a sequence of characters.
+            alternatives = [alternatives]
+
+        # The primary name takes precedence, then alternatives in listed order.
+        for candidate in [section_name, *alternatives]:
+            normalized = str(candidate).upper().strip()
+            for section in doc_sections:
+                if section['name'] == normalized:
+                    return section
+
+        return None