Files
markitect-main/markitect/validators/section_validator.py
tegwick a969c5de47 feat: add semantic document validator for x-markitect extensions
Implements semantic validation to complement existing structural validation:

Phase 1 & 2 Complete:
- SemanticValidator: Main validator orchestrating sub-validators
- SectionValidator: Enforces section classifications (required, recommended,
  optional, discouraged, improper) from x-markitect-sections
- ContentValidator: Validates content patterns, forbidden patterns, and
  quality metrics (word counts, sentence counts) from x-markitect-content-control

Features:
- Pattern matching with regex for required/forbidden/discouraged patterns
- Word count and sentence count validation
- Detailed error reporting with severity levels (ERROR, WARNING)
- Support for section alternatives (e.g., FLAGS vs OPTIONS)
- Comprehensive test coverage (16 tests, 100% passing)

Architecture:
- Complements existing SchemaValidator (structural AST validation)
- Clean separation: validators/ package for modular validators
- Semantic validation focuses on x-markitect-* extensions
- LinkValidator planned for Phase 3 (optional --check-links)

Next: Phase 4 - CLI integration to enhance 'markitect validate' command

Workplan: roadmap/20260106-semantic-document-validation/WORKPLAN.md

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-06 03:24:32 +01:00

227 lines
7.6 KiB
Python

"""
Section Validator for markdown documents.
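Validates that document sections comply with x-markitect-sections classifications
(the schema spells them in lowercase: 'required', 'recommended', 'optional',
'discouraged', 'improper'):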
- REQUIRED: Section must be present (ERROR if missing)
- RECOMMENDED: Section should be present (WARNING if missing)
- OPTIONAL: Section may be present (no check)
- DISCOURAGED: Section should not be present (WARNING if present)
- IMPROPER: Section must not be present (ERROR if present)
"""
from dataclasses import dataclass
from typing import List, Dict, Any, Optional
from pathlib import Path
@dataclass
class SectionIssue:
    """Base class for section validation issues.

    Carries the affected section, a severity level, a human-readable
    message, the classification that triggered the issue and, when known,
    the line on which the offending section starts.
    """

    section_name: str
    severity: str  # 'ERROR', 'WARNING', 'INFO'
    message: str
    classification: str  # 'required', 'recommended', etc.
    line_number: Optional[int] = None

    def __str__(self) -> str:
        """Render as ``[SEVERITY] (line N) SECTION: message``.

        The ``(line N)`` part is omitted only when no line number was
        recorded.
        """
        # Compare against None explicitly: a truthiness test would also hide
        # line 0, which a zero-based parser may legitimately report.
        location = (
            f" (line {self.line_number})" if self.line_number is not None else ""
        )
        return f"[{self.severity}]{location} {self.section_name}: {self.message}"
@dataclass
class SectionMissing(SectionIssue):
    """Issue raised when an expected section is absent from the document."""
@dataclass
class SectionImproper(SectionIssue):
    """Issue raised when the document contains a section classified as improper."""
@dataclass
class SectionDiscouraged(SectionIssue):
    """Issue raised when the document contains a section classified as discouraged."""
@dataclass
class SectionValidationResult:
    """Outcome of one section-validation pass over a document."""

    issues: List[SectionIssue]  # every issue recorded during the pass
    sections_checked: int
    sections_found: int

    def has_errors(self) -> bool:
        """Return True when at least one issue has severity 'ERROR'."""
        for entry in self.issues:
            if entry.severity == 'ERROR':
                return True
        return False

    def has_warnings(self) -> bool:
        """Return True when at least one issue has severity 'WARNING'."""
        for entry in self.issues:
            if entry.severity == 'WARNING':
                return True
        return False

    def is_valid(self) -> bool:
        """Return True when validation passed, i.e. no ERROR-level issue exists."""
        if self.has_errors():
            return False
        return True

    def get_errors(self) -> List[SectionIssue]:
        """Return the ERROR-level issues, in their original order."""
        return list(filter(lambda entry: entry.severity == 'ERROR', self.issues))

    def get_warnings(self) -> List[SectionIssue]:
        """Return the WARNING-level issues, in their original order."""
        return list(filter(lambda entry: entry.severity == 'WARNING', self.issues))
class SectionValidator:
    """
    Validates section presence and classification compliance.

    Checks that markdown documents have the correct sections based on
    x-markitect-sections classifications in the schema. Only level-2
    headings are considered, and section names are compared after
    stripping surrounding whitespace and upper-casing.
    """

    def __init__(self, schema: Dict[str, Any]):
        """
        Initialize validator with a schema.

        Args:
            schema: JSON schema with x-markitect-sections extension
        """
        self.schema = schema
        # `or {}` also covers an explicit null/empty extension value, which
        # would otherwise break the `.items()` iteration in check().
        self.sections_spec = schema.get('x-markitect-sections') or {}

    def check(self, document: 'MarkdownDocument') -> SectionValidationResult:
        """
        Validate section presence against schema classifications.

        Args:
            document: Parsed markdown document

        Returns:
            SectionValidationResult with any issues found, listed in the
            order of the schema's section specifications. Specifications
            whose classification is 'optional' (or anything unrecognised)
            produce no issue.
        """
        issues = []
        # Level-2 headings are treated as the document's main sections.
        doc_sections = self._get_document_sections(document)

        for section_name, spec in self.sections_spec.items():
            classification = self._classification_of(spec)
            found = self._find_section(section_name, doc_sections, spec)

            if classification == 'required':
                if found is None:
                    issues.append(SectionMissing(
                        section_name=section_name,
                        severity='ERROR',
                        message=spec.get('error_message', f'{section_name} section is required'),
                        classification='required',
                    ))
            elif classification == 'improper':
                if found is not None:
                    issues.append(SectionImproper(
                        section_name=section_name,
                        severity='ERROR',
                        message=spec.get('error_message', f'{section_name} section must not appear'),
                        classification='improper',
                        line_number=found.get('line_number'),
                    ))
            elif classification == 'recommended':
                if found is None:
                    issues.append(SectionMissing(
                        section_name=section_name,
                        severity='WARNING',
                        message=spec.get('warning_if_missing', f'{section_name} section is recommended'),
                        classification='recommended',
                    ))
            elif classification == 'discouraged':
                if found is not None:
                    issues.append(SectionDiscouraged(
                        section_name=section_name,
                        severity='WARNING',
                        message=spec.get('warning_if_present', f'{section_name} section is discouraged'),
                        classification='discouraged',
                        line_number=found.get('line_number'),
                    ))

        return SectionValidationResult(
            issues=issues,
            sections_checked=len(self.sections_spec),
            sections_found=len(doc_sections),
        )

    @staticmethod
    def _classification_of(spec: Dict[str, Any]) -> Optional[str]:
        """Return the spec's classification lower-cased, or None if it is not a string."""
        value = spec.get('classification')
        if isinstance(value, str):
            # Accept 'REQUIRED' / 'Required' as well as 'required'.
            return value.strip().lower()
        return None

    @staticmethod
    def _normalize_name(name: str) -> str:
        """Return the canonical form used to compare section names."""
        return name.strip().upper()

    @staticmethod
    def _heading_level(heading: Any) -> Any:
        """Return a heading's level from a dict key or an attribute, else None."""
        if isinstance(heading, dict):
            return heading.get('level')
        return getattr(heading, 'level', None)

    def _get_document_sections(self, document: 'MarkdownDocument') -> List[Dict[str, Any]]:
        """
        Extract level-2 headings from document.

        Args:
            document: Parsed markdown document. Either a
                ``get_headings_by_level`` method or a ``headings``
                attribute is used; a document with neither yields no
                sections.

        Returns:
            List of section dicts with ``name`` (normalized) and
            ``line_number`` (None when unknown)
        """
        if hasattr(document, 'get_headings_by_level'):
            level_2_headings = document.get_headings_by_level(2)
        elif hasattr(document, 'headings'):
            level_2_headings = [
                h for h in document.headings
                if self._heading_level(h) == 2
            ]
        else:
            level_2_headings = []

        sections = []
        # `or ()` tolerates an accessor that returns None instead of a list.
        for heading in level_2_headings or ():
            if isinstance(heading, str):
                raw_name, line_number = heading, None
            elif isinstance(heading, dict):
                raw_name = heading.get('content', '')
                line_number = heading.get('line_number')
            else:
                # Heading objects: the attribute names mirror the dict keys.
                # Objects exposing neither are skipped, as before.
                raw_name = getattr(heading, 'content', None)
                line_number = getattr(heading, 'line_number', None)

            if not isinstance(raw_name, str):
                # No textual title (e.g. content is None): nothing to match.
                continue

            sections.append({
                'name': self._normalize_name(raw_name),
                'line_number': line_number,
            })
        return sections

    def _find_section(self, section_name: str, doc_sections: List[Dict[str, Any]],
                      spec: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """
        Find a section in document, checking alternatives.

        The primary name takes precedence over the alternatives, which are
        tried in their listed order; for a repeated heading the first
        occurrence in the document is returned.

        Args:
            section_name: Primary section name to find
            doc_sections: List of sections in document
            spec: Section specification with potential alternatives

        Returns:
            Section dict if found, None otherwise
        """
        alternatives = spec.get('alternatives') or []
        if isinstance(alternatives, str):
            # A bare string is one alternative, not a sequence of characters.
            alternatives = [alternatives]

        # Index the document once; setdefault keeps the first occurrence.
        first_by_name: Dict[str, Dict[str, Any]] = {}
        for section in doc_sections:
            first_by_name.setdefault(section['name'], section)

        for candidate in [section_name, *alternatives]:
            if not isinstance(candidate, str):
                continue
            match = first_by_name.get(self._normalize_name(candidate))
            if match is not None:
                return match
        return None