Implements semantic validation to complement existing structural validation: Phase 1 & 2 Complete: - SemanticValidator: Main validator orchestrating sub-validators - SectionValidator: Enforces section classifications (required, recommended, optional, discouraged, improper) from x-markitect-sections - ContentValidator: Validates content patterns, forbidden patterns, and quality metrics (word counts, sentence counts) from x-markitect-content-control Features: - Pattern matching with regex for required/forbidden/discouraged patterns - Word count and sentence count validation - Detailed error reporting with severity levels (ERROR, WARNING) - Support for section alternatives (e.g., FLAGS vs OPTIONS) - Comprehensive test coverage (16 tests, 100% passing) Architecture: - Complements existing SchemaValidator (structural AST validation) - Clean separation: validators/ package for modular validators - Semantic validation focuses on x-markitect-* extensions - LinkValidator planned for Phase 3 (optional --check-links) Next: Phase 4 - CLI integration to enhance 'markitect validate' command Workplan: roadmap/20260106-semantic-document-validation/WORKPLAN.md 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
317 lines
11 KiB
Python
317 lines
11 KiB
Python
"""
|
|
Content Validator for markdown documents.
|
|
|
|
Validates content against x-markitect-content-control rules:
|
|
- Required patterns: Regex patterns that must appear in content
|
|
- Discouraged patterns: Patterns that should be avoided (warnings)
|
|
- Forbidden patterns: Patterns that must not appear (errors)
|
|
- Quality metrics: Word counts, sentence counts, readability
|
|
"""
|
|
|
|
from dataclasses import dataclass
|
|
from typing import List, Dict, Any, Optional
|
|
import re
|
|
|
|
|
|
@dataclass
class ContentIssue:
    """A single finding produced while validating section content.

    Acts as the common parent of the more specific issue types and knows
    how to render itself as a one-line, human-readable report entry.
    """

    section_name: str
    severity: str  # one of 'ERROR', 'WARNING', 'INFO'
    message: str
    line_number: Optional[int] = None
    matched_text: Optional[str] = None

    def __str__(self) -> str:
        """Render as ``[SEVERITY] (line N) SECTION - message: 'match'``.

        The line and match fragments are left out when the corresponding
        attribute is unset (or otherwise falsy).
        """
        pieces = [f"[{self.severity}]"]
        if self.line_number:
            pieces.append(f" (line {self.line_number})")
        pieces.append(f" {self.section_name} - {self.message}")
        if self.matched_text:
            pieces.append(f": '{self.matched_text}'")
        return "".join(pieces)
|
|
|
|
|
|
@dataclass
class PatternMissing(ContentIssue):
    """Issue reported when a required pattern has no match in the content."""

    # The pattern that was expected to match
    pattern: str = ''
|
|
|
|
|
|
@dataclass
class ForbiddenPattern(ContentIssue):
    """Issue reported when a forbidden pattern matches the content."""

    # The pattern that matched although it must not appear
    pattern: str = ''
|
|
|
|
|
|
@dataclass
class DiscouragedPattern(ContentIssue):
    """Issue reported when a discouraged pattern matches the content."""

    # The pattern that matched and should be avoided
    pattern: str = ''
|
|
|
|
|
|
@dataclass
class ContentTooShort(ContentIssue):
    """Issue reported when content falls below a minimum word/sentence count."""

    # Count measured in the content
    actual: int = 0
    # Minimum count the content had to reach
    required: int = 0
|
|
|
|
|
|
@dataclass
class ContentTooLong(ContentIssue):
    """Issue reported when content goes above a maximum word/sentence count."""

    # Count measured in the content
    actual: int = 0
    # Maximum count the content was allowed to reach
    limit: int = 0
|
|
|
|
|
|
@dataclass
class ContentValidationResult:
    """Outcome of a content validation run.

    Bundles the collected issues with the ``sections_checked`` counter
    supplied by the validator, and offers severity-based queries on them.
    """

    issues: List[ContentIssue]
    sections_checked: int

    def has_errors(self) -> bool:
        """Return True when at least one issue has severity 'ERROR'."""
        for entry in self.issues:
            if entry.severity == 'ERROR':
                return True
        return False

    def has_warnings(self) -> bool:
        """Return True when at least one issue has severity 'WARNING'."""
        for entry in self.issues:
            if entry.severity == 'WARNING':
                return True
        return False

    def is_valid(self) -> bool:
        """Return True when validation passed, i.e. no 'ERROR' issue exists."""
        if self.has_errors():
            return False
        return True

    def get_errors(self) -> List[ContentIssue]:
        """Return the issues whose severity is 'ERROR', in original order."""
        return list(filter(lambda entry: entry.severity == 'ERROR', self.issues))

    def get_warnings(self) -> List[ContentIssue]:
        """Return the issues whose severity is 'WARNING', in original order."""
        return list(filter(lambda entry: entry.severity == 'WARNING', self.issues))
|
|
|
|
|
|
class ContentValidator:
    """
    Validates content against x-markitect-content-control rules.

    For every section named in the rules, runs the required, forbidden and
    discouraged pattern checks plus the word/sentence count checks, and
    collects the findings into a ContentValidationResult.
    """

    def __init__(self, schema: Dict[str, Any]):
        """
        Initialize validator with a schema.

        Args:
            schema: JSON schema with x-markitect-content-control extension
        """
        self.schema = schema
        # An explicit null extension is treated like a missing one
        self.content_rules = schema.get('x-markitect-content-control') or {}

    def check(self, document: 'MarkdownDocument') -> ContentValidationResult:
        """
        Validate content against schema rules.

        Args:
            document: Parsed markdown document

        Returns:
            ContentValidationResult with any issues found. Its
            ``sections_checked`` counts every rule entry, including entries
            whose section is absent from the document.
        """
        issues: List[ContentIssue] = []
        sections_checked = 0

        section_checks = (
            self._check_required_patterns,
            self._check_forbidden_patterns,
            self._check_discouraged_patterns,
            self._check_quality_metrics,
        )

        # Check each section that has content rules
        for section_key, rules in self.content_rules.items():
            # Incremented before the lookup on purpose: the counter has
            # always included rule entries whose section is missing.
            sections_checked += 1

            section = self._get_section(document, section_key)
            if not section:
                # Section validator handles missing sections
                continue

            # A null rules entry or a null section body must not crash the
            # run; both are treated as empty.
            rules = rules or {}
            section_content = section.get('content') or ''
            section_name = section.get('name', section_key)

            for run_check in section_checks:
                issues.extend(run_check(section_name, section_content, rules))

        return ContentValidationResult(
            issues=issues,
            sections_checked=sections_checked
        )

    def _get_section(self, document: 'MarkdownDocument',
                     section_key: str) -> Optional[Dict[str, Any]]:
        """
        Get a section from the document.

        Args:
            document: Parsed markdown document
            section_key: Section name (lowercase in rules, uppercase in document)

        Returns:
            Section dict with name and content, or None if not found
        """
        # Convert section_key to uppercase for matching
        section_name = section_key.upper()

        # Preferred path: the document can look the section up itself.
        # Its answer is returned as-is, even when it is None.
        if hasattr(document, 'get_section'):
            return document.get_section(section_name)

        # Fallback: search level-2 headings for a matching title
        if hasattr(document, 'get_headings_by_level'):
            for heading in document.get_headings_by_level(2):
                if not isinstance(heading, dict):
                    continue
                if heading.get('content', '').strip().upper() == section_name:
                    return {
                        'name': section_name,
                        'content': heading.get('text_content', '')
                    }

        return None

    def _invalid_pattern_issue(self, section_name: str, severity: str,
                               pattern: Any, error: Exception) -> ContentIssue:
        """Build the issue reported for a schema pattern that cannot be used.

        The offending pattern is appended to the message because the
        underlying error text alone does not identify it.
        """
        return ContentIssue(
            section_name=section_name,
            severity=severity,
            message=f'Invalid regex pattern in schema: {error} (pattern: {pattern!r})'
        )

    def _check_required_patterns(self, section_name: str, content: str,
                                 rules: Dict[str, Any]) -> List[ContentIssue]:
        """Check that all required patterns appear in content.

        Returns one PatternMissing (ERROR) per pattern without a match, and
        one ContentIssue (ERROR) per pattern that is not a usable regex.
        """
        issues: List[ContentIssue] = []

        for pattern in rules.get('required_patterns', []):
            try:
                found = re.search(pattern, content, re.MULTILINE)
            except (re.error, TypeError) as e:
                # re.error: malformed regex; TypeError: pattern is not a string
                issues.append(
                    self._invalid_pattern_issue(section_name, 'ERROR', pattern, e)
                )
                continue

            if not found:
                issues.append(PatternMissing(
                    section_name=section_name,
                    severity='ERROR',
                    message='Required pattern not found',
                    pattern=pattern
                ))

        return issues

    def _check_unwanted_patterns(self, section_name: str, content: str,
                                 patterns: List[Any], issue_type: type,
                                 severity: str, message: str) -> List[ContentIssue]:
        """Report every pattern in ``patterns`` that matches ``content``.

        Shared by the forbidden and discouraged checks, which differ only in
        the issue class, severity and message they report.
        """
        issues: List[ContentIssue] = []

        for pattern in patterns:
            try:
                match = re.search(pattern, content, re.MULTILINE)
            except (re.error, TypeError) as e:
                # re.error: malformed regex; TypeError: pattern is not a string
                issues.append(
                    self._invalid_pattern_issue(section_name, severity, pattern, e)
                )
                continue

            if match:
                issues.append(issue_type(
                    section_name=section_name,
                    severity=severity,
                    message=message,
                    pattern=pattern,
                    matched_text=match.group(0)[:50]  # Limit to 50 chars
                ))

        return issues

    def _check_forbidden_patterns(self, section_name: str, content: str,
                                  rules: Dict[str, Any]) -> List[ContentIssue]:
        """Check that no forbidden patterns appear in content (errors)."""
        return self._check_unwanted_patterns(
            section_name, content,
            rules.get('forbidden_patterns', []),
            ForbiddenPattern, 'ERROR', 'Forbidden pattern found'
        )

    def _check_discouraged_patterns(self, section_name: str, content: str,
                                    rules: Dict[str, Any]) -> List[ContentIssue]:
        """Check for discouraged patterns (warnings)."""
        return self._check_unwanted_patterns(
            section_name, content,
            rules.get('discouraged_patterns', []),
            DiscouragedPattern, 'WARNING', 'Discouraged pattern found'
        )

    def _check_quality_metrics(self, section_name: str, content: str,
                               rules: Dict[str, Any]) -> List[ContentIssue]:
        """Check content quality metrics (word count, sentence count).

        Reads ``min_words``, ``max_words`` and ``min_sentences`` from the
        ``content_quality`` entry of ``rules``; every violation is reported
        as a WARNING.
        """
        issues: List[ContentIssue] = []
        quality = rules.get('content_quality', {})

        if not quality:
            return issues

        # Word count validation (whitespace-separated tokens)
        word_count = len(content.split())

        min_words = quality.get('min_words')
        if min_words is not None and word_count < min_words:
            issues.append(ContentTooShort(
                section_name=section_name,
                severity='WARNING',
                message=f'Content too short ({word_count} words, minimum {min_words})',
                actual=word_count,
                required=min_words
            ))

        max_words = quality.get('max_words')
        if max_words is not None and word_count > max_words:
            issues.append(ContentTooLong(
                section_name=section_name,
                severity='WARNING',
                message=f'Content too long ({word_count} words, maximum {max_words})',
                actual=word_count,
                limit=max_words
            ))

        # Sentence count validation
        min_sentences = quality.get('min_sentences')
        if min_sentences is not None:
            # Simple sentence count: each run of . ! ? counts as one sentence
            sentence_count = len(re.findall(r'[.!?]+', content))

            if sentence_count < min_sentences:
                issues.append(ContentTooShort(
                    section_name=section_name,
                    severity='WARNING',
                    message=f'Too few sentences ({sentence_count}, minimum {min_sentences})',
                    actual=sentence_count,
                    required=min_sentences
                ))

        return issues
|