Files
markitect-main/markitect/validators/content_validator.py
tegwick a969c5de47 feat: add semantic document validator for x-markitect extensions
Implements semantic validation to complement existing structural validation:

Phase 1 & 2 Complete:
- SemanticValidator: Main validator orchestrating sub-validators
- SectionValidator: Enforces section classifications (required, recommended,
  optional, discouraged, improper) from x-markitect-sections
- ContentValidator: Validates content patterns, forbidden patterns, and
  quality metrics (word counts, sentence counts) from x-markitect-content-control

Features:
- Pattern matching with regex for required/forbidden/discouraged patterns
- Word count and sentence count validation
- Detailed error reporting with severity levels (ERROR, WARNING)
- Support for section alternatives (e.g., FLAGS vs OPTIONS)
- Comprehensive test coverage (16 tests, 100% passing)

Architecture:
- Complements existing SchemaValidator (structural AST validation)
- Clean separation: validators/ package for modular validators
- Semantic validation focuses on x-markitect-* extensions
- LinkValidator planned for Phase 3 (optional --check-links)

Next: Phase 4 - CLI integration to enhance 'markitect validate' command

Workplan: roadmap/20260106-semantic-document-validation/WORKPLAN.md

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-06 03:24:32 +01:00

317 lines
11 KiB
Python

"""
Content Validator for markdown documents.
Validates content against x-markitect-content-control rules:
- Required patterns: Regex patterns that must appear in content
- Discouraged patterns: Patterns that should be avoided (warnings)
- Forbidden patterns: Patterns that must not appear (errors)
- Quality metrics: Word counts and sentence counts (readability checks are not implemented yet)
"""
from dataclasses import dataclass
from typing import List, Dict, Any, Optional
import re
@dataclass
class ContentIssue:
    """Base class for content validation issues.

    Attributes:
        section_name: Name of the section the issue belongs to.
        severity: Severity label: 'ERROR', 'WARNING' or 'INFO'.
        message: Human-readable description of the issue.
        line_number: Line shown in the formatted message when set.
        matched_text: Text shown in the formatted message when non-empty.
    """

    section_name: str
    severity: str  # 'ERROR', 'WARNING', 'INFO'
    message: str
    line_number: Optional[int] = None
    matched_text: Optional[str] = None

    def __str__(self) -> str:
        """Format as ``[SEVERITY] (line N) SECTION - message: 'match'``."""
        # Compare against None instead of relying on truthiness: a line
        # number of 0 is a real value and must not be dropped.
        location = (
            f" (line {self.line_number})"
            if self.line_number is not None else ""
        )
        # An empty match adds no information, so truthiness is intended here.
        match_info = f": '{self.matched_text}'" if self.matched_text else ""
        return (
            f"[{self.severity}]{location} {self.section_name} - "
            f"{self.message}{match_info}"
        )
@dataclass
class PatternMissing(ContentIssue):
    """Required pattern not found in content."""

    # The required regex that produced no match.
    pattern: str = ""

    def __str__(self) -> str:
        """Format like the base class, then name the missing pattern.

        A missing pattern has no matched text to display, so without the
        pattern itself the report cannot tell the reader which requirement
        was violated.
        """
        base = super().__str__()
        if not self.pattern:
            return base
        return f"{base} (pattern: '{self.pattern}')"
@dataclass
class ForbiddenPattern(ContentIssue):
    """Issue reporting that a forbidden pattern matched the content."""

    # The forbidden regex this issue refers to.
    pattern: str = ""
@dataclass
class DiscouragedPattern(ContentIssue):
    """Issue reporting that a discouraged pattern matched the content."""

    # The discouraged regex this issue refers to.
    pattern: str = ""
@dataclass
class ContentTooShort(ContentIssue):
    """Issue reporting content below a minimum word or sentence count."""

    # Count measured in the content.
    actual: int = 0
    # Minimum count the content was expected to reach.
    required: int = 0
@dataclass
class ContentTooLong(ContentIssue):
    """Issue reporting content above a maximum word or sentence count."""

    # Count measured in the content.
    actual: int = 0
    # Maximum count the content was allowed to reach.
    limit: int = 0
@dataclass
class ContentValidationResult:
    """Outcome of a content validation run.

    Attributes:
        issues: Every issue collected during validation.
        sections_checked: Section count reported by the validator that
            produced this result.
    """

    issues: List[ContentIssue]
    sections_checked: int

    def has_errors(self) -> bool:
        """Return True when at least one issue has severity 'ERROR'."""
        for entry in self.issues:
            if entry.severity == 'ERROR':
                return True
        return False

    def has_warnings(self) -> bool:
        """Return True when at least one issue has severity 'WARNING'."""
        for entry in self.issues:
            if entry.severity == 'WARNING':
                return True
        return False

    def is_valid(self) -> bool:
        """Return True when validation produced no ERROR-level issue."""
        if self.has_errors():
            return False
        return True

    def get_errors(self) -> List[ContentIssue]:
        """Return the ERROR-level issues, in their original order."""
        return list(filter(lambda entry: entry.severity == 'ERROR', self.issues))

    def get_warnings(self) -> List[ContentIssue]:
        """Return the WARNING-level issues, in their original order."""
        return list(filter(lambda entry: entry.severity == 'WARNING', self.issues))
class ContentValidator:
    """
    Validates content against x-markitect-content-control rules.

    For each section named in the rules, checks required, forbidden and
    discouraged regex patterns as well as word and sentence count limits.
    """

    def __init__(self, schema: Dict[str, Any]):
        """
        Initialize validator with a schema.

        Args:
            schema: JSON schema with x-markitect-content-control extension
        """
        self.schema = schema
        # `or {}` also covers an explicit null in the schema, which would
        # otherwise make check() fail when iterating the rules.
        self.content_rules = schema.get('x-markitect-content-control') or {}

    def check(self, document: 'MarkdownDocument') -> ContentValidationResult:
        """
        Validate content against schema rules.

        Args:
            document: Parsed markdown document

        Returns:
            ContentValidationResult with any issues found; its
            sections_checked counts only sections that were found in the
            document and therefore actually validated.
        """
        issues: List[ContentIssue] = []
        sections_checked = 0

        # Check each section that has content rules
        for section_key, rules in self.content_rules.items():
            section = self._get_section(document, section_key)
            if not section:
                # Section validator handles missing sections
                continue

            # Count after the lookup so skipped sections are not reported
            # as checked.
            sections_checked += 1

            # A present-but-null content value is treated as empty text so
            # the regex and word-count checks below cannot fail on None.
            section_content = section.get('content') or ''
            section_name = section.get('name', section_key)

            issues.extend(self._check_required_patterns(
                section_name, section_content, rules
            ))
            issues.extend(self._check_forbidden_patterns(
                section_name, section_content, rules
            ))
            issues.extend(self._check_discouraged_patterns(
                section_name, section_content, rules
            ))
            issues.extend(self._check_quality_metrics(
                section_name, section_content, rules
            ))

        return ContentValidationResult(
            issues=issues,
            sections_checked=sections_checked
        )

    def _get_section(self, document: 'MarkdownDocument',
                     section_key: str) -> Optional[Dict[str, Any]]:
        """
        Get a section from the document.

        Args:
            document: Parsed markdown document
            section_key: Section name as written in the rules; it is
                upper-cased before being matched against the document

        Returns:
            Section dict with name and content, or None if not found
        """
        section_name = section_key.upper()

        # Preferred path: the document can look the section up itself.
        if hasattr(document, 'get_section'):
            return document.get_section(section_name)

        # Fallback: search level-2 headings for a matching title.
        if hasattr(document, 'get_headings_by_level'):
            for heading in document.get_headings_by_level(2):
                if not isinstance(heading, dict):
                    continue
                # Guard against a heading whose 'content' is present but None.
                heading_text = heading.get('content') or ''
                if heading_text.strip().upper() == section_name:
                    return {
                        'name': section_name,
                        'content': heading.get('text_content', '')
                    }
        return None

    @staticmethod
    def _invalid_pattern_issue(section_name: str, severity: str,
                               error: Exception) -> ContentIssue:
        """Build the issue reported when a schema pattern cannot be used."""
        return ContentIssue(
            section_name=section_name,
            severity=severity,
            message=f'Invalid regex pattern in schema: {error}'
        )

    def _report_matches(self, section_name: str, content: str,
                        patterns: Optional[List[Any]], issue_type: type,
                        severity: str, message: str) -> List[ContentIssue]:
        """Report every pattern in ``patterns`` that matches ``content``.

        Shared by the forbidden and discouraged checks, which differ only in
        issue class, severity and message.
        """
        issues: List[ContentIssue] = []
        # `or []` tolerates a pattern list that is explicitly null.
        for pattern in patterns or []:
            try:
                match = re.search(pattern, content, re.MULTILINE)
            except (re.error, TypeError) as e:
                # re.error: malformed regex; TypeError: non-string pattern.
                issues.append(
                    self._invalid_pattern_issue(section_name, severity, e)
                )
                continue
            if match:
                issues.append(issue_type(
                    section_name=section_name,
                    severity=severity,
                    message=message,
                    pattern=pattern,
                    matched_text=match.group(0)[:50]  # Limit to 50 chars
                ))
        return issues

    def _check_required_patterns(self, section_name: str, content: str,
                                 rules: Dict[str, Any]) -> List[ContentIssue]:
        """Check that all required patterns appear in content.

        Returns one ERROR issue per required pattern that does not match,
        and one ERROR issue per pattern that is not a usable regex.
        """
        issues: List[ContentIssue] = []
        for pattern in rules.get('required_patterns') or []:
            try:
                found = re.search(pattern, content, re.MULTILINE)
            except (re.error, TypeError) as e:
                # Invalid regex pattern in schema
                issues.append(
                    self._invalid_pattern_issue(section_name, 'ERROR', e)
                )
                continue
            if not found:
                issues.append(PatternMissing(
                    section_name=section_name,
                    severity='ERROR',
                    message='Required pattern not found',
                    pattern=pattern
                ))
        return issues

    def _check_forbidden_patterns(self, section_name: str, content: str,
                                  rules: Dict[str, Any]) -> List[ContentIssue]:
        """Check that no forbidden patterns appear in content (errors)."""
        return self._report_matches(
            section_name, content, rules.get('forbidden_patterns'),
            ForbiddenPattern, 'ERROR', 'Forbidden pattern found'
        )

    def _check_discouraged_patterns(self, section_name: str, content: str,
                                    rules: Dict[str, Any]) -> List[ContentIssue]:
        """Check for discouraged patterns (warnings)."""
        return self._report_matches(
            section_name, content, rules.get('discouraged_patterns'),
            DiscouragedPattern, 'WARNING', 'Discouraged pattern found'
        )

    def _check_quality_metrics(self, section_name: str, content: str,
                               rules: Dict[str, Any]) -> List[ContentIssue]:
        """Check content quality metrics (word count, sentence count).

        Reads min_words, max_words, min_sentences and max_sentences from
        the rules' content_quality mapping; every violation is a WARNING.
        """
        issues: List[ContentIssue] = []
        quality = rules.get('content_quality', {})
        if not quality:
            return issues

        # Word count validation (whitespace-separated tokens)
        word_count = len(content.split())

        min_words = quality.get('min_words')
        if min_words is not None and word_count < min_words:
            issues.append(ContentTooShort(
                section_name=section_name,
                severity='WARNING',
                message=f'Content too short ({word_count} words, minimum {min_words})',
                actual=word_count,
                required=min_words
            ))

        max_words = quality.get('max_words')
        if max_words is not None and word_count > max_words:
            issues.append(ContentTooLong(
                section_name=section_name,
                severity='WARNING',
                message=f'Content too long ({word_count} words, maximum {max_words})',
                actual=word_count,
                limit=max_words
            ))

        # Sentence count validation
        min_sentences = quality.get('min_sentences')
        max_sentences = quality.get('max_sentences')
        if min_sentences is not None or max_sentences is not None:
            # Simple heuristic: each run of . ! or ? ends one sentence, so
            # abbreviations and decimal numbers are counted as well.
            sentence_count = len(re.findall(r'[.!?]+', content))

            if min_sentences is not None and sentence_count < min_sentences:
                issues.append(ContentTooShort(
                    section_name=section_name,
                    severity='WARNING',
                    message=f'Too few sentences ({sentence_count}, minimum {min_sentences})',
                    actual=sentence_count,
                    required=min_sentences
                ))

            # Upper bound, mirroring max_words.
            if max_sentences is not None and sentence_count > max_sentences:
                issues.append(ContentTooLong(
                    section_name=section_name,
                    severity='WARNING',
                    message=f'Too many sentences ({sentence_count}, maximum {max_sentences})',
                    actual=sentence_count,
                    limit=max_sentences
                ))

        return issues