""" Content Validator for markdown documents. Validates content against x-markitect-content-control rules: - Required patterns: Regex patterns that must appear in content - Discouraged patterns: Patterns that should be avoided (warnings) - Forbidden patterns: Patterns that must not appear (errors) - Quality metrics: Word counts, sentence counts, readability """ from dataclasses import dataclass from typing import List, Dict, Any, Optional import re @dataclass class ContentIssue: """Base class for content validation issues.""" section_name: str severity: str # 'ERROR', 'WARNING', 'INFO' message: str line_number: Optional[int] = None matched_text: Optional[str] = None def __str__(self) -> str: location = f" (line {self.line_number})" if self.line_number else "" match_info = f": '{self.matched_text}'" if self.matched_text else "" return f"[{self.severity}]{location} {self.section_name} - {self.message}{match_info}" @dataclass class PatternMissing(ContentIssue): """Required pattern not found in content.""" pattern: str = "" @dataclass class ForbiddenPattern(ContentIssue): """Forbidden pattern found in content.""" pattern: str = "" @dataclass class DiscouragedPattern(ContentIssue): """Discouraged pattern found in content.""" pattern: str = "" @dataclass class ContentTooShort(ContentIssue): """Content does not meet minimum word/sentence count.""" actual: int = 0 required: int = 0 @dataclass class ContentTooLong(ContentIssue): """Content exceeds maximum word/sentence count.""" actual: int = 0 limit: int = 0 @dataclass class ContentValidationResult: """Result of content validation.""" issues: List[ContentIssue] sections_checked: int def has_errors(self) -> bool: """Check if there are any ERROR-level issues.""" return any(issue.severity == 'ERROR' for issue in self.issues) def has_warnings(self) -> bool: """Check if there are any WARNING-level issues.""" return any(issue.severity == 'WARNING' for issue in self.issues) def is_valid(self) -> bool: """Check if validation passed (no errors).""" return not self.has_errors() def get_errors(self) -> List[ContentIssue]: """Get all ERROR-level issues.""" return [issue for issue in self.issues if issue.severity == 'ERROR'] def get_warnings(self) -> List[ContentIssue]: """Get all WARNING-level issues.""" return [issue for issue in self.issues if issue.severity == 'WARNING'] class ContentValidator: """ Validates content against x-markitect-content-control rules. Checks content patterns, quality metrics, and readability for each section. """ def __init__(self, schema: Dict[str, Any]): """ Initialize validator with a schema. Args: schema: JSON schema with x-markitect-content-control extension """ self.schema = schema self.content_rules = schema.get('x-markitect-content-control', {}) def check(self, document: 'MarkdownDocument') -> ContentValidationResult: """ Validate content against schema rules. Args: document: Parsed markdown document Returns: ContentValidationResult with any issues found """ issues = [] sections_checked = 0 # Check each section that has content rules for section_key, rules in self.content_rules.items(): sections_checked += 1 # Get section from document section = self._get_section(document, section_key) if not section: # Section validator handles missing sections continue section_content = section.get('content', '') section_name = section.get('name', section_key) # Check required patterns issues.extend(self._check_required_patterns( section_name, section_content, rules )) # Check forbidden patterns issues.extend(self._check_forbidden_patterns( section_name, section_content, rules )) # Check discouraged patterns issues.extend(self._check_discouraged_patterns( section_name, section_content, rules )) # Check content quality metrics issues.extend(self._check_quality_metrics( section_name, section_content, rules )) return ContentValidationResult( issues=issues, sections_checked=sections_checked ) def _get_section(self, document: 'MarkdownDocument', section_key: str) -> Optional[Dict[str, Any]]: """ Get a section from the document. Args: document: Parsed markdown document section_key: Section name (lowercase in rules, uppercase in document) Returns: Section dict with name and content, or None if not found """ # Convert section_key to uppercase for matching section_name = section_key.upper() # Try to get section content if hasattr(document, 'get_section'): return document.get_section(section_name) # Fallback: search headings if hasattr(document, 'get_headings_by_level'): headings = document.get_headings_by_level(2) for heading in headings: if isinstance(heading, dict): if heading.get('content', '').strip().upper() == section_name: # Found the section, need to extract content return { 'name': section_name, 'content': heading.get('text_content', '') } return None def _check_required_patterns(self, section_name: str, content: str, rules: Dict[str, Any]) -> List[ContentIssue]: """Check that all required patterns appear in content.""" issues = [] required_patterns = rules.get('required_patterns', []) for pattern in required_patterns: try: if not re.search(pattern, content, re.MULTILINE): issues.append(PatternMissing( section_name=section_name, severity='ERROR', message=f'Required pattern not found', pattern=pattern )) except re.error as e: # Invalid regex pattern in schema issues.append(ContentIssue( section_name=section_name, severity='ERROR', message=f'Invalid regex pattern in schema: {e}' )) return issues def _check_forbidden_patterns(self, section_name: str, content: str, rules: Dict[str, Any]) -> List[ContentIssue]: """Check that no forbidden patterns appear in content.""" issues = [] forbidden_patterns = rules.get('forbidden_patterns', []) for pattern in forbidden_patterns: try: match = re.search(pattern, content, re.MULTILINE) if match: issues.append(ForbiddenPattern( section_name=section_name, severity='ERROR', message=f'Forbidden pattern found', pattern=pattern, matched_text=match.group(0)[:50] # Limit to 50 chars )) except re.error as e: issues.append(ContentIssue( section_name=section_name, severity='ERROR', message=f'Invalid regex pattern in schema: {e}' )) return issues def _check_discouraged_patterns(self, section_name: str, content: str, rules: Dict[str, Any]) -> List[ContentIssue]: """Check for discouraged patterns (warnings).""" issues = [] discouraged_patterns = rules.get('discouraged_patterns', []) for pattern in discouraged_patterns: try: match = re.search(pattern, content, re.MULTILINE) if match: issues.append(DiscouragedPattern( section_name=section_name, severity='WARNING', message=f'Discouraged pattern found', pattern=pattern, matched_text=match.group(0)[:50] )) except re.error as e: issues.append(ContentIssue( section_name=section_name, severity='WARNING', message=f'Invalid regex pattern in schema: {e}' )) return issues def _check_quality_metrics(self, section_name: str, content: str, rules: Dict[str, Any]) -> List[ContentIssue]: """Check content quality metrics (word count, sentence count).""" issues = [] quality = rules.get('content_quality', {}) if not quality: return issues # Word count validation word_count = len(content.split()) min_words = quality.get('min_words') if min_words is not None and word_count < min_words: issues.append(ContentTooShort( section_name=section_name, severity='WARNING', message=f'Content too short ({word_count} words, minimum {min_words})', actual=word_count, required=min_words )) max_words = quality.get('max_words') if max_words is not None and word_count > max_words: issues.append(ContentTooLong( section_name=section_name, severity='WARNING', message=f'Content too long ({word_count} words, maximum {max_words})', actual=word_count, limit=max_words )) # Sentence count validation min_sentences = quality.get('min_sentences') if min_sentences is not None: # Simple sentence count (split by .!?) sentence_count = len(re.findall(r'[.!?]+', content)) if sentence_count < min_sentences: issues.append(ContentTooShort( section_name=section_name, severity='WARNING', message=f'Too few sentences ({sentence_count}, minimum {min_sentences})', actual=sentence_count, required=min_sentences )) return issues