feat: add semantic document validator for x-markitect extensions

Implements semantic validation to complement existing structural validation:

Phase 1 & 2 Complete:
- SemanticValidator: Main validator orchestrating sub-validators
- SectionValidator: Enforces section classifications (required, recommended,
  optional, discouraged, improper) from x-markitect-sections
- ContentValidator: Validates content patterns, forbidden patterns, and
  quality metrics (word counts, sentence counts) from x-markitect-content-control

Features:
- Pattern matching with regex for required/forbidden/discouraged patterns
- Word count and sentence count validation
- Detailed error reporting with severity levels (ERROR, WARNING)
- Support for section alternatives (e.g., FLAGS vs OPTIONS)
- Comprehensive test coverage (16 tests, 100% passing)

Architecture:
- Complements existing SchemaValidator (structural AST validation)
- Clean separation: validators/ package for modular validators
- Semantic validation focuses on x-markitect-* extensions
- LinkValidator planned for Phase 3 (optional --check-links)

Next: Phase 4 - CLI integration to enhance 'markitect validate' command

Workplan: roadmap/20260106-semantic-document-validation/WORKPLAN.md

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-06 03:24:32 +01:00
parent f27eea6b5b
commit a969c5de47
6 changed files with 1932 additions and 0 deletions

View File

@@ -0,0 +1,50 @@
"""
Validators package for semantic document validation.
This package contains validators that check markdown documents against
x-markitect schema extensions (sections, content-control, link validation).
Validators:
- SectionValidator: Validates section presence based on classifications
- ContentValidator: Validates content patterns and quality metrics
- LinkValidator: Validates internal and external links (planned; not yet exported from this package)
"""
from markitect.validators.section_validator import (
SectionValidator,
SectionValidationResult,
SectionIssue,
SectionMissing,
SectionImproper,
SectionDiscouraged,
)
from markitect.validators.content_validator import (
ContentValidator,
ContentValidationResult,
ContentIssue,
PatternMissing,
ForbiddenPattern,
DiscouragedPattern,
ContentTooShort,
ContentTooLong,
)
# Names re-exported as the public API of the validators package; every entry
# is imported above from section_validator or content_validator.
__all__ = [
    # Section validator
    'SectionValidator',
    'SectionValidationResult',
    'SectionIssue',
    'SectionMissing',
    'SectionImproper',
    'SectionDiscouraged',
    # Content validator
    'ContentValidator',
    'ContentValidationResult',
    'ContentIssue',
    'PatternMissing',
    'ForbiddenPattern',
    'DiscouragedPattern',
    'ContentTooShort',
    'ContentTooLong',
]

View File

@@ -0,0 +1,316 @@
"""
Content Validator for markdown documents.
Validates content against x-markitect-content-control rules:
- Required patterns: Regex patterns that must appear in content
- Discouraged patterns: Patterns that should be avoided (warnings)
- Forbidden patterns: Patterns that must not appear (errors)
- Quality metrics: Word counts and sentence counts (readability checks are not implemented yet)
"""
from dataclasses import dataclass
from typing import List, Dict, Any, Optional
import re
@dataclass
class ContentIssue:
    """Record describing one problem found in a section's content.

    ``str()`` renders the issue as
    ``[SEVERITY] (line N) SECTION - message: 'matched text'``; the line part
    and the matched-text part are omitted when the corresponding field is
    unset (or otherwise falsy).
    """

    section_name: str
    severity: str  # 'ERROR', 'WARNING', 'INFO'
    message: str
    line_number: Optional[int] = None
    matched_text: Optional[str] = None

    def __str__(self) -> str:
        """Return the one-line, human-readable form of this issue."""
        pieces = [f"[{self.severity}]"]
        if self.line_number:
            pieces.append(f" (line {self.line_number})")
        pieces.append(f" {self.section_name} - {self.message}")
        if self.matched_text:
            pieces.append(f": '{self.matched_text}'")
        return "".join(pieces)
@dataclass
class PatternMissing(ContentIssue):
    """Issue for a required pattern that has no match in the content."""

    # The regular expression (as written in the schema) that did not match.
    pattern: str = ""
@dataclass
class ForbiddenPattern(ContentIssue):
    """Issue for a forbidden pattern that does match the content."""

    # The regular expression (as written in the schema) that matched.
    pattern: str = ""
@dataclass
class DiscouragedPattern(ContentIssue):
    """Issue for a discouraged pattern that does match the content."""

    # The regular expression (as written in the schema) that matched.
    pattern: str = ""
@dataclass
class ContentTooShort(ContentIssue):
    """Issue for content that falls below a minimum word/sentence count."""

    # Count measured in the content.
    actual: int = 0
    # Minimum count demanded by the schema.
    required: int = 0
@dataclass
class ContentTooLong(ContentIssue):
    """Issue for content that goes above a maximum word/sentence count."""

    # Count measured in the content.
    actual: int = 0
    # Maximum count allowed by the schema.
    limit: int = 0
@dataclass
class ContentValidationResult:
    """Outcome of a content-validation run.

    Carries every issue that was raised together with a counter supplied by
    the validator, and offers helpers to query the issues by severity.
    """

    issues: List['ContentIssue']
    sections_checked: int

    def _with_severity(self, level: str) -> List['ContentIssue']:
        """Return the issues whose severity equals *level*, in order."""
        return [entry for entry in self.issues if entry.severity == level]

    def has_errors(self) -> bool:
        """Return True when at least one issue has severity 'ERROR'."""
        return bool(self._with_severity('ERROR'))

    def has_warnings(self) -> bool:
        """Return True when at least one issue has severity 'WARNING'."""
        return bool(self._with_severity('WARNING'))

    def is_valid(self) -> bool:
        """Return True when validation passed, i.e. there are no errors."""
        if self.has_errors():
            return False
        return True

    def get_errors(self) -> List['ContentIssue']:
        """Return all ERROR-level issues."""
        return self._with_severity('ERROR')

    def get_warnings(self) -> List['ContentIssue']:
        """Return all WARNING-level issues."""
        return self._with_severity('WARNING')
class ContentValidator:
    """
    Validates content against x-markitect-content-control rules.

    For every section named in the schema extension the validator checks
    required, forbidden and discouraged regex patterns, plus the word and
    sentence count limits listed under ``content_quality``.
    """

    def __init__(self, schema: Dict[str, Any]):
        """
        Initialize validator with a schema.

        Args:
            schema: JSON schema with x-markitect-content-control extension
        """
        self.schema = schema
        # ``or {}`` also covers a schema that sets the extension to null,
        # which would otherwise crash on ``.items()`` in check().
        self.content_rules = schema.get('x-markitect-content-control') or {}

    def check(self, document: 'MarkdownDocument') -> 'ContentValidationResult':
        """
        Validate content against schema rules.

        Args:
            document: Parsed markdown document

        Returns:
            ContentValidationResult with any issues found
        """
        issues: List['ContentIssue'] = []
        sections_checked = 0

        for section_key, rules in self.content_rules.items():
            # NOTE: counts every rule entry, including those whose section is
            # absent from the document.
            sections_checked += 1

            section = self._get_section(document, section_key)
            if not section:
                # Section validator handles missing sections
                continue

            # A rule entry set to null means "no rules" rather than a crash.
            rules = rules or {}
            # Missing or None content is validated as empty text so that
            # re.search() / str.split() never receive None.
            section_content = section.get('content') or ''
            section_name = section.get('name', section_key)

            for checker in (
                self._check_required_patterns,
                self._check_forbidden_patterns,
                self._check_discouraged_patterns,
                self._check_quality_metrics,
            ):
                issues.extend(checker(section_name, section_content, rules))

        return ContentValidationResult(
            issues=issues,
            sections_checked=sections_checked
        )

    def _get_section(self, document: 'MarkdownDocument',
                     section_key: str) -> Optional[Dict[str, Any]]:
        """
        Get a section from the document.

        The document is duck-typed: ``get_section(name)`` is used when the
        document has it; otherwise the level-2 headings returned by
        ``get_headings_by_level(2)`` are searched.

        Args:
            document: Parsed markdown document
            section_key: Section name (lowercase in rules, uppercase in document)

        Returns:
            Section dict with name and content, or None if not found
        """
        # Convert section_key to uppercase for matching
        section_name = section_key.upper()

        if hasattr(document, 'get_section'):
            return document.get_section(section_name)

        # Fallback: search headings
        if hasattr(document, 'get_headings_by_level'):
            for heading in document.get_headings_by_level(2):
                if not isinstance(heading, dict):
                    continue
                # ``or ''`` guards against a heading whose content is None
                title = (heading.get('content') or '').strip().upper()
                if title == section_name:
                    return {
                        'name': section_name,
                        'content': heading.get('text_content', '')
                    }

        return None

    def _check_required_patterns(self, section_name: str, content: str,
                                 rules: Dict[str, Any]) -> List['ContentIssue']:
        """Check that all required patterns appear in content.

        Emits an ERROR-level PatternMissing for every pattern without a match
        and an ERROR-level ContentIssue for every pattern that is not a valid
        regular expression.
        """
        issues = []
        for pattern in rules.get('required_patterns') or []:
            try:
                found = re.search(pattern, content, re.MULTILINE)
            except re.error as e:
                # Invalid regex pattern in schema
                issues.append(ContentIssue(
                    section_name=section_name,
                    severity='ERROR',
                    message=f'Invalid regex pattern in schema: {e}'
                ))
                continue
            if not found:
                issues.append(PatternMissing(
                    section_name=section_name,
                    severity='ERROR',
                    message='Required pattern not found',
                    pattern=pattern
                ))
        return issues

    def _check_forbidden_patterns(self, section_name: str, content: str,
                                  rules: Dict[str, Any]) -> List['ContentIssue']:
        """Check that no forbidden patterns appear in content.

        Emits an ERROR-level ForbiddenPattern for the first match of every
        forbidden pattern and an ERROR-level ContentIssue for every pattern
        that is not a valid regular expression.
        """
        issues = []
        for pattern in rules.get('forbidden_patterns') or []:
            try:
                match = re.search(pattern, content, re.MULTILINE)
            except re.error as e:
                issues.append(ContentIssue(
                    section_name=section_name,
                    severity='ERROR',
                    message=f'Invalid regex pattern in schema: {e}'
                ))
                continue
            if match:
                issues.append(ForbiddenPattern(
                    section_name=section_name,
                    severity='ERROR',
                    message='Forbidden pattern found',
                    pattern=pattern,
                    matched_text=match.group(0)[:50]  # Limit to 50 chars
                ))
        return issues

    def _check_discouraged_patterns(self, section_name: str, content: str,
                                    rules: Dict[str, Any]) -> List['ContentIssue']:
        """Check for discouraged patterns (warnings).

        Emits a WARNING-level DiscouragedPattern for the first match of every
        discouraged pattern; an invalid regular expression is reported as a
        WARNING-level ContentIssue.
        """
        issues = []
        for pattern in rules.get('discouraged_patterns') or []:
            try:
                match = re.search(pattern, content, re.MULTILINE)
            except re.error as e:
                issues.append(ContentIssue(
                    section_name=section_name,
                    severity='WARNING',
                    message=f'Invalid regex pattern in schema: {e}'
                ))
                continue
            if match:
                issues.append(DiscouragedPattern(
                    section_name=section_name,
                    severity='WARNING',
                    message='Discouraged pattern found',
                    pattern=pattern,
                    matched_text=match.group(0)[:50]  # Limit to 50 chars
                ))
        return issues

    def _check_quality_metrics(self, section_name: str, content: str,
                               rules: Dict[str, Any]) -> List['ContentIssue']:
        """Check content quality metrics (word count, sentence count).

        Reads ``min_words``, ``max_words``, ``min_sentences`` and
        ``max_sentences`` from ``rules['content_quality']``; each limit is
        optional and every violation is reported at WARNING level.
        """
        issues = []
        quality = rules.get('content_quality', {})
        if not quality:
            return issues

        # Word count validation (whitespace-separated tokens)
        word_count = len(content.split())

        min_words = quality.get('min_words')
        if min_words is not None and word_count < min_words:
            issues.append(ContentTooShort(
                section_name=section_name,
                severity='WARNING',
                message=f'Content too short ({word_count} words, minimum {min_words})',
                actual=word_count,
                required=min_words
            ))

        max_words = quality.get('max_words')
        if max_words is not None and word_count > max_words:
            issues.append(ContentTooLong(
                section_name=section_name,
                severity='WARNING',
                message=f'Content too long ({word_count} words, maximum {max_words})',
                actual=word_count,
                limit=max_words
            ))

        # Sentence count validation
        min_sentences = quality.get('min_sentences')
        max_sentences = quality.get('max_sentences')
        if min_sentences is not None or max_sentences is not None:
            # Simple heuristic: every run of '.', '!' or '?' ends a sentence,
            # so abbreviations and decimal numbers are counted as well.
            sentence_count = len(re.findall(r'[.!?]+', content))

            if min_sentences is not None and sentence_count < min_sentences:
                issues.append(ContentTooShort(
                    section_name=section_name,
                    severity='WARNING',
                    message=f'Too few sentences ({sentence_count}, minimum {min_sentences})',
                    actual=sentence_count,
                    required=min_sentences
                ))

            # Upper bound mirrors max_words; ContentTooLong is documented as
            # covering sentence counts too.
            if max_sentences is not None and sentence_count > max_sentences:
                issues.append(ContentTooLong(
                    section_name=section_name,
                    severity='WARNING',
                    message=f'Too many sentences ({sentence_count}, maximum {max_sentences})',
                    actual=sentence_count,
                    limit=max_sentences
                ))

        return issues

View File

@@ -0,0 +1,226 @@
"""
Section Validator for markdown documents.
Validates that document sections comply with x-markitect-sections classifications:
- REQUIRED: Section must be present (ERROR if missing)
- RECOMMENDED: Section should be present (WARNING if missing)
- OPTIONAL: Section may be present (no check)
- DISCOURAGED: Section should not be present (WARNING if present)
- IMPROPER: Section must not be present (ERROR if present)
"""
from dataclasses import dataclass
from typing import List, Dict, Any, Optional
from pathlib import Path
@dataclass
class SectionIssue:
    """Record describing one problem with a section's presence or absence.

    ``str()`` renders the issue as ``[SEVERITY] (line N) SECTION: message``;
    the line part is omitted when ``line_number`` is unset (or otherwise
    falsy).
    """

    section_name: str
    severity: str  # 'ERROR', 'WARNING', 'INFO'
    message: str
    classification: str  # 'required', 'recommended', etc.
    line_number: Optional[int] = None

    def __str__(self) -> str:
        """Return the one-line, human-readable form of this issue."""
        prefix = f"[{self.severity}]"
        if self.line_number:
            prefix += f" (line {self.line_number})"
        return f"{prefix} {self.section_name}: {self.message}"
@dataclass
class SectionMissing(SectionIssue):
    """Issue for a section that is absent from the document."""
@dataclass
class SectionImproper(SectionIssue):
    """Issue for an improper section that is present in the document."""
@dataclass
class SectionDiscouraged(SectionIssue):
    """Issue for a discouraged section that is present in the document."""
@dataclass
class SectionValidationResult:
    """Outcome of a section-validation run.

    Carries every issue that was raised together with two counters supplied
    by the validator, and offers helpers to query the issues by severity.
    """

    issues: List['SectionIssue']
    sections_checked: int
    sections_found: int

    def _with_severity(self, level: str) -> List['SectionIssue']:
        """Return the issues whose severity equals *level*, in order."""
        return [entry for entry in self.issues if entry.severity == level]

    def has_errors(self) -> bool:
        """Return True when at least one issue has severity 'ERROR'."""
        return bool(self._with_severity('ERROR'))

    def has_warnings(self) -> bool:
        """Return True when at least one issue has severity 'WARNING'."""
        return bool(self._with_severity('WARNING'))

    def is_valid(self) -> bool:
        """Return True when validation passed, i.e. there are no errors."""
        if self.has_errors():
            return False
        return True

    def get_errors(self) -> List['SectionIssue']:
        """Return all ERROR-level issues."""
        return self._with_severity('ERROR')

    def get_warnings(self) -> List['SectionIssue']:
        """Return all WARNING-level issues."""
        return self._with_severity('WARNING')
class SectionValidator:
    """
    Validates section presence and classification compliance.

    Checks that markdown documents have the correct sections based on
    x-markitect-sections classifications in the schema. Classification
    values are matched case-insensitively; sections are matched by their
    upper-cased, stripped level-2 heading text.
    """

    def __init__(self, schema: Dict[str, Any]):
        """
        Initialize validator with a schema.

        Args:
            schema: JSON schema with x-markitect-sections extension
        """
        self.schema = schema
        # ``or {}`` also covers a schema that sets the extension to null,
        # which would otherwise crash on ``.items()`` in check().
        self.sections_spec = schema.get('x-markitect-sections') or {}

    @staticmethod
    def _classification_of(spec: Dict[str, Any]) -> str:
        """Return the spec's classification lower-cased, or '' when unset.

        Normalizing here lets 'REQUIRED' and 'required' behave the same
        instead of the upper-case spelling being silently ignored.
        """
        raw = spec.get('classification')
        return raw.strip().lower() if isinstance(raw, str) else ''

    def check(self, document: 'MarkdownDocument') -> 'SectionValidationResult':
        """
        Validate section presence against schema classifications.

        required/improper violations are reported as ERROR,
        recommended/discouraged violations as WARNING; any other
        classification (including 'optional') produces no issue.

        Args:
            document: Parsed markdown document

        Returns:
            SectionValidationResult with any issues found
        """
        issues: List['SectionIssue'] = []

        # Get level-2 headings (main sections) from document
        doc_sections = self._get_document_sections(document)

        for section_name, spec in self.sections_spec.items():
            # A spec entry set to null is treated as an empty spec.
            spec = spec or {}
            classification = self._classification_of(spec)
            section_in_doc = self._find_section(section_name, doc_sections, spec)

            if classification == 'required':
                if not section_in_doc:
                    issues.append(SectionMissing(
                        section_name=section_name,
                        severity='ERROR',
                        message=spec.get('error_message', f'{section_name} section is required'),
                        classification='required'
                    ))
            elif classification == 'improper':
                if section_in_doc:
                    issues.append(SectionImproper(
                        section_name=section_name,
                        severity='ERROR',
                        message=spec.get('error_message', f'{section_name} section must not appear'),
                        classification='improper',
                        line_number=section_in_doc.get('line_number')
                    ))
            elif classification == 'recommended':
                if not section_in_doc:
                    issues.append(SectionMissing(
                        section_name=section_name,
                        severity='WARNING',
                        message=spec.get('warning_if_missing', f'{section_name} section is recommended'),
                        classification='recommended'
                    ))
            elif classification == 'discouraged':
                if section_in_doc:
                    issues.append(SectionDiscouraged(
                        section_name=section_name,
                        severity='WARNING',
                        message=spec.get('warning_if_present', f'{section_name} section is discouraged'),
                        classification='discouraged',
                        line_number=section_in_doc.get('line_number')
                    ))

        return SectionValidationResult(
            issues=issues,
            sections_checked=len(self.sections_spec),
            sections_found=len(doc_sections)
        )

    def _get_document_sections(self, document: 'MarkdownDocument') -> List[Dict[str, Any]]:
        """
        Extract level-2 headings from document.

        The document is duck-typed: ``get_headings_by_level(2)`` is used when
        available, otherwise the ``headings`` attribute is filtered for
        entries with ``level == 2``. A document offering neither yields no
        sections.

        Args:
            document: Parsed markdown document

        Returns:
            List of section dicts with name and line_number
        """
        if hasattr(document, 'get_headings_by_level'):
            level_2_headings = document.get_headings_by_level(2)
        elif hasattr(document, 'headings'):
            level_2_headings = [
                h for h in document.headings
                if isinstance(h, dict) and h.get('level') == 2
            ]
        else:
            level_2_headings = []

        sections = []
        for heading in level_2_headings:
            if isinstance(heading, dict):
                sections.append({
                    # ``or ''`` guards against a heading whose content is None
                    'name': (heading.get('content') or '').strip().upper(),
                    'line_number': heading.get('line_number')
                })
            elif isinstance(heading, str):
                sections.append({
                    'name': heading.strip().upper(),
                    'line_number': None
                })
        return sections

    def _find_section(self, section_name: str, doc_sections: List[Dict[str, Any]],
                      spec: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """
        Find a section in document, checking alternatives.

        The primary name is tried first, then each entry of
        ``spec['alternatives']`` in order; the first document section that
        matches wins.

        Args:
            section_name: Primary section name to find
            doc_sections: List of sections in document
            spec: Section specification with potential alternatives

        Returns:
            Section dict if found, None otherwise
        """
        # ``or []`` tolerates an explicit null for 'alternatives'
        candidates = [section_name, *(spec.get('alternatives') or [])]
        for candidate in candidates:
            wanted = candidate.upper().strip()
            for section in doc_sections:
                if section['name'] == wanted:
                    return section
        return None