feat: add semantic document validator for x-markitect extensions

Implements semantic validation to complement existing structural validation:

Phase 1 & 2 Complete:
- SemanticValidator: Main validator orchestrating sub-validators
- SectionValidator: Enforces section classifications (required, recommended,
  optional, discouraged, improper) from x-markitect-sections
- ContentValidator: Validates content patterns, forbidden patterns, and
  quality metrics (word counts, sentence counts) from x-markitect-content-control

Features:
- Pattern matching with regex for required/forbidden/discouraged patterns
- Word count and sentence count validation
- Detailed error reporting with severity levels (ERROR, WARNING)
- Support for section alternatives (e.g., FLAGS vs OPTIONS)
- Comprehensive test coverage (16 tests, 100% passing)

Architecture:
- Complements existing SchemaValidator (structural AST validation)
- Clean separation: validators/ package for modular validators
- Semantic validation focuses on x-markitect-* extensions
- LinkValidator planned for Phase 3 (optional --check-links)

Next: Phase 4 - CLI integration to enhance 'markitect validate' command

Workplan: roadmap/20260106-semantic-document-validation/WORKPLAN.md

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-06 03:24:32 +01:00
parent f27eea6b5b
commit a969c5de47
6 changed files with 1932 additions and 0 deletions

View File

@@ -0,0 +1,261 @@
"""
Semantic Validator for markdown documents.
Validates markdown documents against x-markitect schema extensions:
- x-markitect-sections: Section classifications (required, recommended, etc.)
- x-markitect-content-control: Content patterns and quality metrics
- Link validation: Internal and external link checking (planned; not yet implemented)
Complements the existing SchemaValidator which handles structural AST validation.
"""
from dataclasses import dataclass
from typing import List, Dict, Any, Optional
from pathlib import Path
import json
from markitect.validators.section_validator import (
SectionValidator,
SectionValidationResult
)
from markitect.validators.content_validator import (
ContentValidator,
ContentValidationResult
)
@dataclass
class SemanticValidationReport:
    """
    Report of semantic validation results.

    Combines the results of the section validator (always present) with the
    optional content and link validator results, and offers aggregate
    queries plus a human-readable text rendering.
    """

    section_result: "SectionValidationResult"
    content_result: Optional["ContentValidationResult"] = None
    link_result: Optional[Any] = None  # LinkValidationResult when implemented

    def _optional_results(self) -> List[Any]:
        """Return the optional sub-results (content, link) that are set."""
        return [
            result
            for result in (self.content_result, self.link_result)
            if result is not None
        ]

    @staticmethod
    def _format_issue_lines(issues: List[Any]) -> List[str]:
        """Render one text line per issue, prefixed with a severity marker."""
        rendered = []
        for issue in issues:
            # ERROR issues previously got an empty marker, which made them
            # harder to spot than warnings; give them an explicit one.
            status = "❌" if issue.severity == 'ERROR' else "⚠️"
            rendered.append(f" {status} {issue.section_name} - {issue.message}")
        return rendered

    def has_errors(self) -> bool:
        """Check if there are any ERROR-level issues in any sub-result."""
        if self.section_result.has_errors():
            return True
        return any(
            result.has_errors()
            for result in self._optional_results()
            if hasattr(result, 'has_errors')
        )

    def has_warnings(self) -> bool:
        """Check if there are any WARNING-level issues in any sub-result."""
        if self.section_result.has_warnings():
            return True
        return any(
            result.has_warnings()
            for result in self._optional_results()
            if hasattr(result, 'has_warnings')
        )

    def is_valid(self) -> bool:
        """Check if validation passed (no errors)."""
        return not self.has_errors()

    def get_all_issues(self) -> List[Any]:
        """Get all issues from all validators (section, content, link order)."""
        issues = list(self.section_result.issues)
        for result in self._optional_results():
            if hasattr(result, 'issues'):
                issues.extend(result.issues)
        return issues

    def format_text(self) -> str:
        """
        Format validation report as text.

        Returns:
            A multi-line string with a section block, a content block (only
            when a content result is present) and a summary whose error and
            warning counts cover every sub-result, so the counts always agree
            with the PASSED/FAILED status.
        """
        lines = ["Section Validation:"]
        section_lines = self._format_issue_lines(self.section_result.issues)
        lines.extend(section_lines or [" ✅ All section requirements met"])

        if self.content_result is not None:
            lines.append("")
            lines.append("Content Validation:")
            content_lines = self._format_issue_lines(self.content_result.issues)
            lines.extend(content_lines or [" ✅ All content requirements met"])

        # Count from the combined issue list so link issues (which already
        # influence has_errors) are not silently left out of the summary.
        all_issues = self.get_all_issues()
        error_count = sum(
            1 for issue in all_issues
            if getattr(issue, 'severity', None) == 'ERROR'
        )
        warning_count = sum(
            1 for issue in all_issues
            if getattr(issue, 'severity', None) == 'WARNING'
        )

        lines.append("")
        lines.append("Summary:")
        lines.append(f" Sections checked: {self.section_result.sections_checked}")
        lines.append(f" Sections found: {self.section_result.sections_found}")
        lines.append(f" Errors: {error_count}")
        lines.append(f" Warnings: {warning_count}")
        if self.is_valid():
            lines.append(" Status: PASSED ✅")
        else:
            lines.append(" Status: FAILED ❌")
        return "\n".join(lines)
class SemanticValidator:
    """
    Orchestrates semantic checks driven by x-markitect-* schema extensions.

    Structural (AST) validation is the job of the existing SchemaValidator;
    this class delegates to a SectionValidator and a ContentValidator and
    bundles their results into one report.

    Example:
        >>> schema = load_schema('manpage-schema-v1.0.md')
        >>> validator = SemanticValidator(schema)
        >>> report = validator.validate('my-command.1.md')
        >>> if not report.is_valid():
        ...     print(report.format_text())
    """

    def __init__(self, schema: Dict[str, Any]):
        """
        Build the sub-validators from a schema.

        Args:
            schema: Schema dict (from JSON, or from markdown with embedded
                JSON) expected to carry x-markitect-sections and/or
                x-markitect-content-control.
        """
        self.schema = schema
        self.section_validator = SectionValidator(schema)
        self.content_validator = ContentValidator(schema)
        # TODO: add a link validator once it is implemented.

    def validate(self, document_path: str | Path,
                 check_links: bool = False) -> SemanticValidationReport:
        """
        Validate a markdown document against the schema extensions.

        Args:
            document_path: Location of the markdown document.
            check_links: Accepted for forward compatibility; link validation
                is not implemented yet, so the flag currently has no effect.

        Returns:
            SemanticValidationReport holding the section and content results;
            its link_result is always None for now.

        Raises:
            FileNotFoundError: If document_path doesn't exist.
        """
        path = Path(document_path)
        if not path.exists():
            raise FileNotFoundError(f"Document not found: {path}")

        parsed = self._parse_document(path)

        # Sections first, then content - the same order the report uses.
        sections = self.section_validator.check(parsed)
        contents = self.content_validator.check(parsed)

        # TODO: run link validation here when check_links is set.
        return SemanticValidationReport(
            section_result=sections,
            content_result=contents,
            link_result=None,
        )

    def _parse_document(self, document_path: Path) -> 'MarkdownDocument':
        """
        Turn a markdown file into a parsed document object.

        Args:
            document_path: Path to the markdown file.

        Returns:
            Whatever DocumentManager.ingest_file yields for that path.
        """
        # Deferred import: importing at module level would be circular.
        from markitect.document_manager import DocumentManager

        return DocumentManager().ingest_file(document_path)
def load_schema_from_path(schema_path: str | Path) -> Dict[str, Any]:
"""
Load a schema from file (supports .json and .md formats).
Args:
schema_path: Path to schema file
Returns:
Schema dict with embedded JSON
Raises:
FileNotFoundError: If schema file doesn't exist
ValueError: If schema cannot be parsed
"""
schema_path = Path(schema_path)
if not schema_path.exists():
raise FileNotFoundError(f"Schema not found: {schema_path}")
if schema_path.suffix == '.json':
# Load JSON schema directly
with open(schema_path, 'r', encoding='utf-8') as f:
return json.load(f)
elif schema_path.suffix == '.md':
# Load markdown schema with embedded JSON
from markitect.schema_loader import MarkdownSchemaLoader
loader = MarkdownSchemaLoader()
schema_data = loader.load_schema(schema_path)
return schema_data['schema']
else:
raise ValueError(f"Unsupported schema format: {schema_path.suffix}")

View File

@@ -0,0 +1,50 @@
"""
Validators package for semantic document validation.
This package contains validators that check markdown documents against
x-markitect schema extensions (sections, content-control, link validation).
Validators:
- SectionValidator: Validates section presence based on classifications
- ContentValidator: Validates content patterns and quality metrics
- LinkValidator: Validates internal and external links (planned; not yet part of this package)
"""
from markitect.validators.section_validator import (
SectionValidator,
SectionValidationResult,
SectionIssue,
SectionMissing,
SectionImproper,
SectionDiscouraged,
)
from markitect.validators.content_validator import (
ContentValidator,
ContentValidationResult,
ContentIssue,
PatternMissing,
ForbiddenPattern,
DiscouragedPattern,
ContentTooShort,
ContentTooLong,
)
# Public API of the validators package: everything re-exported by the imports
# above, grouped by the validator module it comes from.
__all__ = [
    # Section validator
    'SectionValidator',
    'SectionValidationResult',
    'SectionIssue',
    'SectionMissing',
    'SectionImproper',
    'SectionDiscouraged',
    # Content validator
    'ContentValidator',
    'ContentValidationResult',
    'ContentIssue',
    'PatternMissing',
    'ForbiddenPattern',
    'DiscouragedPattern',
    'ContentTooShort',
    'ContentTooLong',
]

View File

@@ -0,0 +1,316 @@
"""
Content Validator for markdown documents.
Validates content against x-markitect-content-control rules:
- Required patterns: Regex patterns that must appear in content
- Discouraged patterns: Patterns that should be avoided (warnings)
- Forbidden patterns: Patterns that must not appear (errors)
- Quality metrics: Word counts and sentence counts (readability checks are not implemented yet)
"""
from dataclasses import dataclass
from typing import List, Dict, Any, Optional
import re
@dataclass
class ContentIssue:
    """
    A single finding produced while validating section content.

    Serves as the common base for the more specific content issue types and
    is also used directly for problems such as an invalid schema regex.
    """

    section_name: str
    severity: str  # 'ERROR', 'WARNING', 'INFO'
    message: str
    line_number: Optional[int] = None
    matched_text: Optional[str] = None

    def __str__(self) -> str:
        """Render as ``[SEVERITY] (line N) SECTION - message: 'match'``.

        The line and match fragments are left out when the corresponding
        attribute is unset (or otherwise falsy).
        """
        pieces = [f"[{self.severity}]"]
        if self.line_number:
            pieces.append(f" (line {self.line_number})")
        pieces.append(f" {self.section_name} - {self.message}")
        if self.matched_text:
            pieces.append(f": '{self.matched_text}'")
        return "".join(pieces)
@dataclass
class PatternMissing(ContentIssue):
    """Issue reported when a required pattern is not found in a section's content."""

    # The regex from the schema's 'required_patterns' list that did not match.
    pattern: str = ""
@dataclass
class ForbiddenPattern(ContentIssue):
    """Issue reported when a forbidden pattern is found in a section's content."""

    # The regex from the schema's 'forbidden_patterns' list that matched.
    pattern: str = ""
@dataclass
class DiscouragedPattern(ContentIssue):
    """Issue reported when a discouraged pattern is found in a section's content."""

    # The regex from the schema's 'discouraged_patterns' list that matched.
    pattern: str = ""
@dataclass
class ContentTooShort(ContentIssue):
    """Issue reported when content falls below a minimum word or sentence count."""

    # Count measured in the content (words or sentences, depending on the check).
    actual: int = 0
    # Minimum demanded by the schema ('min_words' or 'min_sentences').
    required: int = 0
@dataclass
class ContentTooLong(ContentIssue):
    """Issue reported when content exceeds a maximum word/sentence count."""

    # Count measured in the content.
    actual: int = 0
    # Maximum allowed by the schema (e.g. 'max_words').
    limit: int = 0
@dataclass
class ContentValidationResult:
    """Outcome of a content validation pass: the issues plus a rule counter."""

    issues: List[ContentIssue]
    sections_checked: int

    def has_errors(self) -> bool:
        """Return True when at least one issue has severity 'ERROR'."""
        for entry in self.issues:
            if entry.severity == 'ERROR':
                return True
        return False

    def has_warnings(self) -> bool:
        """Return True when at least one issue has severity 'WARNING'."""
        for entry in self.issues:
            if entry.severity == 'WARNING':
                return True
        return False

    def is_valid(self) -> bool:
        """Return True when no ERROR-level issue was recorded."""
        if self.has_errors():
            return False
        return True

    def get_errors(self) -> List[ContentIssue]:
        """Return a new list holding only the ERROR-level issues."""
        selected = []
        for entry in self.issues:
            if entry.severity == 'ERROR':
                selected.append(entry)
        return selected

    def get_warnings(self) -> List[ContentIssue]:
        """Return a new list holding only the WARNING-level issues."""
        selected = []
        for entry in self.issues:
            if entry.severity == 'WARNING':
                selected.append(entry)
        return selected
class ContentValidator:
    """
    Validates content against x-markitect-content-control rules.

    For every section that has rules in the schema, checks required,
    forbidden and discouraged regex patterns and the word/sentence count
    limits of that section's content.
    """

    def __init__(self, schema: Dict[str, Any]):
        """
        Initialize validator with a schema.

        Args:
            schema: JSON schema with x-markitect-content-control extension
                (an absent extension simply means "no content rules").
        """
        self.schema = schema
        self.content_rules = schema.get('x-markitect-content-control', {})

    def check(self, document: 'MarkdownDocument') -> ContentValidationResult:
        """
        Validate content against schema rules.

        Args:
            document: Parsed markdown document

        Returns:
            ContentValidationResult with any issues found. Its
            sections_checked counts every section that has rules in the
            schema, including ones that are absent from the document.
        """
        issues = []
        sections_checked = 0
        checkers = (
            self._check_required_patterns,
            self._check_forbidden_patterns,
            self._check_discouraged_patterns,
            self._check_quality_metrics,
        )

        for section_key, rules in self.content_rules.items():
            sections_checked += 1

            section = self._get_section(document, section_key)
            if not section:
                # Section validator handles missing sections
                continue

            # A section may carry an explicit None as content; treat it like
            # empty text so the regex and counting code below cannot crash.
            section_content = section.get('content') or ''
            section_name = section.get('name', section_key)

            for checker in checkers:
                issues.extend(checker(section_name, section_content, rules))

        return ContentValidationResult(
            issues=issues,
            sections_checked=sections_checked
        )

    def _get_section(self, document: 'MarkdownDocument',
                     section_key: str) -> Optional[Dict[str, Any]]:
        """
        Get a section from the document.

        Args:
            document: Parsed markdown document
            section_key: Section name as written in the rules; it is
                upper-cased before being looked up in the document

        Returns:
            Section dict with name and content, or None if not found
        """
        section_name = section_key.upper()

        # Preferred path: the document knows how to look up sections itself.
        if hasattr(document, 'get_section'):
            return document.get_section(section_name)

        # Fallback: search level-2 headings for a matching title.
        if hasattr(document, 'get_headings_by_level'):
            for heading in document.get_headings_by_level(2):
                if not isinstance(heading, dict):
                    continue
                title = (heading.get('content') or '').strip().upper()
                if title == section_name:
                    return {
                        'name': section_name,
                        'content': heading.get('text_content', '')
                    }
        return None

    def _check_patterns(self, section_name: str, content: str,
                        patterns: List[str], invalid_severity: str,
                        make_issue) -> List[ContentIssue]:
        """
        Run each regex against content and collect the resulting issues.

        Args:
            section_name: Name used in the reported issues
            content: Section text to search
            patterns: Regex strings from the schema
            invalid_severity: Severity of the issue reported for a regex
                that does not compile
            make_issue: Callable ``(pattern, match)`` returning an issue or
                None; ``match`` is the re.search result (None if no match)

        Returns:
            Issues in the same order as ``patterns``
        """
        issues = []
        for pattern in patterns:
            try:
                match = re.search(pattern, content, re.MULTILINE)
            except re.error as e:
                # Invalid regex pattern in schema
                issues.append(ContentIssue(
                    section_name=section_name,
                    severity=invalid_severity,
                    message=f'Invalid regex pattern in schema: {e}'
                ))
                continue
            issue = make_issue(pattern, match)
            if issue is not None:
                issues.append(issue)
        return issues

    def _check_required_patterns(self, section_name: str, content: str,
                                 rules: Dict[str, Any]) -> List[ContentIssue]:
        """Check that all required patterns appear in content (ERROR if not)."""
        def on_result(pattern, match):
            if match:
                return None
            return PatternMissing(
                section_name=section_name,
                severity='ERROR',
                message='Required pattern not found',
                pattern=pattern
            )

        return self._check_patterns(
            section_name, content,
            rules.get('required_patterns') or [], 'ERROR', on_result
        )

    def _check_forbidden_patterns(self, section_name: str, content: str,
                                  rules: Dict[str, Any]) -> List[ContentIssue]:
        """Check that no forbidden patterns appear in content (ERROR if so)."""
        def on_result(pattern, match):
            if not match:
                return None
            return ForbiddenPattern(
                section_name=section_name,
                severity='ERROR',
                message='Forbidden pattern found',
                pattern=pattern,
                matched_text=match.group(0)[:50]  # Limit to 50 chars
            )

        return self._check_patterns(
            section_name, content,
            rules.get('forbidden_patterns') or [], 'ERROR', on_result
        )

    def _check_discouraged_patterns(self, section_name: str, content: str,
                                    rules: Dict[str, Any]) -> List[ContentIssue]:
        """Check for discouraged patterns (reported as warnings)."""
        def on_result(pattern, match):
            if not match:
                return None
            return DiscouragedPattern(
                section_name=section_name,
                severity='WARNING',
                message='Discouraged pattern found',
                pattern=pattern,
                matched_text=match.group(0)[:50]  # Limit to 50 chars
            )

        return self._check_patterns(
            section_name, content,
            rules.get('discouraged_patterns') or [], 'WARNING', on_result
        )

    def _check_quality_metrics(self, section_name: str, content: str,
                               rules: Dict[str, Any]) -> List[ContentIssue]:
        """
        Check content quality metrics (word count, sentence count).

        Reads 'min_words', 'max_words' and 'min_sentences' from the rules'
        'content_quality' mapping; every violation is a WARNING.
        """
        issues = []
        quality = rules.get('content_quality', {})
        if not quality:
            return issues

        # Word count validation (whitespace-separated tokens)
        word_count = len(content.split())

        min_words = quality.get('min_words')
        if min_words is not None and word_count < min_words:
            issues.append(ContentTooShort(
                section_name=section_name,
                severity='WARNING',
                message=f'Content too short ({word_count} words, minimum {min_words})',
                actual=word_count,
                required=min_words
            ))

        max_words = quality.get('max_words')
        if max_words is not None and word_count > max_words:
            issues.append(ContentTooLong(
                section_name=section_name,
                severity='WARNING',
                message=f'Content too long ({word_count} words, maximum {max_words})',
                actual=word_count,
                limit=max_words
            ))

        # Sentence count validation
        min_sentences = quality.get('min_sentences')
        if min_sentences is not None:
            # Simple heuristic: each run of '.', '!' or '?' ends one sentence.
            sentence_count = len(re.findall(r'[.!?]+', content))
            if sentence_count < min_sentences:
                issues.append(ContentTooShort(
                    section_name=section_name,
                    severity='WARNING',
                    message=f'Too few sentences ({sentence_count}, minimum {min_sentences})',
                    actual=sentence_count,
                    required=min_sentences
                ))
        return issues

View File

@@ -0,0 +1,226 @@
"""
Section Validator for markdown documents.
Validates that document sections comply with x-markitect-sections classifications:
- REQUIRED: Section must be present (ERROR if missing)
- RECOMMENDED: Section should be present (WARNING if missing)
- OPTIONAL: Section may be present (no check)
- DISCOURAGED: Section should not be present (WARNING if present)
- IMPROPER: Section must not be present (ERROR if present)
"""
from dataclasses import dataclass
from typing import List, Dict, Any, Optional
from pathlib import Path
@dataclass
class SectionIssue:
    """
    A single finding about the presence or absence of a document section.

    Common base of the more specific section issue types.
    """

    section_name: str
    severity: str  # 'ERROR', 'WARNING', 'INFO'
    message: str
    classification: str  # 'required', 'recommended', etc.
    line_number: Optional[int] = None

    def __str__(self) -> str:
        """Render as ``[SEVERITY] (line N) SECTION: message``.

        The line fragment is left out when line_number is unset (or falsy).
        """
        prefix = f"[{self.severity}]"
        if self.line_number:
            prefix += f" (line {self.line_number})"
        return f"{prefix} {self.section_name}: {self.message}"
@dataclass
class SectionMissing(SectionIssue):
    """Issue reported when a section the schema expects is missing from the document."""
    pass
@dataclass
class SectionImproper(SectionIssue):
    """Issue reported when a section classified as improper is found in the document."""
    pass
@dataclass
class SectionDiscouraged(SectionIssue):
    """Issue reported when a section classified as discouraged is found in the document."""
    pass
@dataclass
class SectionValidationResult:
    """Outcome of a section validation pass: the issues plus two counters."""

    issues: List[SectionIssue]
    sections_checked: int
    sections_found: int

    def has_errors(self) -> bool:
        """Return True when at least one issue has severity 'ERROR'."""
        for entry in self.issues:
            if entry.severity == 'ERROR':
                return True
        return False

    def has_warnings(self) -> bool:
        """Return True when at least one issue has severity 'WARNING'."""
        for entry in self.issues:
            if entry.severity == 'WARNING':
                return True
        return False

    def is_valid(self) -> bool:
        """Return True when no ERROR-level issue was recorded."""
        if self.has_errors():
            return False
        return True

    def get_errors(self) -> List[SectionIssue]:
        """Return a new list holding only the ERROR-level issues."""
        selected = []
        for entry in self.issues:
            if entry.severity == 'ERROR':
                selected.append(entry)
        return selected

    def get_warnings(self) -> List[SectionIssue]:
        """Return a new list holding only the WARNING-level issues."""
        selected = []
        for entry in self.issues:
            if entry.severity == 'WARNING':
                selected.append(entry)
        return selected
class SectionValidator:
    """
    Validates section presence and classification compliance.

    Checks that markdown documents have the correct sections based on
    x-markitect-sections classifications in the schema. Classification
    values are matched case-insensitively ('required' and 'REQUIRED' are
    equivalent), so a schema written in upper case is not silently ignored.
    """

    def __init__(self, schema: Dict[str, Any]):
        """
        Initialize validator with a schema.

        Args:
            schema: JSON schema with x-markitect-sections extension
                (an absent extension simply means "no section rules").
        """
        self.schema = schema
        self.sections_spec = schema.get('x-markitect-sections', {})

    def check(self, document: 'MarkdownDocument') -> SectionValidationResult:
        """
        Validate section presence against schema classifications.

        Args:
            document: Parsed markdown document

        Returns:
            SectionValidationResult with any issues found; sections_checked
            is the number of section specs in the schema and sections_found
            the number of level-2 headings extracted from the document.
        """
        issues = []

        # Get level-2 headings (main sections) from document
        doc_sections = self._get_document_sections(document)

        for section_name, spec in self.sections_spec.items():
            # Normalize so that casing/whitespace in the schema cannot make
            # a rule silently inactive.
            classification = str(spec.get('classification') or '').strip().lower()
            found = self._find_section(section_name, doc_sections, spec)

            if classification == 'required' and found is None:
                issues.append(SectionMissing(
                    section_name=section_name,
                    severity='ERROR',
                    message=spec.get('error_message', f'{section_name} section is required'),
                    classification='required'
                ))
            elif classification == 'improper' and found is not None:
                issues.append(SectionImproper(
                    section_name=section_name,
                    severity='ERROR',
                    message=spec.get('error_message', f'{section_name} section must not appear'),
                    classification='improper',
                    line_number=found.get('line_number')
                ))
            elif classification == 'recommended' and found is None:
                issues.append(SectionMissing(
                    section_name=section_name,
                    severity='WARNING',
                    message=spec.get('warning_if_missing', f'{section_name} section is recommended'),
                    classification='recommended'
                ))
            elif classification == 'discouraged' and found is not None:
                issues.append(SectionDiscouraged(
                    section_name=section_name,
                    severity='WARNING',
                    message=spec.get('warning_if_present', f'{section_name} section is discouraged'),
                    classification='discouraged',
                    line_number=found.get('line_number')
                ))
            # 'optional' (and any unknown classification) needs no check.

        return SectionValidationResult(
            issues=issues,
            sections_checked=len(self.sections_spec),
            sections_found=len(doc_sections)
        )

    def _get_document_sections(self, document: 'MarkdownDocument') -> List[Dict[str, Any]]:
        """
        Extract level-2 headings from document.

        Args:
            document: Parsed markdown document

        Returns:
            List of section dicts with 'name' (stripped, upper-cased heading
            text) and 'line_number' (None when the heading is a plain string)
        """
        if hasattr(document, 'get_headings_by_level'):
            level_2_headings = document.get_headings_by_level(2)
        elif hasattr(document, 'headings'):
            level_2_headings = [
                h for h in document.headings
                if h.get('level') == 2
            ]
        else:
            # No known way to obtain headings from this document object.
            level_2_headings = []

        sections = []
        for heading in level_2_headings:
            if isinstance(heading, dict):
                sections.append({
                    # 'content' may be present but None; treat as empty.
                    'name': (heading.get('content') or '').strip().upper(),
                    'line_number': heading.get('line_number')
                })
            elif isinstance(heading, str):
                sections.append({
                    'name': heading.strip().upper(),
                    'line_number': None
                })
        return sections

    def _find_section(self, section_name: str, doc_sections: List[Dict[str, Any]],
                      spec: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """
        Find a section in document, checking alternatives.

        Args:
            section_name: Primary section name to find
            doc_sections: List of sections in document
            spec: Section specification with potential 'alternatives'

        Returns:
            Section dict if found, None otherwise. The primary name wins over
            alternatives, and alternatives are tried in schema order.
        """
        # A schema may spell out "alternatives": null; treat it as no alternatives.
        candidates = [section_name, *(spec.get('alternatives') or [])]
        for candidate in candidates:
            normalized = candidate.upper().strip()
            for section in doc_sections:
                if section['name'] == normalized:
                    return section
        return None