feat: add semantic document validator for x-markitect extensions
Implements semantic validation to complement existing structural validation: Phase 1 & 2 Complete: - SemanticValidator: Main validator orchestrating sub-validators - SectionValidator: Enforces section classifications (required, recommended, optional, discouraged, improper) from x-markitect-sections - ContentValidator: Validates content patterns, forbidden patterns, and quality metrics (word counts, sentence counts) from x-markitect-content-control Features: - Pattern matching with regex for required/forbidden/discouraged patterns - Word count and sentence count validation - Detailed error reporting with severity levels (ERROR, WARNING) - Support for section alternatives (e.g., FLAGS vs OPTIONS) - Comprehensive test coverage (16 tests, 100% passing) Architecture: - Complements existing SchemaValidator (structural AST validation) - Clean separation: validators/ package for modular validators - Semantic validation focuses on x-markitect-* extensions - LinkValidator planned for Phase 3 (optional --check-links) Next: Phase 4 - CLI integration to enhance 'markitect validate' command Workplan: roadmap/20260106-semantic-document-validation/WORKPLAN.md 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
261
markitect/semantic_validator.py
Normal file
261
markitect/semantic_validator.py
Normal file
@@ -0,0 +1,261 @@
|
||||
"""
|
||||
Semantic Validator for markdown documents.
|
||||
|
||||
Validates markdown documents against x-markitect schema extensions:
|
||||
- x-markitect-sections: Section classifications (required, recommended, etc.)
|
||||
- x-markitect-content-control: Content patterns and quality metrics
|
||||
- Link validation: Internal and external link checking (planned; not yet implemented)
|
||||
|
||||
Complements the existing SchemaValidator which handles structural AST validation.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Dict, Any, Optional
|
||||
from pathlib import Path
|
||||
import json
|
||||
|
||||
from markitect.validators.section_validator import (
|
||||
SectionValidator,
|
||||
SectionValidationResult
|
||||
)
|
||||
from markitect.validators.content_validator import (
|
||||
ContentValidator,
|
||||
ContentValidationResult
|
||||
)
|
||||
|
||||
|
||||
@dataclass
class SemanticValidationReport:
    """
    Report of semantic validation results.

    Combines the results produced by the section, content, and link
    validators. Only ``section_result`` is mandatory; the other results
    are skipped when they are missing (or falsy).
    """
    # Result of the section validator (always present).
    section_result: SectionValidationResult
    # Result of the content validator, if content validation ran.
    content_result: Optional[ContentValidationResult] = None
    # LinkValidationResult when implemented
    link_result: Optional[Any] = None

    def _optional_results(self) -> List[Any]:
        """Return the optional sub-results (content, link) that are present."""
        return [
            result
            for result in (self.content_result, self.link_result)
            if result
        ]

    @staticmethod
    def _format_issues(issues: List[Any], all_clear: str) -> List[str]:
        """Render one line per issue, or the ``all_clear`` line if none."""
        if not issues:
            return [all_clear]

        rendered = []
        for issue in issues:
            status = "❌" if issue.severity == 'ERROR' else "⚠️"
            rendered.append(f" {status} {issue.section_name} - {issue.message}")
        return rendered

    def has_errors(self) -> bool:
        """Check if there are any ERROR-level issues."""
        if self.section_result.has_errors():
            return True

        return any(
            result.has_errors()
            for result in self._optional_results()
            if hasattr(result, 'has_errors')
        )

    def has_warnings(self) -> bool:
        """Check if there are any WARNING-level issues."""
        if self.section_result.has_warnings():
            return True

        return any(
            result.has_warnings()
            for result in self._optional_results()
            if hasattr(result, 'has_warnings')
        )

    def is_valid(self) -> bool:
        """Check if validation passed (no errors)."""
        return not self.has_errors()

    def get_all_issues(self) -> List[Any]:
        """
        Get all issues from all validators.

        Returns:
            A new list: section issues first, then the issues of each
            optional result that exposes an ``issues`` attribute.
        """
        issues = list(self.section_result.issues)

        for result in self._optional_results():
            if hasattr(result, 'issues'):
                issues.extend(result.issues)

        return issues

    def format_text(self) -> str:
        """
        Format validation report as text.

        The report lists section issues, then content issues (when a
        content result is present), then a summary with the number of
        sections checked/found, the error and warning counts, and the
        overall status.

        Returns:
            The report as a single newline-joined string.
        """
        lines = ["Section Validation:"]
        lines.extend(
            self._format_issues(
                self.section_result.issues,
                " ✅ All section requirements met",
            )
        )

        # Content validation
        if self.content_result:
            lines.append("")
            lines.append("Content Validation:")
            lines.extend(
                self._format_issues(
                    self.content_result.issues,
                    " ✅ All content requirements met",
                )
            )

        # Summary
        lines.append("")
        lines.append("Summary:")
        lines.append(f" Sections checked: {self.section_result.sections_checked}")
        lines.append(f" Sections found: {self.section_result.sections_found}")

        # Copy before extending: get_errors()/get_warnings() may hand back
        # the sub-result's own list, and formatting must never modify it.
        all_errors = list(self.section_result.get_errors())
        all_warnings = list(self.section_result.get_warnings())

        # Count every optional result (content and link) so the totals
        # agree with the status line, which is derived from has_errors().
        for result in self._optional_results():
            if hasattr(result, 'get_errors'):
                all_errors.extend(result.get_errors())
            if hasattr(result, 'get_warnings'):
                all_warnings.extend(result.get_warnings())

        lines.append(f" Errors: {len(all_errors)}")
        lines.append(f" Warnings: {len(all_warnings)}")

        if self.is_valid():
            lines.append(" Status: PASSED ✅")
        else:
            lines.append(" Status: FAILED ❌")

        return "\n".join(lines)
|
||||
|
||||
|
||||
class SemanticValidator:
|
||||
"""
|
||||
Validates markdown documents against x-markitect extensions.
|
||||
|
||||
Complements existing SchemaValidator which handles structural AST validation.
|
||||
This validator checks semantic aspects defined in x-markitect-* extensions.
|
||||
|
||||
Example:
|
||||
>>> schema = load_schema('manpage-schema-v1.0.md')
|
||||
>>> validator = SemanticValidator(schema)
|
||||
>>> report = validator.validate('my-command.1.md')
|
||||
>>> if not report.is_valid():
|
||||
... print(report.format_text())
|
||||
"""
|
||||
|
||||
def __init__(self, schema: Dict[str, Any]):
|
||||
"""
|
||||
Initialize semantic validator with a schema.
|
||||
|
||||
Args:
|
||||
schema: JSON schema with x-markitect-* extensions
|
||||
|
||||
The schema can be either:
|
||||
- A dict loaded from JSON
|
||||
- A dict loaded from markdown with embedded JSON
|
||||
- Must contain x-markitect-sections and/or x-markitect-content-control
|
||||
"""
|
||||
self.schema = schema
|
||||
|
||||
# Initialize sub-validators
|
||||
self.section_validator = SectionValidator(schema)
|
||||
self.content_validator = ContentValidator(schema)
|
||||
|
||||
# TODO: Initialize link validator when implemented
|
||||
# self.link_validator = LinkValidator(schema)
|
||||
|
||||
def validate(self, document_path: str | Path,
|
||||
check_links: bool = False) -> SemanticValidationReport:
|
||||
"""
|
||||
Validate a markdown document against schema extensions.
|
||||
|
||||
Args:
|
||||
document_path: Path to markdown document to validate
|
||||
check_links: Whether to validate links (may be slow)
|
||||
|
||||
Returns:
|
||||
SemanticValidationReport with validation results
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: If document_path doesn't exist
|
||||
ValueError: If document cannot be parsed
|
||||
"""
|
||||
document_path = Path(document_path)
|
||||
|
||||
if not document_path.exists():
|
||||
raise FileNotFoundError(f"Document not found: {document_path}")
|
||||
|
||||
# Parse document
|
||||
document = self._parse_document(document_path)
|
||||
|
||||
# Run section validation
|
||||
section_result = self.section_validator.check(document)
|
||||
|
||||
# Run content validation
|
||||
content_result = self.content_validator.check(document)
|
||||
|
||||
# TODO: Run link validation when implemented
|
||||
# if check_links:
|
||||
# link_result = self.link_validator.check(document)
|
||||
# else:
|
||||
# link_result = None
|
||||
link_result = None
|
||||
|
||||
return SemanticValidationReport(
|
||||
section_result=section_result,
|
||||
content_result=content_result,
|
||||
link_result=link_result
|
||||
)
|
||||
|
||||
def _parse_document(self, document_path: Path) -> 'MarkdownDocument':
|
||||
"""
|
||||
Parse markdown document into AST.
|
||||
|
||||
Args:
|
||||
document_path: Path to markdown file
|
||||
|
||||
Returns:
|
||||
Parsed MarkdownDocument object
|
||||
|
||||
This uses the existing markitect markdown parser.
|
||||
"""
|
||||
# Import here to avoid circular dependency
|
||||
from markitect.document_manager import DocumentManager
|
||||
|
||||
# Use DocumentManager to parse the document
|
||||
doc_manager = DocumentManager()
|
||||
doc = doc_manager.ingest_file(document_path)
|
||||
|
||||
return doc
|
||||
|
||||
|
||||
def load_schema_from_path(schema_path: str | Path) -> Dict[str, Any]:
|
||||
"""
|
||||
Load a schema from file (supports .json and .md formats).
|
||||
|
||||
Args:
|
||||
schema_path: Path to schema file
|
||||
|
||||
Returns:
|
||||
Schema dict with embedded JSON
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: If schema file doesn't exist
|
||||
ValueError: If schema cannot be parsed
|
||||
"""
|
||||
schema_path = Path(schema_path)
|
||||
|
||||
if not schema_path.exists():
|
||||
raise FileNotFoundError(f"Schema not found: {schema_path}")
|
||||
|
||||
if schema_path.suffix == '.json':
|
||||
# Load JSON schema directly
|
||||
with open(schema_path, 'r', encoding='utf-8') as f:
|
||||
return json.load(f)
|
||||
|
||||
elif schema_path.suffix == '.md':
|
||||
# Load markdown schema with embedded JSON
|
||||
from markitect.schema_loader import MarkdownSchemaLoader
|
||||
|
||||
loader = MarkdownSchemaLoader()
|
||||
schema_data = loader.load_schema(schema_path)
|
||||
|
||||
return schema_data['schema']
|
||||
|
||||
else:
|
||||
raise ValueError(f"Unsupported schema format: {schema_path.suffix}")
|
||||
Reference in New Issue
Block a user