Implements semantic validation to complement existing structural validation: Phase 1 & 2 Complete: - SemanticValidator: Main validator orchestrating sub-validators - SectionValidator: Enforces section classifications (required, recommended, optional, discouraged, improper) from x-markitect-sections - ContentValidator: Validates content patterns, forbidden patterns, and quality metrics (word counts, sentence counts) from x-markitect-content-control Features: - Pattern matching with regex for required/forbidden/discouraged patterns - Word count and sentence count validation - Detailed error reporting with severity levels (ERROR, WARNING) - Support for section alternatives (e.g., FLAGS vs OPTIONS) - Comprehensive test coverage (16 tests, 100% passing) Architecture: - Complements existing SchemaValidator (structural AST validation) - Clean separation: validators/ package for modular validators - Semantic validation focuses on x-markitect-* extensions - LinkValidator planned for Phase 3 (optional --check-links) Next: Phase 4 - CLI integration to enhance 'markitect validate' command Workplan: roadmap/20260106-semantic-document-validation/WORKPLAN.md 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
262 lines
8.4 KiB
Python
262 lines
8.4 KiB
Python
"""
|
|
Semantic Validator for markdown documents.
|
|
|
|
Validates markdown documents against x-markitect schema extensions:
|
|
- x-markitect-sections: Section classifications (required, recommended, etc.)
|
|
- x-markitect-content-control: Content patterns and quality metrics
|
|
- Link validation: Internal and external link checking (planned; not yet implemented)
|
|
|
|
Complements the existing SchemaValidator which handles structural AST validation.
|
|
"""
|
|
|
|
from dataclasses import dataclass
|
|
from typing import List, Dict, Any, Optional
|
|
from pathlib import Path
|
|
import json
|
|
|
|
from markitect.validators.section_validator import (
|
|
SectionValidator,
|
|
SectionValidationResult
|
|
)
|
|
from markitect.validators.content_validator import (
|
|
ContentValidator,
|
|
ContentValidationResult
|
|
)
|
|
|
|
|
|
@dataclass
class SemanticValidationReport:
    """
    Report of semantic validation results.

    Combines the result objects produced by the section, content, and link
    validators and offers aggregate queries over them.

    Attributes:
        section_result: Result of section validation (always present).
        content_result: Result of content validation, or None if it was
            not run.
        link_result: Result of link validation, or None. It is duck-typed:
            it only takes part in a query when it exposes the matching
            attribute (``has_errors``, ``has_warnings``, ``issues``).

    Note:
        ``format_text`` renders section and content results only; a
        ``link_result`` still influences ``has_errors``/``is_valid``.
    """
    section_result: SectionValidationResult
    content_result: Optional[ContentValidationResult] = None
    link_result: Optional[Any] = None  # LinkValidationResult when implemented

    def _optional_results(self) -> List[Any]:
        """Return the optional sub-results that are present, in report order."""
        # Compare against None explicitly so that a result object which
        # happens to be falsy (e.g. defines __len__) is not skipped.
        return [
            result
            for result in (self.content_result, self.link_result)
            if result is not None
        ]

    @staticmethod
    def _format_issues(issues: List[Any], all_clear: str) -> List[str]:
        """Render one line per issue, or the ``all_clear`` line if there are none."""
        if not issues:
            return [all_clear]

        rendered = []
        for issue in issues:
            status = "❌" if issue.severity == 'ERROR' else "⚠️"
            rendered.append(f" {status} {issue.section_name} - {issue.message}")
        return rendered

    def has_errors(self) -> bool:
        """Check if there are any ERROR-level issues in any sub-result."""
        if self.section_result.has_errors():
            return True

        return any(
            result.has_errors()
            for result in self._optional_results()
            if hasattr(result, 'has_errors')
        )

    def has_warnings(self) -> bool:
        """Check if there are any WARNING-level issues in any sub-result."""
        if self.section_result.has_warnings():
            return True

        return any(
            result.has_warnings()
            for result in self._optional_results()
            if hasattr(result, 'has_warnings')
        )

    def is_valid(self) -> bool:
        """Check if validation passed (no errors)."""
        return not self.has_errors()

    def get_all_issues(self) -> List[Any]:
        """
        Get all issues from all validators.

        Returns:
            A new list: section issues first, then content issues, then
            link issues. The sub-results' own lists are not modified.
        """
        issues = list(self.section_result.issues)

        for result in self._optional_results():
            if hasattr(result, 'issues'):
                issues.extend(result.issues)

        return issues

    def format_text(self) -> str:
        """
        Format validation report as text.

        Returns:
            A newline-joined string with a section block, a content block
            (only when a content result is present), and a summary with
            section counts, error/warning totals, and the overall status.
        """
        lines = ["Section Validation:"]
        lines.extend(
            self._format_issues(
                self.section_result.issues,
                " ✅ All section requirements met",
            )
        )

        if self.content_result is not None:
            lines.append("")
            lines.append("Content Validation:")
            lines.extend(
                self._format_issues(
                    self.content_result.issues,
                    " ✅ All content requirements met",
                )
            )

        # Copy before combining: extending the list handed back by the
        # section result would mutate it if the result returns stored state,
        # inflating the counts on every subsequent call.
        all_errors = list(self.section_result.get_errors())
        all_warnings = list(self.section_result.get_warnings())

        if self.content_result is not None:
            all_errors.extend(self.content_result.get_errors())
            all_warnings.extend(self.content_result.get_warnings())

        lines.append("")
        lines.append("Summary:")
        lines.append(f" Sections checked: {self.section_result.sections_checked}")
        lines.append(f" Sections found: {self.section_result.sections_found}")
        lines.append(f" Errors: {len(all_errors)}")
        lines.append(f" Warnings: {len(all_warnings)}")
        lines.append(" Status: PASSED ✅" if self.is_valid() else " Status: FAILED ❌")

        return "\n".join(lines)
|
|
|
|
|
|
class SemanticValidator:
    """
    Validates markdown documents against x-markitect extensions.

    Complements existing SchemaValidator which handles structural AST validation.
    This validator checks semantic aspects defined in x-markitect-* extensions
    by delegating to a SectionValidator and a ContentValidator, both built
    from the same schema.

    Example:
        >>> schema = load_schema_from_path('manpage-schema-v1.0.md')
        >>> validator = SemanticValidator(schema)
        >>> report = validator.validate('my-command.1.md')
        >>> if not report.is_valid():
        ...     print(report.format_text())
    """

    def __init__(self, schema: Dict[str, Any]):
        """
        Initialize semantic validator with a schema.

        Args:
            schema: JSON schema with x-markitect-* extensions

        The schema can be either:
        - A dict loaded from JSON
        - A dict loaded from markdown with embedded JSON
        - Must contain x-markitect-sections and/or x-markitect-content-control
        """
        self.schema = schema

        # Each sub-validator receives the full schema.
        self.section_validator = SectionValidator(schema)
        self.content_validator = ContentValidator(schema)

        # TODO: Initialize link validator when implemented
        # self.link_validator = LinkValidator(schema)

    def validate(self, document_path: str | Path,
                 check_links: bool = False) -> SemanticValidationReport:
        """
        Validate a markdown document against schema extensions.

        Args:
            document_path: Path to markdown document to validate
            check_links: Whether to validate links (may be slow).
                Currently accepted but ignored: link validation is not
                implemented, so the report's ``link_result`` is always None.

        Returns:
            SemanticValidationReport with the section and content results

        Raises:
            FileNotFoundError: If document_path doesn't exist
            ValueError: If document_path exists but is not a regular file
        """
        document_path = Path(document_path)

        if not document_path.exists():
            raise FileNotFoundError(f"Document not found: {document_path}")

        # A directory (or other non-file) passes exists(); reject it here
        # with a clear message instead of failing inside the parser.
        if not document_path.is_file():
            raise ValueError(f"Document is not a file: {document_path}")

        document = self._parse_document(document_path)

        section_result = self.section_validator.check(document)
        content_result = self.content_validator.check(document)

        # TODO: Run link validation when implemented
        # if check_links:
        #     link_result = self.link_validator.check(document)
        # else:
        #     link_result = None
        link_result = None

        return SemanticValidationReport(
            section_result=section_result,
            content_result=content_result,
            link_result=link_result
        )

    def _parse_document(self, document_path: Path) -> Any:
        """
        Parse markdown document using the existing markitect parser.

        Args:
            document_path: Path to markdown file

        Returns:
            Whatever ``DocumentManager.ingest_file`` returns for the path
            (presumably a parsed markdown document object; confirm against
            ``markitect.document_manager``).
        """
        # Import here to avoid circular dependency
        from markitect.document_manager import DocumentManager

        doc_manager = DocumentManager()
        return doc_manager.ingest_file(document_path)
|
|
|
|
|
|
def load_schema_from_path(schema_path: str | Path) -> Dict[str, Any]:
|
|
"""
|
|
Load a schema from file (supports .json and .md formats).
|
|
|
|
Args:
|
|
schema_path: Path to schema file
|
|
|
|
Returns:
|
|
Schema dict with embedded JSON
|
|
|
|
Raises:
|
|
FileNotFoundError: If schema file doesn't exist
|
|
ValueError: If schema cannot be parsed
|
|
"""
|
|
schema_path = Path(schema_path)
|
|
|
|
if not schema_path.exists():
|
|
raise FileNotFoundError(f"Schema not found: {schema_path}")
|
|
|
|
if schema_path.suffix == '.json':
|
|
# Load JSON schema directly
|
|
with open(schema_path, 'r', encoding='utf-8') as f:
|
|
return json.load(f)
|
|
|
|
elif schema_path.suffix == '.md':
|
|
# Load markdown schema with embedded JSON
|
|
from markitect.schema_loader import MarkdownSchemaLoader
|
|
|
|
loader = MarkdownSchemaLoader()
|
|
schema_data = loader.load_schema(schema_path)
|
|
|
|
return schema_data['schema']
|
|
|
|
else:
|
|
raise ValueError(f"Unsupported schema format: {schema_path.suffix}")
|