Files
markitect-main/markitect/semantic_validator.py
tegwick a969c5de47 feat: add semantic document validator for x-markitect extensions
Implements semantic validation to complement existing structural validation:

Phase 1 & 2 Complete:
- SemanticValidator: Main validator orchestrating sub-validators
- SectionValidator: Enforces section classifications (required, recommended,
  optional, discouraged, improper) from x-markitect-sections
- ContentValidator: Validates content patterns, forbidden patterns, and
  quality metrics (word counts, sentence counts) from x-markitect-content-control

Features:
- Pattern matching with regex for required/forbidden/discouraged patterns
- Word count and sentence count validation
- Detailed error reporting with severity levels (ERROR, WARNING)
- Support for section alternatives (e.g., FLAGS vs OPTIONS)
- Comprehensive test coverage (16 tests, 100% passing)

Architecture:
- Complements existing SchemaValidator (structural AST validation)
- Clean separation: validators/ package for modular validators
- Semantic validation focuses on x-markitect-* extensions
- LinkValidator planned for Phase 3 (optional --check-links)

Next: Phase 4 - CLI integration to enhance 'markitect validate' command

Workplan: roadmap/20260106-semantic-document-validation/WORKPLAN.md

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-06 03:24:32 +01:00

262 lines
8.4 KiB
Python

"""
Semantic Validator for markdown documents.
Validates markdown documents against x-markitect schema extensions:
- x-markitect-sections: Section classifications (required, recommended, etc.)
- x-markitect-content-control: Content patterns and quality metrics
- Link validation: Internal and external link checking (planned; not yet implemented)
Complements the existing SchemaValidator which handles structural AST validation.
"""
from dataclasses import dataclass
from typing import List, Dict, Any, Optional
from pathlib import Path
import json
from markitect.validators.section_validator import (
SectionValidator,
SectionValidationResult
)
from markitect.validators.content_validator import (
ContentValidator,
ContentValidationResult
)
@dataclass
class SemanticValidationReport:
    """
    Report of semantic validation results.

    Bundles the result objects produced by the section validator and, when
    present, the content and link validators, and offers helpers to query
    and render them together.

    Attributes:
        section_result: Result of section validation (always present).
        content_result: Result of content validation, or None if absent.
        link_result: Result of link validation, or None. Its concrete type
            is not defined yet, so it is only probed via hasattr().
    """

    section_result: SectionValidationResult
    content_result: Optional[ContentValidationResult] = None
    link_result: Optional[Any] = None  # LinkValidationResult when implemented

    def _optional_results(self) -> List[Any]:
        """Return the optional sub-results that were actually supplied.

        Presence is tested with ``is not None`` so that a result object
        which happens to be falsy is still taken into account.
        """
        return [
            result
            for result in (self.content_result, self.link_result)
            if result is not None
        ]

    def has_errors(self) -> bool:
        """Check if there are any ERROR-level issues."""
        if self.section_result.has_errors():
            return True
        return any(
            result.has_errors()
            for result in self._optional_results()
            if hasattr(result, 'has_errors')
        )

    def has_warnings(self) -> bool:
        """Check if there are any WARNING-level issues."""
        if self.section_result.has_warnings():
            return True
        return any(
            result.has_warnings()
            for result in self._optional_results()
            if hasattr(result, 'has_warnings')
        )

    def is_valid(self) -> bool:
        """Check if validation passed (no errors)."""
        return not self.has_errors()

    def get_all_issues(self) -> List[Any]:
        """Get all issues from all validators.

        Returns:
            A new list: section issues first, then content issues, then
            link issues. The validators' own lists are not modified.
        """
        issues = list(self.section_result.issues)
        for result in self._optional_results():
            if hasattr(result, 'issues'):
                issues.extend(result.issues)
        return issues

    @staticmethod
    def _render_issues(issues: List[Any], all_clear: str) -> List[str]:
        """Render one line per issue, or the all-clear line if there are none."""
        if not issues:
            return [all_clear]
        rendered = []
        for issue in issues:
            # Errors get an explicit marker so they stand out like warnings do.
            marker = "❌" if issue.severity == 'ERROR' else "⚠️"
            rendered.append(f" {marker} {issue.section_name} - {issue.message}")
        return rendered

    def format_text(self) -> str:
        """Format validation report as text.

        The report has a section-validation part, a content-validation part
        (only when a content result is present) and a summary with counts
        and the overall status. Link results are not rendered here.

        Returns:
            The report as a single newline-joined string.
        """
        lines = ["Section Validation:"]
        lines.extend(self._render_issues(
            self.section_result.issues,
            " ✅ All section requirements met",
        ))

        if self.content_result is not None:
            lines.append("")
            lines.append("Content Validation:")
            lines.extend(self._render_issues(
                self.content_result.issues,
                " ✅ All content requirements met",
            ))

        # Copy before extending: the lists returned by the section result
        # may be owned by it and must not grow as a side effect of rendering.
        all_errors = list(self.section_result.get_errors())
        all_warnings = list(self.section_result.get_warnings())
        if self.content_result is not None:
            all_errors.extend(self.content_result.get_errors())
            all_warnings.extend(self.content_result.get_warnings())

        lines.append("")
        lines.append("Summary:")
        lines.append(f" Sections checked: {self.section_result.sections_checked}")
        lines.append(f" Sections found: {self.section_result.sections_found}")
        lines.append(f" Errors: {len(all_errors)}")
        lines.append(f" Warnings: {len(all_warnings)}")
        if self.is_valid():
            lines.append(" Status: PASSED ✅")
        else:
            lines.append(" Status: FAILED ❌")
        return "\n".join(lines)
class SemanticValidator:
    """
    Semantic checker for markdown documents driven by x-markitect extensions.

    Structural AST validation is left to the existing SchemaValidator; this
    class covers the semantic rules expressed through the x-markitect-*
    schema extensions by delegating to a SectionValidator and a
    ContentValidator built from the same schema.

    Example:
        >>> schema = load_schema('manpage-schema-v1.0.md')
        >>> validator = SemanticValidator(schema)
        >>> report = validator.validate('my-command.1.md')
        >>> if not report.is_valid():
        ...     print(report.format_text())
    """

    def __init__(self, schema: Dict[str, Any]):
        """
        Set up the validator and its sub-validators for one schema.

        Args:
            schema: JSON schema with x-markitect-* extensions. It may come
                from a JSON file or from markdown with embedded JSON, and
                is expected to contain x-markitect-sections and/or
                x-markitect-content-control.
        """
        self.schema = schema
        # Both sub-validators receive the full schema dict.
        self.section_validator = SectionValidator(schema)
        self.content_validator = ContentValidator(schema)
        # TODO: create a LinkValidator here once link validation exists.

    def validate(self, document_path: str | Path,
                 check_links: bool = False) -> SemanticValidationReport:
        """
        Run section and content validation on one markdown document.

        Args:
            document_path: Path to markdown document to validate.
            check_links: Reserved for link validation; currently ignored
                because link validation is not implemented yet.

        Returns:
            SemanticValidationReport holding the section and content
            results; its link_result is always None for now.

        Raises:
            FileNotFoundError: If document_path doesn't exist.

        NOTE(review): parser failures presumably surface as ValueError, as
        the earlier documentation stated - confirm against DocumentManager.
        """
        source = Path(document_path)
        if not source.exists():
            raise FileNotFoundError(f"Document not found: {source}")

        parsed = self._parse_document(source)

        # Section check runs before content check (keyword arguments are
        # evaluated left to right).
        # TODO: run link validation when check_links is set, once available.
        return SemanticValidationReport(
            section_result=self.section_validator.check(parsed),
            content_result=self.content_validator.check(parsed),
            link_result=None,
        )

    def _parse_document(self, document_path: Path) -> 'MarkdownDocument':
        """
        Turn a markdown file into the document object the validators consume.

        Args:
            document_path: Path to markdown file.

        Returns:
            Whatever DocumentManager.ingest_file() yields for that path.
        """
        # Imported lazily to avoid a circular dependency.
        from markitect.document_manager import DocumentManager

        return DocumentManager().ingest_file(document_path)
def load_schema_from_path(schema_path: str | Path) -> Dict[str, Any]:
"""
Load a schema from file (supports .json and .md formats).
Args:
schema_path: Path to schema file
Returns:
Schema dict with embedded JSON
Raises:
FileNotFoundError: If schema file doesn't exist
ValueError: If schema cannot be parsed
"""
schema_path = Path(schema_path)
if not schema_path.exists():
raise FileNotFoundError(f"Schema not found: {schema_path}")
if schema_path.suffix == '.json':
# Load JSON schema directly
with open(schema_path, 'r', encoding='utf-8') as f:
return json.load(f)
elif schema_path.suffix == '.md':
# Load markdown schema with embedded JSON
from markitect.schema_loader import MarkdownSchemaLoader
loader = MarkdownSchemaLoader()
schema_data = loader.load_schema(schema_path)
return schema_data['schema']
else:
raise ValueError(f"Unsupported schema format: {schema_path.suffix}")