Files
markitect-main/markitect/semantic_validator.py
tegwick 20c0cfece7 feat: add LinkValidator for semantic link validation (Phase 3)
Implement comprehensive link validation as part of semantic validation:

Core Features:
- Link classification: internal, external, fragment, email
- Internal link validation: fragment anchors and file paths
- External link validation: HTTP/HTTPS with configurable timeout
- Email validation: mailto: link format checking
- Fragment policy enforcement: allow/disallow fragment identifiers

Link Validator:
- markitect/validators/link_validator.py - Full link validation implementation
- Supports x-markitect-content-control.link_validation configuration
- Default: check internal links, skip external (fast)
- Opt-in external checking with --check-links flag

Integration:
- Updated SemanticValidator to include link_result in reports
- CLI already supports --check-links flag (line 1629 in cli.py)
- Link validation runs by default for internal links (fast)
- External link checking requires explicit --check-links flag

Test Coverage:
- Added 9 comprehensive tests for LinkValidator
- Tests cover: classification, broken links, fragments, email, statistics
- All 25 semantic validator tests passing (100%)

Documentation:
- Updated SCHEMA_MANAGEMENT_GUIDE.md with link validation section
- Added examples for broken links and external link checking
- Documented link types, validation rules, and configuration

Statistics Tracking:
- Links checked, internal/external/fragment/email counts
- Detailed error/warning reporting with line numbers
- Integration with existing semantic validation reporting

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-06 03:41:03 +01:00

279 lines
9.2 KiB
Python

"""
Semantic Validator for markdown documents.
Validates markdown documents against x-markitect schema extensions:
- x-markitect-sections: Section classifications (required, recommended, etc.)
- x-markitect-content-control: Content patterns and quality metrics
- Link validation: Internal and external link checking
Complements the existing SchemaValidator which handles structural AST validation.
"""
from dataclasses import dataclass
from typing import List, Dict, Any, Optional
from pathlib import Path
import json
from markitect.validators.section_validator import (
SectionValidator,
SectionValidationResult
)
from markitect.validators.content_validator import (
ContentValidator,
ContentValidationResult
)
from markitect.validators.link_validator import (
LinkValidator,
LinkValidationResult
)
@dataclass
class SemanticValidationReport:
    """
    Report of semantic validation results.

    Combines the results of the section, content, and link validators and
    offers aggregate queries (errors, warnings, validity) plus a plain-text
    rendering of everything that was found.
    """

    # Result of section validation; always present.
    section_result: SectionValidationResult
    # Result of content validation, or None when it was not produced.
    content_result: Optional[ContentValidationResult] = None
    # Result of link validation, or None when it was not produced.
    link_result: Optional[LinkValidationResult] = None

    def _optional_results(self) -> List[Any]:
        """Return the content and link results that are not None, in that order."""
        return [
            result
            for result in (self.content_result, self.link_result)
            if result is not None
        ]

    @staticmethod
    def _issue_lines(issues: List[Any], label_attr: str) -> List[str]:
        """Render one text line per issue, labelled by the attribute *label_attr*."""
        rendered = []
        for issue in issues:
            # Errors get a visible marker too, so they stand out as much as warnings.
            marker = "❌" if issue.severity == 'ERROR' else "⚠️"
            label = getattr(issue, label_attr)
            rendered.append(f" {marker} {label} - {issue.message}")
        return rendered

    def has_errors(self) -> bool:
        """Check if there are any ERROR-level issues."""
        if self.section_result.has_errors():
            return True
        # The hasattr guard tolerates result objects that lack the query method.
        return any(
            result.has_errors()
            for result in self._optional_results()
            if hasattr(result, 'has_errors')
        )

    def has_warnings(self) -> bool:
        """Check if there are any WARNING-level issues."""
        if self.section_result.has_warnings():
            return True
        return any(
            result.has_warnings()
            for result in self._optional_results()
            if hasattr(result, 'has_warnings')
        )

    def is_valid(self) -> bool:
        """Check if validation passed (no errors)."""
        return not self.has_errors()

    def get_all_issues(self) -> List[Any]:
        """Get all issues from all validators as a new list (section issues first)."""
        issues = list(self.section_result.issues)
        for result in self._optional_results():
            if hasattr(result, 'issues'):
                issues.extend(result.issues)
        return issues

    def format_text(self) -> str:
        """
        Format validation report as text.

        Returns:
            A newline-joined report with one block per validator that produced
            a result, followed by a summary with section counts, error and
            warning totals, and the overall PASSED/FAILED status.
        """
        lines = ["Section Validation:"]
        if self.section_result.issues:
            lines.extend(
                self._issue_lines(self.section_result.issues, 'section_name')
            )
        else:
            lines.append(" ✅ All section requirements met")

        if self.content_result is not None:
            lines.append("")
            lines.append("Content Validation:")
            if self.content_result.issues:
                lines.extend(
                    self._issue_lines(self.content_result.issues, 'section_name')
                )
            else:
                lines.append(" ✅ All content requirements met")

        if self.link_result is not None:
            lines.append("")
            lines.append("Link Validation:")
            if self.link_result.issues:
                # Link issues are labelled by the link itself, not a section name.
                lines.extend(self._issue_lines(self.link_result.issues, 'link'))
            else:
                lines.append(
                    f" ✅ All {self.link_result.links_checked} links valid"
                )

        lines.append("")
        lines.append("Summary:")
        lines.append(f" Sections checked: {self.section_result.sections_checked}")
        lines.append(f" Sections found: {self.section_result.sections_found}")

        # Only the totals are needed, so count per result instead of extending
        # the lists handed back by the validators (which would mutate them).
        results = [self.section_result, *self._optional_results()]
        error_count = sum(len(result.get_errors()) for result in results)
        warning_count = sum(len(result.get_warnings()) for result in results)
        lines.append(f" Errors: {error_count}")
        lines.append(f" Warnings: {warning_count}")

        if self.is_valid():
            lines.append(" Status: PASSED ✅")
        else:
            lines.append(" Status: FAILED ❌")
        return "\n".join(lines)
class SemanticValidator:
    """
    Semantic checker for markdown documents driven by x-markitect extensions.

    Bundles a section validator, a content validator, and a link validator,
    all built from the same schema, and runs them against a parsed document.
    Structural AST validation is the job of the separate SchemaValidator.

    Example:
        >>> schema = load_schema_from_path('manpage-schema-v1.0.md')
        >>> validator = SemanticValidator(schema)
        >>> report = validator.validate('my-command.1.md')
        >>> if not report.is_valid():
        ...     print(report.format_text())
    """

    def __init__(self, schema: Dict[str, Any]):
        """
        Build the three sub-validators from one schema.

        Args:
            schema: Schema dict carrying x-markitect-* extensions (for
                example x-markitect-sections and/or
                x-markitect-content-control). It is stored on the instance
                and handed unchanged to every sub-validator.
        """
        self.schema = schema
        self.section_validator = SectionValidator(schema)
        self.content_validator = ContentValidator(schema)
        self.link_validator = LinkValidator(schema)

    def validate(self, document_path: str | Path,
                 check_links: bool = False) -> SemanticValidationReport:
        """
        Run section, content, and link validation on one document.

        Args:
            document_path: Path to the markdown document to validate.
            check_links: When true, external links are checked as well (may
                be slow). Link validation itself always runs; with the
                default only the non-external checks are requested.

        Returns:
            SemanticValidationReport holding all three validator results.

        Raises:
            FileNotFoundError: If document_path doesn't exist.
        """
        source = Path(document_path)
        if not source.exists():
            raise FileNotFoundError(f"Document not found: {source}")

        parsed = self._parse_document(source)

        # Keyword arguments are evaluated left to right, so the validators
        # still run in section -> content -> link order.
        return SemanticValidationReport(
            section_result=self.section_validator.check(parsed),
            content_result=self.content_validator.check(parsed),
            link_result=self.link_validator.check(
                parsed, check_external=bool(check_links)
            ),
        )

    def _parse_document(self, document_path: Path) -> 'MarkdownDocument':
        """
        Turn a markdown file into the document object the validators consume.

        Args:
            document_path: Path to markdown file.

        Returns:
            Whatever DocumentManager.ingest_file yields for that path.
        """
        # Deferred import: importing at module level would be circular.
        from markitect.document_manager import DocumentManager

        return DocumentManager().ingest_file(document_path)
def load_schema_from_path(schema_path: str | Path) -> Dict[str, Any]:
"""
Load a schema from file (supports .json and .md formats).
Args:
schema_path: Path to schema file
Returns:
Schema dict with embedded JSON
Raises:
FileNotFoundError: If schema file doesn't exist
ValueError: If schema cannot be parsed
"""
schema_path = Path(schema_path)
if not schema_path.exists():
raise FileNotFoundError(f"Schema not found: {schema_path}")
if schema_path.suffix == '.json':
# Load JSON schema directly
with open(schema_path, 'r', encoding='utf-8') as f:
return json.load(f)
elif schema_path.suffix == '.md':
# Load markdown schema with embedded JSON
from markitect.schema_loader import MarkdownSchemaLoader
loader = MarkdownSchemaLoader()
schema_data = loader.load_schema(schema_path)
return schema_data['schema']
else:
raise ValueError(f"Unsupported schema format: {schema_path.suffix}")