Implement comprehensive link validation as part of semantic validation:

Core Features:
- Link classification: internal, external, fragment, email
- Internal link validation: fragment anchors and file paths
- External link validation: HTTP/HTTPS with configurable timeout
- Email validation: mailto: link format checking
- Fragment policy enforcement: allow/disallow fragment identifiers

Link Validator:
- markitect/validators/link_validator.py
- Full link validation implementation
- Supports x-markitect-content-control.link_validation configuration
- Default: check internal links, skip external (fast)
- Opt-in external checking with --check-links flag

Integration:
- Updated SemanticValidator to include link_result in reports
- CLI already supports --check-links flag (line 1629 in cli.py)
- Link validation runs by default for internal links (fast)
- External link checking requires explicit --check-links flag

Test Coverage:
- Added 9 comprehensive tests for LinkValidator
- Tests cover: classification, broken links, fragments, email, statistics
- All 25 semantic validator tests passing (100%)

Documentation:
- Updated SCHEMA_MANAGEMENT_GUIDE.md with link validation section
- Added examples for broken links and external link checking
- Documented link types, validation rules, and configuration

Statistics Tracking:
- Links checked, internal/external/fragment/email counts
- Detailed error/warning reporting with line numbers
- Integration with existing semantic validation reporting

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
279 lines
9.2 KiB
Python
279 lines
9.2 KiB
Python
"""
|
|
Semantic Validator for markdown documents.
|
|
|
|
Validates markdown documents against x-markitect schema extensions:
|
|
- x-markitect-sections: Section classifications (required, recommended, etc.)
|
|
- x-markitect-content-control: Content patterns and quality metrics
|
|
- Link validation: Internal and external link checking
|
|
|
|
Complements the existing SchemaValidator which handles structural AST validation.
|
|
"""
|
|
|
|
from dataclasses import dataclass
|
|
from typing import List, Dict, Any, Optional
|
|
from pathlib import Path
|
|
import json
|
|
|
|
from markitect.validators.section_validator import (
|
|
SectionValidator,
|
|
SectionValidationResult
|
|
)
|
|
from markitect.validators.content_validator import (
|
|
ContentValidator,
|
|
ContentValidationResult
|
|
)
|
|
from markitect.validators.link_validator import (
|
|
LinkValidator,
|
|
LinkValidationResult
|
|
)
|
|
|
|
|
|
@dataclass
class SemanticValidationReport:
    """
    Report of semantic validation results.

    Combines results from section, content, and link validators. The section
    result is mandatory; the content and link results are optional and are
    skipped by every method when they are ``None``.

    Attributes:
        section_result: Result produced by the section validator.
        content_result: Result produced by the content validator, or None.
        link_result: Result produced by the link validator, or None.
    """
    section_result: SectionValidationResult
    content_result: Optional[ContentValidationResult] = None
    link_result: Optional[LinkValidationResult] = None

    def _optional_results(self) -> List[Any]:
        """Return the content and link results that are present, in that order."""
        return [
            result
            for result in (self.content_result, self.link_result)
            if result is not None
        ]

    @staticmethod
    def _issue_line(issue: Any, subject: Any) -> str:
        """Render one issue as a report line, prefixed by its severity marker."""
        # Anything that is not an ERROR is rendered with the warning marker.
        status = "❌" if issue.severity == 'ERROR' else "⚠️"
        return f" {status} {subject} - {issue.message}"

    def has_errors(self) -> bool:
        """Check if there are any ERROR-level issues."""
        if self.section_result.has_errors():
            return True
        # The hasattr guard tolerates result objects without has_errors().
        return any(
            result.has_errors()
            for result in self._optional_results()
            if hasattr(result, 'has_errors')
        )

    def has_warnings(self) -> bool:
        """Check if there are any WARNING-level issues."""
        if self.section_result.has_warnings():
            return True
        # The hasattr guard tolerates result objects without has_warnings().
        return any(
            result.has_warnings()
            for result in self._optional_results()
            if hasattr(result, 'has_warnings')
        )

    def is_valid(self) -> bool:
        """Check if validation passed (no errors)."""
        return not self.has_errors()

    def get_all_issues(self) -> List[Any]:
        """
        Get all issues from all validators.

        Returns:
            A new list holding the section issues first, then the content
            issues, then the link issues. The validators' own issue lists
            are not modified.
        """
        issues = list(self.section_result.issues)
        for result in self._optional_results():
            if hasattr(result, 'issues'):
                issues.extend(result.issues)
        return issues

    def format_text(self) -> str:
        """
        Format validation report as text.

        The report has one part per validator that produced a result,
        followed by a summary with section counts, total errors, total
        warnings, and a PASSED/FAILED status line.

        Returns:
            The report as a single newline-joined string.
        """
        lines = ["Section Validation:"]
        if self.section_result.issues:
            lines.extend(
                self._issue_line(issue, issue.section_name)
                for issue in self.section_result.issues
            )
        else:
            lines.append(" ✅ All section requirements met")

        if self.content_result is not None:
            lines.append("")
            lines.append("Content Validation:")
            if self.content_result.issues:
                lines.extend(
                    self._issue_line(issue, issue.section_name)
                    for issue in self.content_result.issues
                )
            else:
                lines.append(" ✅ All content requirements met")

        if self.link_result is not None:
            lines.append("")
            lines.append("Link Validation:")
            if self.link_result.issues:
                # Link issues are labelled with the link, not a section name.
                lines.extend(
                    self._issue_line(issue, issue.link)
                    for issue in self.link_result.issues
                )
            else:
                lines.append(f" ✅ All {self.link_result.links_checked} links valid")

        lines.append("")
        lines.append("Summary:")
        lines.append(f" Sections checked: {self.section_result.sections_checked}")
        lines.append(f" Sections found: {self.section_result.sections_found}")

        # Aggregate into copies: extending the lists handed back by the
        # section result could mutate state owned by that result and make
        # the totals grow on every call.
        all_errors = list(self.section_result.get_errors())
        all_warnings = list(self.section_result.get_warnings())
        for result in self._optional_results():
            all_errors.extend(result.get_errors())
            all_warnings.extend(result.get_warnings())

        lines.append(f" Errors: {len(all_errors)}")
        lines.append(f" Warnings: {len(all_warnings)}")

        if self.is_valid():
            lines.append(" Status: PASSED ✅")
        else:
            lines.append(" Status: FAILED ❌")

        return "\n".join(lines)
|
|
|
|
|
|
class SemanticValidator:
    """
    Semantic validation driven by a schema's x-markitect-* extensions.

    Works alongside the existing SchemaValidator, which handles structural
    AST validation. This class covers the semantic side by delegating to a
    section validator, a content validator, and a link validator that are
    all built from the same schema.

    Example:
        >>> schema = load_schema_from_path('manpage-schema-v1.0.md')
        >>> validator = SemanticValidator(schema)
        >>> report = validator.validate('my-command.1.md')
        >>> if not report.is_valid():
        ...     print(report.format_text())
    """

    def __init__(self, schema: Dict[str, Any]):
        """
        Keep the schema and create one sub-validator of each kind from it.

        Args:
            schema: JSON schema with x-markitect-* extensions. It may come
                from a JSON file or from markdown with embedded JSON, and
                is expected to contain x-markitect-sections and/or
                x-markitect-content-control.
        """
        self.schema = schema

        # Each sub-validator receives the full schema.
        self.section_validator = SectionValidator(schema)
        self.content_validator = ContentValidator(schema)
        self.link_validator = LinkValidator(schema)

    def validate(self, document_path: str | Path,
                 check_links: bool = False) -> SemanticValidationReport:
        """
        Run section, content, and link validation on one markdown document.

        Args:
            document_path: Path to markdown document to validate
            check_links: When true, the link validator is also asked to
                check external links. The link validator runs either way;
                this flag only controls its ``check_external`` argument.

        Returns:
            SemanticValidationReport holding the three validator results

        Raises:
            FileNotFoundError: If document_path doesn't exist
            Any exception raised while parsing the document is propagated
            unchanged (presumably ValueError for unparseable input; confirm
            against the document parser).
        """
        source = Path(document_path)
        if not source.exists():
            raise FileNotFoundError(f"Document not found: {source}")

        parsed = self._parse_document(source)

        # Keyword arguments are evaluated left to right, so the validators
        # run in section, content, link order.
        return SemanticValidationReport(
            section_result=self.section_validator.check(parsed),
            content_result=self.content_validator.check(parsed),
            link_result=self.link_validator.check(
                parsed, check_external=bool(check_links)
            ),
        )

    def _parse_document(self, document_path: Path) -> 'MarkdownDocument':
        """
        Parse a markdown file with markitect's DocumentManager.

        Args:
            document_path: Path to markdown file

        Returns:
            Whatever ``DocumentManager.ingest_file`` returns for the path
            (annotated as a MarkdownDocument).
        """
        # Imported lazily to avoid a circular dependency at module import.
        from markitect.document_manager import DocumentManager

        return DocumentManager().ingest_file(document_path)
|
|
|
|
|
|
def load_schema_from_path(schema_path: str | Path) -> Dict[str, Any]:
|
|
"""
|
|
Load a schema from file (supports .json and .md formats).
|
|
|
|
Args:
|
|
schema_path: Path to schema file
|
|
|
|
Returns:
|
|
Schema dict with embedded JSON
|
|
|
|
Raises:
|
|
FileNotFoundError: If schema file doesn't exist
|
|
ValueError: If schema cannot be parsed
|
|
"""
|
|
schema_path = Path(schema_path)
|
|
|
|
if not schema_path.exists():
|
|
raise FileNotFoundError(f"Schema not found: {schema_path}")
|
|
|
|
if schema_path.suffix == '.json':
|
|
# Load JSON schema directly
|
|
with open(schema_path, 'r', encoding='utf-8') as f:
|
|
return json.load(f)
|
|
|
|
elif schema_path.suffix == '.md':
|
|
# Load markdown schema with embedded JSON
|
|
from markitect.schema_loader import MarkdownSchemaLoader
|
|
|
|
loader = MarkdownSchemaLoader()
|
|
schema_data = loader.load_schema(schema_path)
|
|
|
|
return schema_data['schema']
|
|
|
|
else:
|
|
raise ValueError(f"Unsupported schema format: {schema_path.suffix}")
|