Files
markitect-main/markitect/validators/link_validator.py
tegwick 20c0cfece7 feat: add LinkValidator for semantic link validation (Phase 3)
Implement comprehensive link validation as part of semantic validation:

Core Features:
- Link classification: internal, external, fragment, email
- Internal link validation: fragment anchors and file paths
- External link validation: HTTP/HTTPS with configurable timeout
- Email validation: mailto: link format checking
- Fragment policy enforcement: allow/disallow fragment identifiers

Link Validator:
- markitect/validators/link_validator.py - Full link validation implementation
- Supports x-markitect-content-control.link_validation configuration
- Default: check internal links, skip external (fast)
- Opt-in external checking with --check-links flag

Integration:
- Updated SemanticValidator to include link_result in reports
- CLI already supports --check-links flag (line 1629 in cli.py)
- Link validation runs by default for internal links (fast)
- External link checking requires explicit --check-links flag

Test Coverage:
- Added 9 comprehensive tests for LinkValidator
- Tests cover: classification, broken links, fragments, email, statistics
- All 25 semantic validator tests passing (100%)

Documentation:
- Updated SCHEMA_MANAGEMENT_GUIDE.md with link validation section
- Added examples for broken links and external link checking
- Documented link types, validation rules, and configuration

Statistics Tracking:
- Links checked, internal/external/fragment/email counts
- Detailed error/warning reporting with line numbers
- Integration with existing semantic validation reporting

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-06 03:41:03 +01:00

492 lines
16 KiB
Python

"""
Link Validator for markdown documents.
Validates links according to x-markitect-content-control.link_validation:
- Internal links: Links to other sections or documents
- External links: HTTP/HTTPS URLs (optional, can be slow)
- Fragment identifiers: #section-name anchors
- Email links: mailto: links
"""
from dataclasses import dataclass
from typing import List, Dict, Any, Optional
from pathlib import Path
import re
import urllib.parse
import urllib.request
from urllib.error import URLError, HTTPError
@dataclass
class LinkIssue:
    """A single finding reported for one link.

    Serves as the common base for the more specific issue types; every
    issue carries the offending link, a severity label, a human-readable
    message and, optionally, where the link appeared and what kind it was.
    """

    link: str
    severity: str  # one of 'ERROR', 'WARNING', 'INFO'
    message: str
    line_number: Optional[int] = None
    link_type: Optional[str] = None  # 'internal', 'external', 'fragment', 'email'

    def __str__(self) -> str:
        """Render as ``[SEVERITY] (line N) [type] link: message``.

        The line and type segments are omitted when the corresponding
        attribute is falsy.
        """
        pieces = [f"[{self.severity}]"]
        if self.line_number:
            pieces.append(f" (line {self.line_number})")
        if self.link_type:
            pieces.append(f" [{self.link_type}]")
        pieces.append(f" {self.link}: {self.message}")
        return "".join(pieces)
@dataclass
class BrokenInternalLink(LinkIssue):
    """Issue for an internal link whose target could not be found.

    ``target_section`` holds the fragment or path that was looked up and
    defaults to an empty string.
    """

    target_section: str = ""
@dataclass
class BrokenExternalLink(LinkIssue):
    """Issue for an external link that could not be reached.

    ``status_code`` is the HTTP status when one was received and ``None``
    otherwise.
    """

    status_code: Optional[int] = None
@dataclass
class FragmentNotAllowed(LinkIssue):
    """Issue for a fragment identifier used where fragments are disallowed.

    Adds no fields of its own; the subclass exists so callers can tell this
    kind of issue apart by type.
    """
@dataclass
class InvalidEmail(LinkIssue):
    """Issue for a ``mailto:`` link whose address is not well-formed.

    Adds no fields of its own; the subclass exists so callers can tell this
    kind of issue apart by type.
    """
@dataclass
class LinkValidationResult:
    """Outcome of one link-validation run.

    Holds every issue that was reported together with counters for how many
    links were examined overall and per link type.
    """

    issues: List[LinkIssue]
    links_checked: int
    internal_links: int = 0
    external_links: int = 0
    fragment_links: int = 0
    email_links: int = 0

    def has_errors(self) -> bool:
        """Return True when at least one issue has severity 'ERROR'."""
        for entry in self.issues:
            if entry.severity == 'ERROR':
                return True
        return False

    def has_warnings(self) -> bool:
        """Return True when at least one issue has severity 'WARNING'."""
        for entry in self.issues:
            if entry.severity == 'WARNING':
                return True
        return False

    def is_valid(self) -> bool:
        """Return True when validation passed, i.e. no errors were reported."""
        if self.has_errors():
            return False
        return True

    def get_errors(self) -> List[LinkIssue]:
        """Return the issues with severity 'ERROR', in reported order."""
        return list(filter(lambda entry: entry.severity == 'ERROR', self.issues))

    def get_warnings(self) -> List[LinkIssue]:
        """Return the issues with severity 'WARNING', in reported order."""
        return list(filter(lambda entry: entry.severity == 'WARNING', self.issues))
class LinkValidator:
    """
    Validates links according to x-markitect-content-control.link_validation.

    Configuration options from schema:
    - check_internal: Validate internal links (default: True)
    - check_external: Validate external links (default: False, can be slow)
    - allow_fragments: Allow fragment identifiers (default: True)
    - check_email: Validate email addresses (default: False)
    - timeout: Timeout for external link checks in seconds (default: 5)
    """

    # Inline markdown links: [text](target)
    _INLINE_LINK_RE = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
    # Reference-style link definitions: [label]: target
    _REFERENCE_DEF_RE = re.compile(r'^\[([^\]]+)\]:\s*(.+)$')
    # A link target followed by an optional title: target "title"
    _LINK_TITLE_RE = re.compile(r'''^(.+?)\s+(?:"[^"]*"|'[^']*'|\([^)]*\))$''')
    # Opening/closing line of a fenced code block (``` or ~~~)
    _FENCE_RE = re.compile(r'^ {0,3}(`{3,}|~{3,})(.*)$')
    # ATX heading line: "## Title" with an optional closing "##"
    _ATX_HEADING_RE = re.compile(r'^ {0,3}#{1,6}[ \t]+(.*?)(?:[ \t]+#+)?$')
    # Basic email address shape
    _EMAIL_RE = re.compile(r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$')
    # HTTP statuses meaning "HEAD is not supported here", not "link is broken"
    _HEAD_UNSUPPORTED = (405, 501)

    def __init__(self, schema: Dict[str, Any]):
        """
        Initialize validator with a schema.

        Args:
            schema: JSON schema with x-markitect-content-control.link_validation extension
        """
        self.schema = schema
        # "or {}" tolerates keys that are present but set to null.
        content_control = schema.get('x-markitect-content-control') or {}
        self.link_config = content_control.get('link_validation') or {}

        # Default configuration
        self.check_internal = self.link_config.get('check_internal', True)
        self.check_external = self.link_config.get('check_external', False)
        self.allow_fragments = self.link_config.get('allow_fragments', True)
        self.check_email = self.link_config.get('check_email', False)
        self.timeout = self.link_config.get('timeout', 5)

    def check(self, document: 'MarkdownDocument',
              check_external: Optional[bool] = None) -> LinkValidationResult:
        """
        Validate links in the document.

        Args:
            document: Parsed markdown document
            check_external: Override the schema setting for external link
                checking for this call only; the validator's own
                configuration is left untouched.

        Returns:
            LinkValidationResult with any issues found
        """
        # Resolve the override into a local so it cannot leak into later
        # calls made on the same validator instance.
        external_enabled = (self.check_external if check_external is None
                            else check_external)

        # Skip validation if no link checking is enabled
        if not any([self.check_internal, external_enabled,
                    not self.allow_fragments, self.check_email]):
            return LinkValidationResult(issues=[], links_checked=0)

        issues: List[LinkIssue] = []
        stats = {'internal': 0, 'external': 0, 'fragment': 0, 'email': 0}

        links = self._extract_links(document) or []
        for link_info in links:
            link_url = link_info['url']
            line_number = link_info.get('line_number')

            link_type = self._classify_link(link_url)
            stats[link_type] += 1

            if link_type == 'internal' and self.check_internal:
                issues.extend(
                    self._check_internal_link(document, link_url, line_number))
            elif link_type == 'external' and external_enabled:
                issues.extend(
                    self._check_external_link(link_url, line_number))
            elif link_type == 'fragment':
                if not self.allow_fragments:
                    issues.append(FragmentNotAllowed(
                        link=link_url,
                        severity='WARNING',
                        message='Fragment identifiers are not allowed',
                        line_number=line_number,
                        link_type='fragment'
                    ))
                # Fragment targets are only validated when fragments are
                # allowed and internal checking is enabled.
                elif self.check_internal:
                    issues.extend(
                        self._check_internal_link(document, link_url, line_number))
            elif link_type == 'email' and self.check_email:
                issues.extend(self._check_email_link(link_url, line_number))

        return LinkValidationResult(
            issues=issues,
            links_checked=len(links),
            internal_links=stats['internal'],
            external_links=stats['external'],
            fragment_links=stats['fragment'],
            email_links=stats['email']
        )

    @staticmethod
    def _raw_content(document: 'MarkdownDocument') -> str:
        """Return the document's raw text from ``content`` or ``raw_content``.

        Returns an empty string when neither attribute holds a string.
        """
        for attribute in ('content', 'raw_content'):
            value = getattr(document, attribute, None)
            if isinstance(value, str):
                return value
        return ''

    @classmethod
    def _clean_link_target(cls, raw: str) -> str:
        """Reduce a raw link target to its destination.

        Drops an optional trailing title (``url "title"``) and the angle
        brackets of a ``<url>`` destination.
        """
        target = raw.strip()
        titled = cls._LINK_TITLE_RE.match(target)
        if titled:
            target = titled.group(1).strip()
        if len(target) >= 2 and target.startswith('<') and target.endswith('>'):
            target = target[1:-1].strip()
        return target

    def _extract_links(self, document: 'MarkdownDocument') -> List[Dict[str, Any]]:
        """
        Extract all links from markdown document.

        Uses the document's own ``extract_links()`` when it has one. Otherwise
        the raw text is scanned line by line for inline links and
        reference-style definitions; fenced code blocks and footnote
        definitions (``[^id]: ...``) are ignored and link titles are removed
        from the extracted URL.

        Args:
            document: Parsed markdown document

        Returns:
            List of dicts with 'url' and optional 'line_number'
        """
        # Try to use document's link extraction if available
        if hasattr(document, 'extract_links'):
            return document.extract_links()

        # Fallback: Extract from raw content
        content = self._raw_content(document)
        if not content:
            return []

        links: List[Dict[str, Any]] = []
        open_fence: Optional[str] = None  # marker of the code fence we are inside
        for line_number, line in enumerate(content.split('\n'), start=1):
            fence = self._FENCE_RE.match(line)
            if fence:
                marker, rest = fence.groups()
                if open_fence is None:
                    # A backtick run followed by more backticks is inline
                    # code, not a fence opener.
                    if not (marker[0] == '`' and '`' in rest):
                        open_fence = marker
                        continue
                elif (marker[0] == open_fence[0]
                        and len(marker) >= len(open_fence)
                        and not rest.strip()):
                    open_fence = None
                    continue
            if open_fence is not None:
                # Text inside code blocks is sample content, not real links.
                continue

            # Find inline links
            for match in self._INLINE_LINK_RE.finditer(line):
                links.append({
                    'url': self._clean_link_target(match.group(2)),
                    'line_number': line_number
                })

            # Find reference-style link definitions (but not footnotes)
            ref_match = self._REFERENCE_DEF_RE.match(line.strip())
            if ref_match and not ref_match.group(1).startswith('^'):
                links.append({
                    'url': self._clean_link_target(ref_match.group(2)),
                    'line_number': line_number
                })
        return links

    def _classify_link(self, url: str) -> str:
        """
        Classify link type.

        URL schemes are matched case-insensitively.

        Args:
            url: Link URL

        Returns:
            'internal', 'external', 'fragment', or 'email'
        """
        url = url.strip()
        lowered = url.lower()

        # Email links
        if lowered.startswith('mailto:'):
            return 'email'
        # Fragment-only links (#section)
        if url.startswith('#'):
            return 'fragment'
        # External links (http/https, or protocol-relative)
        if lowered.startswith(('http://', 'https://', '//')):
            return 'external'
        # Everything else is considered internal
        # (relative paths, absolute paths, etc.)
        return 'internal'

    def _check_internal_link(self, document: 'MarkdownDocument',
                             url: str, line_number: Optional[int]) -> List[LinkIssue]:
        """
        Check internal link validity.

        A fragment is checked against the current document's headings only
        when the link points at the current document (no path, or a path that
        resolves to the document's own file). The path is checked for
        existence relative to the document's directory when the document
        exposes a ``file_path``.

        Args:
            document: The document being validated
            url: Internal link URL
            line_number: Line number where link appears

        Returns:
            List of issues found
        """
        issues: List[LinkIssue] = []

        parsed = urllib.parse.urlparse(url)
        # URIs such as tel:, ftp: or data: are not files on disk. A
        # one-letter "scheme" is a Windows drive letter and is still checked.
        if len(parsed.scheme) > 1 and parsed.scheme != 'file':
            return issues

        raw_path = parsed.path
        raw_fragment = parsed.fragment
        # Links are percent-encoded ("my%20file.md"); the file system is not.
        path = urllib.parse.unquote(raw_path)
        fragment = urllib.parse.unquote(raw_fragment)

        file_path = getattr(document, 'file_path', None)
        document_path = Path(file_path) if file_path else None

        target_path = None
        if path and document_path is not None:
            target_path = (document_path.parent / path).resolve()

        # Fragments of links into *other* files cannot be verified against
        # this document's headings.
        same_document = not path
        if not same_document and target_path is not None:
            same_document = target_path == document_path.resolve()

        if fragment and same_document:
            if not self._check_fragment_exists(document, fragment):
                issues.append(BrokenInternalLink(
                    link=url,
                    severity='ERROR',
                    message=f'Internal link target not found: #{raw_fragment}',
                    line_number=line_number,
                    link_type='internal',
                    target_section=raw_fragment
                ))

        if target_path is not None:
            # Also accept the undecoded spelling in case the file name
            # literally contains percent sequences.
            raw_target = (document_path.parent / raw_path).resolve()
            if not (target_path.exists() or raw_target.exists()):
                issues.append(BrokenInternalLink(
                    link=url,
                    severity='ERROR',
                    message=f'Internal link file not found: {raw_path}',
                    line_number=line_number,
                    link_type='internal',
                    target_section=raw_path
                ))
        return issues

    def _heading_texts(self, document: 'MarkdownDocument') -> List[str]:
        """Collect the text of every heading in the document.

        Uses ``get_headings_by_level`` (levels 1-6) when the document has it;
        otherwise falls back to scanning the raw content for ATX headings.
        The fallback does not skip code blocks, so it may find extra
        headings but never misses a real one.
        """
        texts: List[str] = []
        if hasattr(document, 'get_headings_by_level'):
            for level in range(1, 7):
                for heading in document.get_headings_by_level(level) or []:
                    if isinstance(heading, dict):
                        texts.append(str(heading.get('content', '')))
                    else:
                        texts.append(str(heading))
            return texts

        for line in self._raw_content(document).split('\n'):
            heading_match = self._ATX_HEADING_RE.match(line.rstrip())
            if heading_match:
                texts.append(heading_match.group(1))
        return texts

    def _check_fragment_exists(self, document: 'MarkdownDocument',
                               fragment: str) -> bool:
        """
        Check if a fragment identifier exists in the document.

        The comparison is case-insensitive and is made against the fragment
        IDs derived from the document's headings.

        Args:
            document: The document to search
            fragment: Fragment identifier (without #)

        Returns:
            True if fragment exists, False otherwise (including when no
            headings can be obtained from the document)
        """
        wanted = fragment.lower()
        return any(self._heading_to_fragment_id(text) == wanted
                   for text in self._heading_texts(document))

    def _heading_to_fragment_id(self, heading: str) -> str:
        """
        Convert heading text to fragment ID.

        Args:
            heading: Heading text

        Returns:
            Fragment ID (lowercase, hyphens for spaces)
        """
        fragment = heading.lower()
        # Remove special characters except spaces and hyphens
        fragment = re.sub(r'[^\w\s-]', '', fragment)
        # Replace whitespace runs with hyphens
        fragment = re.sub(r'\s+', '-', fragment)
        # Collapse consecutive hyphens
        fragment = re.sub(r'-+', '-', fragment)
        # Strip leading/trailing hyphens
        return fragment.strip('-')

    def _request_status(self, url: str, method: str) -> int:
        """Send one HTTP request and return the response status code.

        Raises whatever ``urllib.request.urlopen`` raises (``HTTPError`` for
        4xx/5xx responses, ``URLError`` for connection problems, ...).
        """
        request = urllib.request.Request(url, method=method)
        request.add_header('User-Agent', 'MarkiTect-LinkValidator/1.0')
        with urllib.request.urlopen(request, timeout=self.timeout) as response:
            return response.status

    def _check_external_link(self, url: str,
                             line_number: Optional[int]) -> List[LinkIssue]:
        """
        Check external link validity.

        Sends an HTTP HEAD request and falls back to GET when the server
        answers that HEAD is not supported (405/501). All failures are
        reported as WARNING issues; nothing is raised.

        Args:
            url: External link URL
            line_number: Line number where link appears

        Returns:
            List of issues found
        """
        issues: List[LinkIssue] = []

        # Normalize protocol-relative URLs
        if url.startswith('//'):
            url = 'https:' + url

        try:
            try:
                # HEAD avoids transferring the body
                status = self._request_status(url, 'HEAD')
            except HTTPError as head_error:
                if head_error.code not in self._HEAD_UNSUPPORTED:
                    raise
                # The server refuses HEAD; the link itself may be fine.
                status = self._request_status(url, 'GET')
        except HTTPError as e:
            issues.append(BrokenExternalLink(
                link=url,
                severity='WARNING',
                message=f'External link returned HTTP {e.code}',
                line_number=line_number,
                link_type='external',
                status_code=e.code
            ))
        except URLError as e:
            issues.append(BrokenExternalLink(
                link=url,
                severity='WARNING',
                message=f'External link unreachable: {e.reason}',
                line_number=line_number,
                link_type='external'
            ))
        except Exception as e:
            # Deliberately broad: a malformed URL or a socket-level failure
            # must become a warning, not abort validation of the document.
            issues.append(BrokenExternalLink(
                link=url,
                severity='WARNING',
                message=f'External link check failed: {str(e)}',
                line_number=line_number,
                link_type='external'
            ))
        else:
            # 2xx and 3xx are considered valid
            if status >= 400:
                issues.append(BrokenExternalLink(
                    link=url,
                    severity='WARNING',
                    message=f'External link returned status {status}',
                    line_number=line_number,
                    link_type='external',
                    status_code=status
                ))
        return issues

    def _check_email_link(self, url: str,
                          line_number: Optional[int]) -> List[LinkIssue]:
        """
        Check email link validity.

        Header fields after ``?`` (such as ``?subject=...``) are ignored and
        each comma-separated recipient is checked against a basic address
        pattern.

        Args:
            url: Email link (mailto:...)
            line_number: Line number where link appears

        Returns:
            List of issues found
        """
        issues: List[LinkIssue] = []

        target = url.strip()
        if target.lower().startswith('mailto:'):
            target = target[len('mailto:'):]

        # Only the part before "?" names recipients.
        recipients = urllib.parse.unquote(target.split('?', 1)[0]).strip()
        addresses = [address.strip() for address in recipients.split(',')]

        if not all(self._EMAIL_RE.match(address) for address in addresses):
            issues.append(InvalidEmail(
                link=url,
                severity='WARNING',
                message='Invalid email address format',
                line_number=line_number,
                link_type='email'
            ))
        return issues