feat: add LinkValidator for semantic link validation (Phase 3)

Implement comprehensive link validation as part of semantic validation:

Core Features:
- Link classification: internal, external, fragment, email
- Internal link validation: fragment anchors and file paths
- External link validation: HTTP/HTTPS with configurable timeout
- Email validation: format checking for mailto: links
- Fragment policy enforcement: allow/disallow fragment identifiers

Link Validator:
- markitect/validators/link_validator.py - Full link validation implementation
- Supports x-markitect-content-control.link_validation configuration
- Default: check internal links, skip external (fast)
- Opt-in external checking with --check-links flag

Integration:
- Updated SemanticValidator to include link_result in reports
- CLI already supports --check-links flag (line 1629 in cli.py)
- Link validation runs by default for internal links (fast)
- External link checking requires explicit --check-links flag

Test Coverage:
- Added 9 comprehensive tests for LinkValidator
- Tests cover: classification, broken links, fragments, email, statistics
- All 25 semantic validator tests passing (100%)

Documentation:
- Updated SCHEMA_MANAGEMENT_GUIDE.md with link validation section
- Added examples for broken links and external link checking
- Documented link types, validation rules, and configuration

Statistics Tracking:
- Links checked, internal/external/fragment/email counts
- Detailed error/warning reporting with line numbers
- Integration with existing semantic validation reporting

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-06 03:41:03 +01:00
parent 0d78837a53
commit 20c0cfece7
5 changed files with 829 additions and 10 deletions

View File

@@ -27,6 +27,13 @@ from markitect.validators.content_validator import (
ContentTooShort,
ContentTooLong
)
from markitect.validators.link_validator import (
LinkValidator,
BrokenInternalLink,
BrokenExternalLink,
FragmentNotAllowed,
InvalidEmail
)
class TestSectionValidator:
@@ -504,3 +511,251 @@ class TestContentValidator:
assert not result.has_errors()
assert not result.has_warnings()
assert len(result.issues) == 0
class TestLinkValidator:
    """Tests for LinkValidator: classification, extraction, checks and counters."""

    @staticmethod
    def _schema(link_validation=None):
        """Return a fresh schema dict for LinkValidator.

        When ``link_validation`` is given it is placed under the
        content-control section; otherwise that section is left empty.
        """
        control = {}
        if link_validation is not None:
            control['link_validation'] = link_validation
        return {'x-markitect-content-control': control}

    def test_link_classification(self):
        """Each URL form is mapped to the expected link category."""
        link_validator = LinkValidator(self._schema())

        expectations = (
            ('http://example.com', 'external'),
            ('https://example.com', 'external'),
            ('//example.com', 'external'),
            ('mailto:test@example.com', 'email'),
            ('#section-name', 'fragment'),
            ('../other-doc.md', 'internal'),
            ('/absolute/path.md', 'internal'),
        )
        for url, category in expectations:
            assert link_validator._classify_link(url) == category

    def test_broken_internal_link_fragment(self):
        """A fragment with no matching heading is reported as an error."""
        link_validator = LinkValidator(self._schema({'check_internal': True}))

        # Stand-in document exposing two level-2 headings and two fragment links.
        class MockDocument:
            def get_headings_by_level(self, level):
                if level != 2:
                    return []
                return [
                    {'content': 'Introduction', 'level': 2},
                    {'content': 'Getting Started', 'level': 2},
                ]

            def extract_links(self):
                return [
                    {'url': '#introduction', 'line_number': 10},
                    {'url': '#nonexistent-section', 'line_number': 15},
                ]

        report = link_validator.check(MockDocument())

        # Exactly one error is expected: the fragment on line 15.
        assert not report.is_valid()
        assert report.has_errors()
        assert len(report.get_errors()) == 1

        broken = report.get_errors()[0]
        assert isinstance(broken, BrokenInternalLink)
        assert 'nonexistent-section' in broken.link
        assert broken.line_number == 15

    def test_fragment_not_allowed(self):
        """A fragment link yields a warning when fragments are disallowed."""
        link_validator = LinkValidator(self._schema({'allow_fragments': False}))

        # Stand-in document with a single fragment link.
        class MockDocument:
            def extract_links(self):
                return [{'url': '#section', 'line_number': 5}]

        report = link_validator.check(MockDocument())

        # The report stays valid: this is asserted to be a warning, not an error.
        assert report.is_valid()
        assert report.has_warnings()

        first_warning = report.get_warnings()[0]
        assert isinstance(first_warning, FragmentNotAllowed)

    def test_invalid_email(self):
        """A malformed mailto: target produces a single InvalidEmail warning."""
        link_validator = LinkValidator(self._schema({'check_email': True}))

        # Stand-in document with one well-formed and one malformed address.
        class MockDocument:
            def extract_links(self):
                return [
                    {'url': 'mailto:valid@example.com', 'line_number': 5},
                    {'url': 'mailto:invalid-email', 'line_number': 10},
                ]

        report = link_validator.check(MockDocument())

        # The report stays valid: the bad address is asserted to be a warning.
        assert report.is_valid()
        assert report.has_warnings()
        assert len(report.get_warnings()) == 1

        first_warning = report.get_warnings()[0]
        assert isinstance(first_warning, InvalidEmail)
        assert 'invalid-email' in first_warning.link

    def test_link_extraction_from_content(self):
        """Inline and reference-style links are pulled out of raw markdown."""
        link_validator = LinkValidator(self._schema())

        # Stand-in document that only exposes raw text via ``content``.
        # The literal is kept flush-left so its value is unchanged.
        class MockDocument:
            content = """# Test Document
This is a [link](http://example.com) in text.
Another [internal link](../docs/other.md).
Reference style [link][ref].
[ref]: https://example.org
"""

        extracted = link_validator._extract_links(MockDocument())

        # Two inline links plus one reference definition.
        assert len(extracted) == 3

        found_urls = [entry['url'] for entry in extracted]
        assert 'http://example.com' in found_urls
        assert '../docs/other.md' in found_urls
        assert 'https://example.org' in found_urls

    def test_heading_to_fragment_conversion(self):
        """Heading text is converted to the expected fragment identifier."""
        link_validator = LinkValidator(self._schema())

        conversions = (
            ('Getting Started', 'getting-started'),
            ('API Reference', 'api-reference'),
            ('FAQ (Frequently Asked)', 'faq-frequently-asked'),
            (' Spaces Around ', 'spaces-around'),
        )
        for heading, fragment_id in conversions:
            assert link_validator._heading_to_fragment_id(heading) == fragment_id

    def test_no_link_validation_when_disabled(self):
        """With every check switched off, no issues and no checked links."""
        all_disabled = {
            'check_internal': False,
            'check_external': False,
            'allow_fragments': True,
            'check_email': False,
        }
        link_validator = LinkValidator(self._schema(all_disabled))

        class MockDocument:
            def extract_links(self):
                return [
                    {'url': '#broken-fragment'},
                    {'url': 'http://broken-link.invalid'},
                ]

        report = link_validator.check(MockDocument())

        # Nothing is inspected, so nothing can be reported.
        assert report.is_valid()
        assert len(report.issues) == 0
        assert report.links_checked == 0

    def test_external_link_validation_opt_in(self):
        """External links are ignored unless check_external is requested."""
        # check_external is explicitly off in the schema.
        link_validator = LinkValidator(self._schema({'check_external': False}))

        class MockDocument:
            def extract_links(self):
                return [{'url': 'http://definitely-broken-12345.invalid'}]

        document = MockDocument()

        # No override: the external URL must not produce any issue.
        report = link_validator.check(document)
        assert report.is_valid()
        assert len(report.issues) == 0

        # With the override the external URL would be checked (may fail or
        # time out). The outcome depends on the network, so nothing is
        # asserted about it; the call only has to complete.
        link_validator.check(document, check_external=True)

    def test_link_validation_statistics(self):
        """Per-category counters on the result reflect the links seen."""
        link_validator = LinkValidator(self._schema({'check_internal': True}))

        # One link of each category; the document exposes no headings.
        class MockDocument:
            def get_headings_by_level(self, level):
                return []

            def extract_links(self):
                return [
                    {'url': '#fragment'},
                    {'url': 'http://example.com'},
                    {'url': '../internal.md'},
                    {'url': 'mailto:test@example.com'},
                ]

        report = link_validator.check(MockDocument())

        # Total first, then one per category.
        assert report.links_checked == 4
        assert report.fragment_links == 1
        assert report.external_links == 1
        assert report.internal_links == 1
        assert report.email_links == 1