Files
markitect-main/tests/test_semantic_validator.py
tegwick 20c0cfece7 feat: add LinkValidator for semantic link validation (Phase 3)
Implement comprehensive link validation as part of semantic validation:

Core Features:
- Link classification: internal, external, fragment, email
- Internal link validation: fragment anchors and file paths
- External link validation: HTTP/HTTPS with configurable timeout
- Email validation: mailto: link format checking
- Fragment policy enforcement: allow/disallow fragment identifiers

Link Validator:
- markitect/validators/link_validator.py - Full link validation implementation
- Supports x-markitect-content-control.link_validation configuration
- Default: check internal links, skip external (fast)
- Opt-in external checking with --check-links flag

Integration:
- Updated SemanticValidator to include link_result in reports
- CLI already supports --check-links flag (line 1629 in cli.py)
- Link validation runs by default for internal links (fast)
- External link checking requires explicit --check-links flag

Test Coverage:
- Added 9 comprehensive tests for LinkValidator
- Tests cover: classification, broken links, fragments, email, statistics
- All 25 semantic validator tests passing (100%)

Documentation:
- Updated SCHEMA_MANAGEMENT_GUIDE.md with link validation section
- Added examples for broken links and external link checking
- Documented link types, validation rules, and configuration

Statistics Tracking:
- Links checked, internal/external/fragment/email counts
- Detailed error/warning reporting with line numbers
- Integration with existing semantic validation reporting

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-01-06 03:41:03 +01:00

762 lines
24 KiB
Python

"""
Tests for SemanticValidator.
Tests semantic validation of markdown documents against x-markitect extensions.
"""
import pytest
from pathlib import Path
import tempfile
import json
from markitect.semantic_validator import (
SemanticValidator,
SemanticValidationReport,
load_schema_from_path
)
from markitect.validators.section_validator import (
SectionValidator,
SectionMissing,
SectionImproper
)
from markitect.validators.content_validator import (
ContentValidator,
PatternMissing,
ForbiddenPattern,
DiscouragedPattern,
ContentTooShort,
ContentTooLong
)
from markitect.validators.link_validator import (
LinkValidator,
BrokenInternalLink,
BrokenExternalLink,
FragmentNotAllowed,
InvalidEmail
)
class TestSectionValidator:
    """Checks SectionValidator against ``x-markitect-sections`` rules."""

    def test_required_section_missing(self):
        """A required section absent from the headings is reported as one error."""
        rules = {
            'x-markitect-sections': {
                'SYNOPSIS': {
                    'classification': 'required',
                    'heading_level': 2,
                    'error_message': 'SYNOPSIS section is mandatory',
                },
            },
        }

        class StubDocument:
            # Headings deliberately leave out SYNOPSIS.
            def get_headings_by_level(self, level):
                return ['DESCRIPTION', 'EXAMPLES']

        outcome = SectionValidator(rules).check(StubDocument())

        assert not outcome.is_valid()
        assert outcome.has_errors()
        errors = outcome.get_errors()
        assert len(errors) == 1

        problem = errors[0]
        assert isinstance(problem, SectionMissing)
        assert problem.section_name == 'SYNOPSIS'
        assert problem.severity == 'ERROR'
        # The custom error_message from the schema must surface in the issue.
        assert 'mandatory' in problem.message

    def test_improper_section_present(self):
        """A section classified as improper is an error when it appears."""
        rules = {
            'x-markitect-sections': {
                'INTERNAL_NOTES': {
                    'classification': 'improper',
                    'heading_level': 2,
                    'error_message': 'Internal notes must not appear in published docs',
                },
            },
        }

        class StubDocument:
            # Exposes the forbidden heading, including where it was found.
            def get_headings_by_level(self, level):
                forbidden_heading = {
                    'content': 'INTERNAL_NOTES',
                    'level': 2,
                    'line_number': 25,
                }
                return [forbidden_heading]

        outcome = SectionValidator(rules).check(StubDocument())

        assert not outcome.is_valid()
        assert outcome.has_errors()
        errors = outcome.get_errors()
        assert len(errors) == 1

        problem = errors[0]
        assert isinstance(problem, SectionImproper)
        assert problem.section_name == 'INTERNAL_NOTES'
        assert problem.severity == 'ERROR'
        # The heading's line number is carried over into the reported issue.
        assert problem.line_number == 25

    def test_recommended_section_missing(self):
        """A missing recommended section produces a warning, not a failure."""
        rules = {
            'x-markitect-sections': {
                'EXAMPLES': {
                    'classification': 'recommended',
                    'heading_level': 2,
                    'warning_if_missing': 'Examples improve documentation quality',
                },
            },
        }

        class StubDocument:
            # Headings deliberately leave out EXAMPLES.
            def get_headings_by_level(self, level):
                return ['SYNOPSIS', 'DESCRIPTION']

        outcome = SectionValidator(rules).check(StubDocument())

        # Warnings alone must not invalidate the document.
        assert outcome.is_valid()
        assert not outcome.has_errors()
        assert outcome.has_warnings()
        warnings = outcome.get_warnings()
        assert len(warnings) == 1

        notice = warnings[0]
        assert notice.section_name == 'EXAMPLES'
        assert notice.severity == 'WARNING'

    def test_all_required_sections_present(self):
        """No issues are raised when every required section is found."""
        rules = {
            'x-markitect-sections': {
                'SYNOPSIS': {
                    'classification': 'required',
                    'heading_level': 2,
                },
                'DESCRIPTION': {
                    'classification': 'required',
                    'heading_level': 2,
                },
            },
        }

        class StubDocument:
            # Both required sections plus one section the schema does not mention.
            def get_headings_by_level(self, level):
                return [
                    {'content': 'SYNOPSIS', 'level': 2},
                    {'content': 'DESCRIPTION', 'level': 2},
                    {'content': 'EXAMPLES', 'level': 2},
                ]

        outcome = SectionValidator(rules).check(StubDocument())

        assert outcome.is_valid()
        assert not outcome.has_errors()
        assert not outcome.has_warnings()
        assert len(outcome.issues) == 0

    def test_section_alternatives(self):
        """A heading listed under ``alternatives`` satisfies a required section."""
        rules = {
            'x-markitect-sections': {
                'OPTIONS': {
                    'classification': 'required',
                    'heading_level': 2,
                    'alternatives': ['FLAGS', 'COMMAND OPTIONS'],
                },
            },
        }

        class StubDocument:
            # Uses the alternative name FLAGS instead of OPTIONS.
            def get_headings_by_level(self, level):
                return [{'content': 'FLAGS', 'level': 2}]

        outcome = SectionValidator(rules).check(StubDocument())

        assert outcome.is_valid()
        assert not outcome.has_errors()
class TestSemanticValidator:
    """Covers SemanticValidator setup, report rendering and schema loading."""

    def test_validator_initialization(self):
        """The validator keeps the schema it was given and builds a section validator."""
        given_schema = {
            '$schema': 'http://json-schema.org/draft-07/schema#',
            'x-markitect-sections': {
                'SYNOPSIS': {'classification': 'required', 'heading_level': 2},
            },
        }

        semantic = SemanticValidator(given_schema)

        assert semantic.schema == given_schema
        assert semantic.section_validator is not None

    def test_validation_report_formatting(self):
        """A report holding one section error is invalid and renders as FAILED."""
        from markitect.validators.section_validator import (
            SectionValidationResult,
            SectionMissing
        )

        missing_synopsis = SectionMissing(
            section_name='SYNOPSIS',
            severity='ERROR',
            message='SYNOPSIS is required',
            classification='required',
        )
        sections = SectionValidationResult(
            issues=[missing_synopsis],
            sections_checked=2,
            sections_found=1,
        )

        report = SemanticValidationReport(section_result=sections)

        # Report-level status mirrors the single section error.
        assert report.has_errors()
        assert not report.is_valid()

        # The plain-text rendering must mention each of these fragments.
        rendered = report.format_text()
        for fragment in ('Section Validation:', 'SYNOPSIS', 'Errors: 1', 'FAILED'):
            assert fragment in rendered

    def test_load_json_schema(self, tmp_path):
        """A schema written as JSON is loaded back unchanged."""
        expected = {
            '$schema': 'http://json-schema.org/draft-07/schema#',
            'title': 'Test Schema',
            'x-markitect-sections': {
                'SYNOPSIS': {'classification': 'required', 'heading_level': 2},
            },
        }
        target = tmp_path / "test-schema.json"
        target.write_text(json.dumps(expected, indent=2))

        loaded = load_schema_from_path(target)

        assert loaded == expected
        assert 'x-markitect-sections' in loaded

    def test_schema_not_found(self):
        """Loading from a path that does not exist raises FileNotFoundError."""
        with pytest.raises(FileNotFoundError):
            load_schema_from_path('/nonexistent/schema.json')

    def test_unsupported_schema_format(self, tmp_path):
        """An ``.xml`` schema file is rejected with a ValueError."""
        target = tmp_path / "schema.xml"
        target.write_text('<schema></schema>')

        with pytest.raises(ValueError, match="Unsupported schema format"):
            load_schema_from_path(target)
class TestContentValidator:
    """Checks ContentValidator against ``x-markitect-content-control`` rules."""

    def test_required_pattern_missing(self):
        """Content lacking a required pattern is reported as one error."""
        rules = {
            'x-markitect-content-control': {
                'synopsis': {
                    'required_patterns': [
                        r'\*\*[a-z][a-z0-9-]*\*\*'  # Bold command name
                    ],
                },
            },
        }

        class StubDocument:
            # Only SYNOPSIS exists, and it has no bold command name.
            def get_section(self, name):
                if name != 'SYNOPSIS':
                    return None
                return {
                    'name': 'SYNOPSIS',
                    'content': 'command [options] arguments',  # No bold
                }

        outcome = ContentValidator(rules).check(StubDocument())

        assert not outcome.is_valid()
        assert outcome.has_errors()
        errors = outcome.get_errors()
        assert len(errors) == 1

        problem = errors[0]
        assert isinstance(problem, PatternMissing)
        assert problem.section_name == 'SYNOPSIS'
        assert problem.severity == 'ERROR'

    def test_forbidden_pattern_found(self):
        """A forbidden pattern occurring in the content is reported as one error."""
        rules = {
            'x-markitect-content-control': {
                'description': {
                    'forbidden_patterns': [
                        r'\bTODO\b',
                        r'\bFIXME\b',
                    ],
                },
            },
        }

        class StubDocument:
            # DESCRIPTION contains a TODO marker, but no FIXME.
            def get_section(self, name):
                if name != 'DESCRIPTION':
                    return None
                return {
                    'name': 'DESCRIPTION',
                    'content': 'This is a description. TODO: Add more details.',
                }

        outcome = ContentValidator(rules).check(StubDocument())

        assert not outcome.is_valid()
        assert outcome.has_errors()
        errors = outcome.get_errors()
        assert len(errors) == 1

        problem = errors[0]
        assert isinstance(problem, ForbiddenPattern)
        assert problem.section_name == 'DESCRIPTION'
        assert 'TODO' in problem.matched_text

    def test_discouraged_pattern_warning(self):
        """A discouraged pattern yields a warning and leaves the result valid."""
        rules = {
            'x-markitect-content-control': {
                'description': {
                    'discouraged_patterns': [
                        r'\bWIP\b',
                    ],
                },
            },
        }

        class StubDocument:
            # DESCRIPTION contains the discouraged WIP marker.
            def get_section(self, name):
                if name != 'DESCRIPTION':
                    return None
                return {
                    'name': 'DESCRIPTION',
                    'content': 'This is WIP content.',
                }

        outcome = ContentValidator(rules).check(StubDocument())

        # Warnings alone must not invalidate the document.
        assert outcome.is_valid()
        assert not outcome.has_errors()
        assert outcome.has_warnings()

        notice = outcome.get_warnings()[0]
        assert isinstance(notice, DiscouragedPattern)
        assert notice.severity == 'WARNING'

    def test_content_too_short(self):
        """Content below ``min_words`` yields a ContentTooShort warning."""
        rules = {
            'x-markitect-content-control': {
                'description': {
                    'content_quality': {
                        'min_words': 50,
                        'max_words': 1000,
                    },
                },
            },
        }

        class StubDocument:
            # DESCRIPTION is far below the 50-word minimum.
            def get_section(self, name):
                if name != 'DESCRIPTION':
                    return None
                return {
                    'name': 'DESCRIPTION',
                    'content': 'Short description.',  # Only 2 words
                }

        outcome = ContentValidator(rules).check(StubDocument())

        assert outcome.is_valid()  # Warnings don't fail
        assert outcome.has_warnings()

        notice = outcome.get_warnings()[0]
        assert isinstance(notice, ContentTooShort)
        assert notice.actual == 2
        assert notice.required == 50

    def test_content_too_long(self):
        """Content above ``max_words`` yields a ContentTooLong warning."""
        rules = {
            'x-markitect-content-control': {
                'synopsis': {
                    'content_quality': {
                        'min_words': 5,
                        'max_words': 20,
                    },
                },
            },
        }

        class StubDocument:
            # SYNOPSIS exceeds the 20-word maximum.
            def get_section(self, name):
                if name != 'SYNOPSIS':
                    return None
                return {
                    'name': 'SYNOPSIS',
                    'content': ' '.join(['word'] * 50),  # 50 words
                }

        outcome = ContentValidator(rules).check(StubDocument())

        assert outcome.is_valid()
        assert outcome.has_warnings()

        notice = outcome.get_warnings()[0]
        assert isinstance(notice, ContentTooLong)
        assert notice.actual == 50
        assert notice.limit == 20

    def test_all_content_requirements_met(self):
        """No issues are raised when pattern and word-count rules are satisfied."""
        rules = {
            'x-markitect-content-control': {
                'synopsis': {
                    'required_patterns': [
                        r'\*\*[a-z]+\*\*',
                    ],
                    'content_quality': {
                        'min_words': 5,
                        'max_words': 50,
                    },
                },
            },
        }

        class StubDocument:
            # SYNOPSIS has a bold command name and a word count within bounds.
            def get_section(self, name):
                if name != 'SYNOPSIS':
                    return None
                return {
                    'name': 'SYNOPSIS',
                    'content': '**command** [options] arguments and more words here',
                }

        outcome = ContentValidator(rules).check(StubDocument())

        assert outcome.is_valid()
        assert not outcome.has_errors()
        assert not outcome.has_warnings()
        assert len(outcome.issues) == 0
class TestLinkValidator:
    """Checks LinkValidator classification, checking policies and statistics."""

    def test_link_classification(self):
        """Each URL form maps to its expected link category."""
        linker = LinkValidator({'x-markitect-content-control': {}})

        expected_kinds = [
            ('http://example.com', 'external'),
            ('https://example.com', 'external'),
            ('//example.com', 'external'),
            ('mailto:test@example.com', 'email'),
            ('#section-name', 'fragment'),
            ('../other-doc.md', 'internal'),
            ('/absolute/path.md', 'internal'),
        ]
        for url, kind in expected_kinds:
            assert linker._classify_link(url) == kind

    def test_broken_internal_link_fragment(self):
        """A fragment link with no matching heading is reported as one error."""
        rules = {
            'x-markitect-content-control': {
                'link_validation': {
                    'check_internal': True,
                },
            },
        }

        class StubDocument:
            # Headings exist at level 2 only; every other level is empty.
            def get_headings_by_level(self, level):
                if level != 2:
                    return []
                return [
                    {'content': 'Introduction', 'level': 2},
                    {'content': 'Getting Started', 'level': 2},
                ]

            # One fragment resolves to a heading, the other does not.
            def extract_links(self):
                return [
                    {'url': '#introduction', 'line_number': 10},
                    {'url': '#nonexistent-section', 'line_number': 15},
                ]

        outcome = LinkValidator(rules).check(StubDocument())

        assert not outcome.is_valid()
        assert outcome.has_errors()
        errors = outcome.get_errors()
        assert len(errors) == 1

        problem = errors[0]
        assert isinstance(problem, BrokenInternalLink)
        assert 'nonexistent-section' in problem.link
        assert problem.line_number == 15

    def test_fragment_not_allowed(self):
        """With ``allow_fragments`` off, a fragment link produces a warning."""
        rules = {
            'x-markitect-content-control': {
                'link_validation': {
                    'allow_fragments': False,
                },
            },
        }

        class StubDocument:
            # A single fragment link.
            def extract_links(self):
                return [{'url': '#section', 'line_number': 5}]

        outcome = LinkValidator(rules).check(StubDocument())

        assert outcome.is_valid()  # Warnings don't fail
        assert outcome.has_warnings()

        notice = outcome.get_warnings()[0]
        assert isinstance(notice, FragmentNotAllowed)

    def test_invalid_email(self):
        """A malformed ``mailto:`` target produces exactly one warning."""
        rules = {
            'x-markitect-content-control': {
                'link_validation': {
                    'check_email': True,
                },
            },
        }

        class StubDocument:
            # One well-formed address and one that is not an address at all.
            def extract_links(self):
                return [
                    {'url': 'mailto:valid@example.com', 'line_number': 5},
                    {'url': 'mailto:invalid-email', 'line_number': 10},
                ]

        outcome = LinkValidator(rules).check(StubDocument())

        assert outcome.is_valid()  # Email validation uses warnings
        assert outcome.has_warnings()
        warnings = outcome.get_warnings()
        assert len(warnings) == 1

        notice = warnings[0]
        assert isinstance(notice, InvalidEmail)
        assert 'invalid-email' in notice.link

    def test_link_extraction_from_content(self):
        """Inline and reference-style links are pulled out of raw markdown."""
        linker = LinkValidator({'x-markitect-content-control': {}})

        class StubDocument:
            # Raw markdown only; this stub has no extract_links() method.
            content = """# Test Document
This is a [link](http://example.com) in text.
Another [internal link](../docs/other.md).
Reference style [link][ref].
[ref]: https://example.org
"""

        found = linker._extract_links(StubDocument())

        assert len(found) == 3
        urls = [entry['url'] for entry in found]
        for wanted in ('http://example.com', '../docs/other.md', 'https://example.org'):
            assert wanted in urls

    def test_heading_to_fragment_conversion(self):
        """Heading text is converted to the expected fragment identifier."""
        linker = LinkValidator({'x-markitect-content-control': {}})

        expected_ids = [
            ('Getting Started', 'getting-started'),
            ('API Reference', 'api-reference'),
            ('FAQ (Frequently Asked)', 'faq-frequently-asked'),
            (' Spaces Around ', 'spaces-around'),
        ]
        for heading, fragment_id in expected_ids:
            assert linker._heading_to_fragment_id(heading) == fragment_id

    def test_no_link_validation_when_disabled(self):
        """With every check switched off, no links are checked or reported."""
        rules = {
            'x-markitect-content-control': {
                'link_validation': {
                    'check_internal': False,
                    'check_external': False,
                    'allow_fragments': True,
                    'check_email': False,
                },
            },
        }

        class StubDocument:
            # Two links, neither of which should be examined.
            def extract_links(self):
                return [
                    {'url': '#broken-fragment'},
                    {'url': 'http://broken-link.invalid'},
                ]

        outcome = LinkValidator(rules).check(StubDocument())

        assert outcome.is_valid()
        assert len(outcome.issues) == 0
        assert outcome.links_checked == 0

    def test_external_link_validation_opt_in(self):
        """External links are left alone unless checking is explicitly requested."""
        rules = {
            'x-markitect-content-control': {
                'link_validation': {
                    'check_external': False,  # Disabled by default
                },
            },
        }
        linker = LinkValidator(rules)

        class StubDocument:
            # A single external URL.
            def extract_links(self):
                return [{'url': 'http://definitely-broken-12345.invalid'}]

        stub = StubDocument()

        # Without the override the external URL produces no issues.
        untouched = linker.check(stub)
        assert untouched.is_valid()
        assert len(untouched.issues) == 0

        # With the override the validator presumably attempts a real lookup
        # (may fail or time out), so the outcome depends on the network and is
        # intentionally not asserted; the call only has to complete.
        # NOTE(review): consider isolating this from the network.
        linker.check(stub, check_external=True)

    def test_link_validation_statistics(self):
        """Per-category link counters are populated on the result."""
        rules = {
            'x-markitect-content-control': {
                'link_validation': {
                    'check_internal': True,
                },
            },
        }

        class StubDocument:
            # No headings at any level.
            def get_headings_by_level(self, level):
                return []

            # One link of each category.
            def extract_links(self):
                return [
                    {'url': '#fragment'},
                    {'url': 'http://example.com'},
                    {'url': '../internal.md'},
                    {'url': 'mailto:test@example.com'},
                ]

        outcome = LinkValidator(rules).check(StubDocument())

        expected_counts = {
            'links_checked': 4,
            'fragment_links': 1,
            'external_links': 1,
            'internal_links': 1,
            'email_links': 1,
        }
        for counter, count in expected_counts.items():
            assert getattr(outcome, counter) == count