From 20c0cfece7e6063f96e24896e3328cb5142e908a Mon Sep 17 00:00:00 2001 From: tegwick Date: Tue, 6 Jan 2026 03:41:03 +0100 Subject: [PATCH] feat: add LinkValidator for semantic link validation (Phase 3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement comprehensive link validation as part of semantic validation: Core Features: - Link classification: internal, external, fragment, email - Internal link validation: fragment anchors and file paths - External link validation: HTTP/HTTPS with configurable timeout - Email validation: mailto: link format checking - Fragment policy enforcement: allow/disallow fragment identifiers Link Validator: - markitect/validators/link_validator.py - Full link validation implementation - Supports x-markitect-content-control.link_validation configuration - Default: check internal links, skip external (fast) - Opt-in external checking with --check-links flag Integration: - Updated SemanticValidator to include link_result in reports - CLI already supports --check-links flag (line 1629 in cli.py) - Link validation runs by default for internal links (fast) - External link checking requires explicit --check-links flag Test Coverage: - Added 9 comprehensive tests for LinkValidator - Tests cover: classification, broken links, fragments, email, statistics - All 25 semantic validator tests passing (100%) Documentation: - Updated SCHEMA_MANAGEMENT_GUIDE.md with link validation section - Added examples for broken links and external link checking - Documented link types, validation rules, and configuration Statistics Tracking: - Links checked, internal/external/fragment/email counts - Detailed error/warning reporting with line numbers - Integration with existing semantic validation reporting 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 --- docs/SCHEMA_MANAGEMENT_GUIDE.md | 38 ++ markitect/semantic_validator.py | 37 +- 
markitect/validators/__init__.py | 18 + markitect/validators/link_validator.py | 491 +++++++++++++++++++++++++ tests/test_semantic_validator.py | 255 +++++++++++++ 5 files changed, 829 insertions(+), 10 deletions(-) create mode 100644 markitect/validators/link_validator.py diff --git a/docs/SCHEMA_MANAGEMENT_GUIDE.md b/docs/SCHEMA_MANAGEMENT_GUIDE.md index b6a06821..2b370aa4 100644 --- a/docs/SCHEMA_MANAGEMENT_GUIDE.md +++ b/docs/SCHEMA_MANAGEMENT_GUIDE.md @@ -177,6 +177,9 @@ markitect validate my-document.md --schema manpage-schema-v1.0.md # Only structural validation (classic mode) markitect validate my-document.md --schema schema.json --no-semantic +# With external link checking (may be slow) +markitect validate my-document.md --schema manpage-schema-v1.0.md --check-links + # Strict mode (warnings become errors) markitect validate my-document.md --schema manpage-schema-v1.0.md --strict ``` @@ -202,6 +205,14 @@ markitect validate my-document.md --schema manpage-schema-v1.0.md --strict - **Quality Metrics**: Checks word counts, sentence counts - `min_words`, `max_words`: Word count requirements (WARNING) - `min_sentences`: Minimum sentence count (WARNING) +- **Link Validation**: Validates internal and external links (optional) + - Internal links: Checked by default when semantic validation enabled + - Fragment links (#section-name) verified to exist (ERROR if broken) + - Relative file paths checked for existence (ERROR if broken) + - External links: Opt-in with --check-links flag (may be slow) + - HTTP/HTTPS URLs validated with HEAD requests (WARNING if broken) + - Email validation: Validates mailto: link format (WARNING if invalid) + - Fragment policy: Configurable allow/disallow fragment identifiers ### Validation Output @@ -222,6 +233,9 @@ Section Validation: Content Validation: ✅ All content requirements met +Link Validation: + ✅ All 12 links valid + Summary: Sections checked: 3 Sections found: 5 @@ -271,6 +285,30 @@ $ markitect validate doc.md --schema 
manpage-schema-v1.0.md --strict Status: FAILED ❌ (warnings treated as errors) ``` +**Example 4: Broken Internal Link** +```bash +$ markitect validate doc.md --schema manpage-schema-v1.0.md + +Link Validation: + ❌ #nonexistent-section - Internal link target not found: #nonexistent-section + +Errors: 1 +Status: FAILED ❌ +``` + +**Example 5: External Link Validation** +```bash +# Enable external link checking (may be slow) +$ markitect validate doc.md --schema manpage-schema-v1.0.md --check-links + +Link Validation: + ✅ http://example.com - Valid + ⚠️ http://broken-link.invalid - External link unreachable: Name or service not known + +Warnings: 1 +Status: PASSED ✅ +``` + ## Schema Naming Conventions All schema filenames must follow this pattern: diff --git a/markitect/semantic_validator.py b/markitect/semantic_validator.py index 750324d0..277a5223 100644 --- a/markitect/semantic_validator.py +++ b/markitect/semantic_validator.py @@ -22,6 +22,10 @@ from markitect.validators.content_validator import ( ContentValidator, ContentValidationResult ) +from markitect.validators.link_validator import ( + LinkValidator, + LinkValidationResult +) @dataclass @@ -33,7 +37,7 @@ class SemanticValidationReport: """ section_result: SectionValidationResult content_result: Optional[ContentValidationResult] = None - link_result: Optional[Any] = None # LinkValidationResult when implemented + link_result: Optional[LinkValidationResult] = None def has_errors(self) -> bool: """Check if there are any ERROR-level issues.""" @@ -99,6 +103,17 @@ class SemanticValidationReport: else: lines.append(" ✅ All content requirements met") + # Link validation + if self.link_result: + lines.append("") + lines.append("Link Validation:") + if self.link_result.issues: + for issue in self.link_result.issues: + status = "❌" if issue.severity == 'ERROR' else "⚠️" + lines.append(f" {status} {issue.link} - {issue.message}") + else: + lines.append(f" ✅ All {self.link_result.links_checked} links valid") + # Summary 
lines.append("") lines.append("Summary:") @@ -112,6 +127,10 @@ class SemanticValidationReport: all_errors.extend(self.content_result.get_errors()) all_warnings.extend(self.content_result.get_warnings()) + if self.link_result: + all_errors.extend(self.link_result.get_errors()) + all_warnings.extend(self.link_result.get_warnings()) + lines.append(f" Errors: {len(all_errors)}") lines.append(f" Warnings: {len(all_warnings)}") @@ -155,9 +174,7 @@ class SemanticValidator: # Initialize sub-validators self.section_validator = SectionValidator(schema) self.content_validator = ContentValidator(schema) - - # TODO: Initialize link validator when implemented - # self.link_validator = LinkValidator(schema) + self.link_validator = LinkValidator(schema) def validate(self, document_path: str | Path, check_links: bool = False) -> SemanticValidationReport: @@ -189,12 +206,12 @@ class SemanticValidator: # Run content validation content_result = self.content_validator.check(document) - # TODO: Run link validation when implemented - # if check_links: - # link_result = self.link_validator.check(document) - # else: - # link_result = None - link_result = None + # Run link validation (if enabled) + if check_links: + link_result = self.link_validator.check(document, check_external=True) + else: + # Still check internal links by default (fast) + link_result = self.link_validator.check(document, check_external=False) return SemanticValidationReport( section_result=section_result, diff --git a/markitect/validators/__init__.py b/markitect/validators/__init__.py index feba2c6b..3e8077f4 100644 --- a/markitect/validators/__init__.py +++ b/markitect/validators/__init__.py @@ -30,6 +30,16 @@ from markitect.validators.content_validator import ( ContentTooLong, ) +from markitect.validators.link_validator import ( + LinkValidator, + LinkValidationResult, + LinkIssue, + BrokenInternalLink, + BrokenExternalLink, + FragmentNotAllowed, + InvalidEmail, +) + __all__ = [ # Section validator 
"""
Link Validator for markdown documents.

Validates links according to x-markitect-content-control.link_validation:
- Internal links: Links to other sections or documents
- External links: HTTP/HTTPS URLs (optional, can be slow)
- Fragment identifiers: #section-name anchors
- Email links: mailto: links
"""

from dataclasses import dataclass
from typing import List, Dict, Any, Optional
from pathlib import Path
import re
import urllib.parse
import urllib.request
from urllib.error import URLError, HTTPError


# URI scheme prefix. Two or more characters are required so that Windows
# drive letters ("C:/docs/x.md") are not mistaken for a scheme.
_SCHEME_RE = re.compile(r'^([a-zA-Z][a-zA-Z0-9+.-]+):')
# [text](url) inline links.
_INLINE_LINK_RE = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
# [ref]: url reference definitions.
_REFERENCE_DEF_RE = re.compile(r'^\[([^\]]+)\]:\s*(.+)$')
# Optional quoted title that may follow a link destination: url "Title".
_TITLE_SUFFIX_RE = re.compile(r'''^(.*?)\s+(?:"[^"]*"|'[^']*')\s*$''')
# Deliberately loose address check (format only, no deliverability).
_EMAIL_RE = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
# HTTP status codes with which a server signals "HEAD is not supported".
_HEAD_UNSUPPORTED = (405, 501)


def _clean_link_target(raw: str) -> str:
    """Return the bare link destination from the text captured by a link regex.

    Strips surrounding whitespace, ``<...>`` angle brackets and a trailing
    quoted title. Anything else is returned unchanged.
    """
    target = raw.strip()
    if target.startswith('<'):
        end = target.find('>')
        if end != -1:
            return target[1:end].strip()
    titled = _TITLE_SUFFIX_RE.match(target)
    if titled:
        return titled.group(1).strip()
    return target


@dataclass
class LinkIssue:
    """Base class for link validation issues."""
    link: str
    severity: str  # 'ERROR', 'WARNING', 'INFO'
    message: str
    line_number: Optional[int] = None
    link_type: Optional[str] = None  # 'internal', 'external', 'fragment', 'email'

    def __str__(self) -> str:
        # Compare against None explicitly so a line number of 0 is still shown.
        location = f" (line {self.line_number})" if self.line_number is not None else ""
        link_info = f" [{self.link_type}]" if self.link_type else ""
        return f"[{self.severity}]{location}{link_info} {self.link}: {self.message}"


@dataclass
class BrokenInternalLink(LinkIssue):
    """Internal link target not found."""
    target_section: str = ""


@dataclass
class BrokenExternalLink(LinkIssue):
    """External link is unreachable."""
    status_code: Optional[int] = None


@dataclass
class FragmentNotAllowed(LinkIssue):
    """Fragment identifier used when not allowed."""
    pass


@dataclass
class InvalidEmail(LinkIssue):
    """Invalid email address in mailto link."""
    pass


@dataclass
class LinkValidationResult:
    """Result of link validation."""
    issues: List[LinkIssue]
    links_checked: int
    internal_links: int = 0
    external_links: int = 0
    fragment_links: int = 0
    email_links: int = 0

    def has_errors(self) -> bool:
        """Check if there are any ERROR-level issues."""
        return any(issue.severity == 'ERROR' for issue in self.issues)

    def has_warnings(self) -> bool:
        """Check if there are any WARNING-level issues."""
        return any(issue.severity == 'WARNING' for issue in self.issues)

    def is_valid(self) -> bool:
        """Check if validation passed (no errors)."""
        return not self.has_errors()

    def get_errors(self) -> List[LinkIssue]:
        """Get all ERROR-level issues."""
        return [issue for issue in self.issues if issue.severity == 'ERROR']

    def get_warnings(self) -> List[LinkIssue]:
        """Get all WARNING-level issues."""
        return [issue for issue in self.issues if issue.severity == 'WARNING']


class LinkValidator:
    """
    Validates links according to x-markitect-content-control.link_validation.

    Configuration options from schema:
    - check_internal: Validate internal links (default: True)
    - check_external: Validate external links (default: False, can be slow)
    - allow_fragments: Allow fragment identifiers (default: True)
    - check_email: Validate email addresses (default: False)
    - timeout: Timeout for external link checks in seconds (default: 5)
    """

    def __init__(self, schema: Dict[str, Any]):
        """
        Initialize validator with a schema.

        Args:
            schema: JSON schema with x-markitect-content-control.link_validation extension
        """
        self.schema = schema
        content_control = schema.get('x-markitect-content-control', {})
        self.link_config = content_control.get('link_validation', {})

        # Default configuration
        self.check_internal = self.link_config.get('check_internal', True)
        self.check_external = self.link_config.get('check_external', False)
        self.allow_fragments = self.link_config.get('allow_fragments', True)
        self.check_email = self.link_config.get('check_email', False)
        self.timeout = self.link_config.get('timeout', 5)

    def check(self, document: 'MarkdownDocument',
              check_external: Optional[bool] = None) -> LinkValidationResult:
        """
        Validate links in the document.

        Args:
            document: Parsed markdown document
            check_external: Per-call override of the schema setting for
                external link checking; None keeps the schema setting

        Returns:
            LinkValidationResult with any issues found
        """
        # The override applies to this call only. Writing it back to
        # self.check_external would silently enable (slow) network checks
        # for every later call on the same validator.
        external_enabled = (
            self.check_external if check_external is None else check_external
        )

        # Skip validation if no link checking is enabled
        if not any([self.check_internal, external_enabled,
                    not self.allow_fragments, self.check_email]):
            return LinkValidationResult(issues=[], links_checked=0)

        issues: List[LinkIssue] = []
        stats = {'internal': 0, 'external': 0, 'fragment': 0, 'email': 0}

        links = self._extract_links(document)

        for link_info in links:
            link_url = link_info['url']
            line_number = link_info.get('line_number')

            link_type = self._classify_link(link_url)
            stats[link_type] += 1

            if link_type == 'internal' and self.check_internal:
                issues.extend(
                    self._check_internal_link(document, link_url, line_number)
                )

            elif link_type == 'external' and external_enabled:
                issues.extend(self._check_external_link(link_url, line_number))

            elif link_type == 'fragment':
                if not self.allow_fragments:
                    issues.append(FragmentNotAllowed(
                        link=link_url,
                        severity='WARNING',
                        message='Fragment identifiers are not allowed',
                        line_number=line_number,
                        link_type='fragment'
                    ))
                # Fragment targets are only resolved when internal checking is on
                elif self.check_internal:
                    issues.extend(
                        self._check_internal_link(document, link_url, line_number)
                    )

            elif link_type == 'email' and self.check_email:
                issues.extend(self._check_email_link(link_url, line_number))

        return LinkValidationResult(
            issues=issues,
            links_checked=len(links),
            internal_links=stats['internal'],
            external_links=stats['external'],
            fragment_links=stats['fragment'],
            email_links=stats['email']
        )

    def _extract_links(self, document: 'MarkdownDocument') -> List[Dict[str, Any]]:
        """
        Extract all links from markdown document.

        Uses document.extract_links() when the document provides it;
        otherwise scans document.content (or document.raw_content) line by
        line for inline links and reference definitions.

        Args:
            document: Parsed markdown document

        Returns:
            List of dicts with 'url' and optional 'line_number'
        """
        # Prefer the document's own link extraction if available
        if hasattr(document, 'extract_links'):
            return document.extract_links()

        # Fallback: extract from raw content
        if hasattr(document, 'content'):
            content = document.content
        elif hasattr(document, 'raw_content'):
            content = document.raw_content
        else:
            return []

        if not content:
            return []

        links: List[Dict[str, Any]] = []
        for line_number, line in enumerate(content.split('\n'), start=1):
            # [text](url) and [text](url "title")
            for match in _INLINE_LINK_RE.finditer(line):
                links.append({
                    'url': _clean_link_target(match.group(2)),
                    'line_number': line_number
                })

            # [ref]: url and [ref]: <url> "title"
            ref_match = _REFERENCE_DEF_RE.match(line.strip())
            if ref_match:
                links.append({
                    'url': _clean_link_target(ref_match.group(2)),
                    'line_number': line_number
                })

        return links

    def _classify_link(self, url: str) -> str:
        """
        Classify link type.

        Scheme matching is case-insensitive. Any URL that carries a scheme
        other than mailto (http, https, ftp, tel, ...) or is
        protocol-relative (//host) is 'external', so it is never mistaken
        for a file path.

        Args:
            url: Link URL

        Returns:
            'internal', 'external', 'fragment', or 'email'
        """
        url = url.strip()

        # Fragment-only links (#section)
        if url.startswith('#'):
            return 'fragment'

        # Protocol-relative links
        if url.startswith('//'):
            return 'external'

        scheme = _SCHEME_RE.match(url)
        if scheme is None:
            # Relative paths, absolute paths, drive-letter paths, etc.
            return 'internal'

        if scheme.group(1).lower() == 'mailto':
            return 'email'

        return 'external'

    def _check_internal_link(self, document: 'MarkdownDocument',
                             url: str, line_number: Optional[int]) -> List[LinkIssue]:
        """
        Check internal link validity.

        Args:
            document: The document being validated
            url: Internal link URL
            line_number: Line number where link appears

        Returns:
            List of issues found
        """
        issues: List[LinkIssue] = []

        # Links are percent-encoded ("my%20file.md"); headings and files are not.
        parsed = urllib.parse.urlparse(url)
        path = urllib.parse.unquote(parsed.path)
        fragment = urllib.parse.unquote(parsed.fragment)

        # A fragment can only be verified against THIS document's headings
        # when the link has no path. "other.md#section" points into another
        # file whose headings are not available here.
        if fragment and not path:
            if not self._check_fragment_exists(document, fragment):
                issues.append(BrokenInternalLink(
                    link=url,
                    severity='ERROR',
                    message=f'Internal link target not found: #{fragment}',
                    line_number=line_number,
                    link_type='internal',
                    target_section=fragment
                ))

        # Check that the path points to an existing file, resolved relative
        # to the document's directory. Skipped when the document's location
        # is unknown (no file_path attribute, or it is None/empty).
        if path:
            file_path = getattr(document, 'file_path', None)
            if file_path:
                target_path = (Path(file_path).parent / path).resolve()

                if not target_path.exists():
                    issues.append(BrokenInternalLink(
                        link=url,
                        severity='ERROR',
                        message=f'Internal link file not found: {path}',
                        line_number=line_number,
                        link_type='internal',
                        target_section=path
                    ))

        return issues

    def _check_fragment_exists(self, document: 'MarkdownDocument',
                               fragment: str) -> bool:
        """
        Check if a fragment identifier exists in the document.

        Args:
            document: The document to search
            fragment: Fragment identifier (without #)

        Returns:
            True if a heading (levels 1-6) maps to the fragment, False
            otherwise, including when the document exposes no
            get_headings_by_level method
        """
        if not hasattr(document, 'get_headings_by_level'):
            return False

        wanted = fragment.lower()
        for level in range(1, 7):
            for heading in document.get_headings_by_level(level):
                if isinstance(heading, dict):
                    heading_text = heading.get('content', '')
                else:
                    heading_text = str(heading)

                if self._heading_to_fragment_id(heading_text) == wanted:
                    return True

        return False

    def _heading_to_fragment_id(self, heading: str) -> str:
        """
        Convert heading text to fragment ID.

        Args:
            heading: Heading text

        Returns:
            Fragment ID (lowercase, hyphens for spaces)
        """
        fragment = heading.lower()
        # Remove special characters except spaces and hyphens
        fragment = re.sub(r'[^\w\s-]', '', fragment)
        # Replace whitespace runs with a hyphen
        fragment = re.sub(r'\s+', '-', fragment)
        # Collapse consecutive hyphens
        fragment = re.sub(r'-+', '-', fragment)
        return fragment.strip('-')

    def _probe(self, url: str, method: str) -> None:
        """Issue one HTTP request; raises HTTPError/URLError on failure."""
        request = urllib.request.Request(url, method=method)
        request.add_header('User-Agent', 'MarkiTect-LinkValidator/1.0')
        with urllib.request.urlopen(request, timeout=self.timeout):
            pass

    def _check_external_link(self, url: str,
                             line_number: Optional[int]) -> List[LinkIssue]:
        """
        Check external link validity (HTTP HEAD request, GET as fallback).

        Only http/https URLs are probed; other schemes (ftp:, tel:, ...)
        cannot be verified here and produce no issue. Servers that answer
        HEAD with 405/501 are retried with GET before being reported.

        Args:
            url: External link URL
            line_number: Line number where link appears

        Returns:
            List of issues found (WARNING severity)
        """
        issues: List[LinkIssue] = []
        url = url.strip()

        # Normalize protocol-relative URLs
        if url.startswith('//'):
            url = 'https:' + url

        scheme = _SCHEME_RE.match(url)
        if scheme is None or scheme.group(1).lower() not in ('http', 'https'):
            return issues

        try:
            try:
                # HEAD first: no response body is transferred
                self._probe(url, 'HEAD')
            except HTTPError as head_error:
                if head_error.code not in _HEAD_UNSUPPORTED:
                    raise
                # The server rejects HEAD itself, not the resource
                self._probe(url, 'GET')

        except HTTPError as e:
            # urlopen raises HTTPError for every non-2xx final status
            issues.append(BrokenExternalLink(
                link=url,
                severity='WARNING',
                message=f'External link returned HTTP {e.code}',
                line_number=line_number,
                link_type='external',
                status_code=e.code
            ))

        except URLError as e:
            issues.append(BrokenExternalLink(
                link=url,
                severity='WARNING',
                message=f'External link unreachable: {e.reason}',
                line_number=line_number,
                link_type='external'
            ))

        except Exception as e:
            # Deliberate best-effort boundary: a malformed URL, timeout or
            # protocol error must degrade to a warning, never abort validation.
            issues.append(BrokenExternalLink(
                link=url,
                severity='WARNING',
                message=f'External link check failed: {str(e)}',
                line_number=line_number,
                link_type='external'
            ))

        return issues

    def _check_email_link(self, url: str,
                          line_number: Optional[int]) -> List[LinkIssue]:
        """
        Check email link validity.

        Accepts the common mailto forms: a single address, a comma-separated
        list of addresses, and an optional ?subject=...&body=... query.
        At most one issue is reported per link.

        Args:
            url: Email link (mailto:...)
            line_number: Line number where link appears

        Returns:
            List of issues found
        """
        issues: List[LinkIssue] = []

        target = url.strip()
        if target[:7].lower() == 'mailto:':
            target = target[7:]

        # Header fields (?subject=...) are not part of the address list
        address_part = target.split('?', 1)[0]
        addresses = [
            urllib.parse.unquote(candidate).strip()
            for candidate in address_part.split(',')
        ]
        addresses = [address for address in addresses if address]

        all_valid = bool(addresses) and all(
            _EMAIL_RE.fullmatch(address) for address in addresses
        )
        if not all_valid:
            issues.append(InvalidEmail(
                link=url,
                severity='WARNING',
                message='Invalid email address format',
                line_number=line_number,
                link_type='email'
            ))

        return issues
class TestLinkValidator:
    """Test link validation functionality.

    None of these tests touch the network: the one test that exercises
    external checking replaces the HTTP probe with a recording stub.
    """

    def test_link_classification(self):
        """Test that links are correctly classified by type."""
        schema = {'x-markitect-content-control': {}}
        validator = LinkValidator(schema)

        assert validator._classify_link('http://example.com') == 'external'
        assert validator._classify_link('https://example.com') == 'external'
        assert validator._classify_link('//example.com') == 'external'
        assert validator._classify_link('mailto:test@example.com') == 'email'
        assert validator._classify_link('#section-name') == 'fragment'
        assert validator._classify_link('../other-doc.md') == 'internal'
        assert validator._classify_link('/absolute/path.md') == 'internal'

    def test_broken_internal_link_fragment(self):
        """Test detection of broken internal fragment links."""
        schema = {
            'x-markitect-content-control': {
                'link_validation': {
                    'check_internal': True
                }
            }
        }

        validator = LinkValidator(schema)

        # Mock document with two level-2 headings
        class MockDocument:
            def get_headings_by_level(self, level):
                if level == 2:
                    return [
                        {'content': 'Introduction', 'level': 2},
                        {'content': 'Getting Started', 'level': 2}
                    ]
                return []

            def extract_links(self):
                return [
                    {'url': '#introduction', 'line_number': 10},
                    {'url': '#nonexistent-section', 'line_number': 15}
                ]

        doc = MockDocument()
        result = validator.check(doc)

        # Should detect broken fragment
        assert not result.is_valid()
        assert result.has_errors()
        assert len(result.get_errors()) == 1

        error = result.get_errors()[0]
        assert isinstance(error, BrokenInternalLink)
        assert 'nonexistent-section' in error.link
        assert error.line_number == 15

    def test_fragment_not_allowed(self):
        """Test detection of fragment links when not allowed."""
        schema = {
            'x-markitect-content-control': {
                'link_validation': {
                    'allow_fragments': False
                }
            }
        }

        validator = LinkValidator(schema)

        class MockDocument:
            def extract_links(self):
                return [{'url': '#section', 'line_number': 5}]

        doc = MockDocument()
        result = validator.check(doc)

        # Should have warning
        assert result.is_valid()  # Warnings don't fail
        assert result.has_warnings()

        warning = result.get_warnings()[0]
        assert isinstance(warning, FragmentNotAllowed)

    def test_invalid_email(self):
        """Test detection of invalid email addresses."""
        schema = {
            'x-markitect-content-control': {
                'link_validation': {
                    'check_email': True
                }
            }
        }

        validator = LinkValidator(schema)

        class MockDocument:
            def extract_links(self):
                return [
                    {'url': 'mailto:valid@example.com', 'line_number': 5},
                    {'url': 'mailto:invalid-email', 'line_number': 10}
                ]

        doc = MockDocument()
        result = validator.check(doc)

        # Should have one warning for invalid email
        assert result.is_valid()  # Email validation uses warnings
        assert result.has_warnings()
        assert len(result.get_warnings()) == 1

        warning = result.get_warnings()[0]
        assert isinstance(warning, InvalidEmail)
        assert 'invalid-email' in warning.link

    def test_link_extraction_from_content(self):
        """Test extraction of links from markdown content."""
        schema = {'x-markitect-content-control': {}}
        validator = LinkValidator(schema)

        # Mock document exposing only raw content
        class MockDocument:
            content = """# Test Document

This is a [link](http://example.com) in text.

Another [internal link](../docs/other.md).

Reference style [link][ref].

[ref]: https://example.org
"""

        doc = MockDocument()
        links = validator._extract_links(doc)

        # Should extract all links
        assert len(links) == 3
        urls = [link['url'] for link in links]
        assert 'http://example.com' in urls
        assert '../docs/other.md' in urls
        assert 'https://example.org' in urls

    def test_heading_to_fragment_conversion(self):
        """Test conversion of headings to fragment IDs."""
        schema = {'x-markitect-content-control': {}}
        validator = LinkValidator(schema)

        assert validator._heading_to_fragment_id('Getting Started') == 'getting-started'
        assert validator._heading_to_fragment_id('API Reference') == 'api-reference'
        assert validator._heading_to_fragment_id('FAQ (Frequently Asked)') == 'faq-frequently-asked'
        assert validator._heading_to_fragment_id(' Spaces Around ') == 'spaces-around'

    def test_no_link_validation_when_disabled(self):
        """Test that link validation is skipped when all checks disabled."""
        schema = {
            'x-markitect-content-control': {
                'link_validation': {
                    'check_internal': False,
                    'check_external': False,
                    'allow_fragments': True,
                    'check_email': False
                }
            }
        }

        validator = LinkValidator(schema)

        class MockDocument:
            def extract_links(self):
                return [
                    {'url': '#broken-fragment'},
                    {'url': 'http://broken-link.invalid'}
                ]

        doc = MockDocument()
        result = validator.check(doc)

        # Should skip all validation
        assert result.is_valid()
        assert len(result.issues) == 0
        assert result.links_checked == 0

    def test_external_link_validation_opt_in(self):
        """Test that external link validation requires explicit opt-in."""
        schema = {
            'x-markitect-content-control': {
                'link_validation': {
                    'check_external': False  # Disabled by default
                }
            }
        }

        validator = LinkValidator(schema)

        # Replace the HTTP probe with a recorder: the test must be
        # deterministic and must not depend on DNS or network access.
        probed = []

        def fake_probe(url, line_number):
            probed.append(url)
            return []

        validator._check_external_link = fake_probe

        class MockDocument:
            def extract_links(self):
                return [{'url': 'http://definitely-broken-12345.invalid'}]

        doc = MockDocument()

        # Without check_external override: the link is counted, not probed
        result = validator.check(doc)
        assert result.is_valid()
        assert len(result.issues) == 0
        assert result.external_links == 1
        assert probed == []

        # With check_external override: the link is probed exactly once
        result = validator.check(doc, check_external=True)
        assert probed == ['http://definitely-broken-12345.invalid']
        assert result.external_links == 1

    def test_link_validation_statistics(self):
        """Test that link validation tracks statistics."""
        schema = {
            'x-markitect-content-control': {
                'link_validation': {
                    'check_internal': True
                }
            }
        }

        validator = LinkValidator(schema)

        class MockDocument:
            def get_headings_by_level(self, level):
                return []

            def extract_links(self):
                return [
                    {'url': '#fragment'},
                    {'url': 'http://example.com'},
                    {'url': '../internal.md'},
                    {'url': 'mailto:test@example.com'}
                ]

        doc = MockDocument()
        result = validator.check(doc)

        # Check statistics
        assert result.links_checked == 4
        assert result.fragment_links == 1
        assert result.external_links == 1
        assert result.internal_links == 1
        assert result.email_links == 1