""" Link Validator for markdown documents. Validates links according to x-markitect-content-control.link_validation: - Internal links: Links to other sections or documents - External links: HTTP/HTTPS URLs (optional, can be slow) - Fragment identifiers: #section-name anchors - Email links: mailto: links """ from dataclasses import dataclass from typing import List, Dict, Any, Optional from pathlib import Path import re import urllib.parse import urllib.request from urllib.error import URLError, HTTPError @dataclass class LinkIssue: """Base class for link validation issues.""" link: str severity: str # 'ERROR', 'WARNING', 'INFO' message: str line_number: Optional[int] = None link_type: Optional[str] = None # 'internal', 'external', 'fragment', 'email' def __str__(self) -> str: location = f" (line {self.line_number})" if self.line_number else "" link_info = f" [{self.link_type}]" if self.link_type else "" return f"[{self.severity}]{location}{link_info} {self.link}: {self.message}" @dataclass class BrokenInternalLink(LinkIssue): """Internal link target not found.""" target_section: str = "" @dataclass class BrokenExternalLink(LinkIssue): """External link is unreachable.""" status_code: Optional[int] = None @dataclass class FragmentNotAllowed(LinkIssue): """Fragment identifier used when not allowed.""" pass @dataclass class InvalidEmail(LinkIssue): """Invalid email address in mailto link.""" pass @dataclass class LinkValidationResult: """Result of link validation.""" issues: List[LinkIssue] links_checked: int internal_links: int = 0 external_links: int = 0 fragment_links: int = 0 email_links: int = 0 def has_errors(self) -> bool: """Check if there are any ERROR-level issues.""" return any(issue.severity == 'ERROR' for issue in self.issues) def has_warnings(self) -> bool: """Check if there are any WARNING-level issues.""" return any(issue.severity == 'WARNING' for issue in self.issues) def is_valid(self) -> bool: """Check if validation passed (no errors).""" return not self.has_errors() def get_errors(self) -> List[LinkIssue]: """Get all ERROR-level issues.""" return [issue for issue in self.issues if issue.severity == 'ERROR'] def get_warnings(self) -> List[LinkIssue]: """Get all WARNING-level issues.""" return [issue for issue in self.issues if issue.severity == 'WARNING'] class LinkValidator: """ Validates links according to x-markitect-content-control.link_validation. Configuration options from schema: - check_internal: Validate internal links (default: True) - check_external: Validate external links (default: False, can be slow) - allow_fragments: Allow fragment identifiers (default: True) - check_email: Validate email addresses (default: False) - timeout: Timeout for external link checks in seconds (default: 5) """ def __init__(self, schema: Dict[str, Any]): """ Initialize validator with a schema. Args: schema: JSON schema with x-markitect-content-control.link_validation extension """ self.schema = schema content_control = schema.get('x-markitect-content-control', {}) self.link_config = content_control.get('link_validation', {}) # Default configuration self.check_internal = self.link_config.get('check_internal', True) self.check_external = self.link_config.get('check_external', False) self.allow_fragments = self.link_config.get('allow_fragments', True) self.check_email = self.link_config.get('check_email', False) self.timeout = self.link_config.get('timeout', 5) def check(self, document: 'MarkdownDocument', check_external: Optional[bool] = None) -> LinkValidationResult: """ Validate links in the document. Args: document: Parsed markdown document check_external: Override schema setting for external link checking Returns: LinkValidationResult with any issues found """ # Override external link checking if specified if check_external is not None: self.check_external = check_external # Skip validation if no link checking is enabled if not any([self.check_internal, self.check_external, not self.allow_fragments, self.check_email]): return LinkValidationResult( issues=[], links_checked=0 ) issues = [] stats = { 'internal': 0, 'external': 0, 'fragment': 0, 'email': 0 } # Extract all links from document links = self._extract_links(document) for link_info in links: link_url = link_info['url'] line_number = link_info.get('line_number') # Classify link type link_type = self._classify_link(link_url) stats[link_type] += 1 # Validate based on type if link_type == 'internal' and self.check_internal: link_issues = self._check_internal_link( document, link_url, line_number ) issues.extend(link_issues) elif link_type == 'external' and self.check_external: link_issues = self._check_external_link( link_url, line_number ) issues.extend(link_issues) elif link_type == 'fragment': # Check if fragments are allowed if not self.allow_fragments: issues.append(FragmentNotAllowed( link=link_url, severity='WARNING', message='Fragment identifiers are not allowed', line_number=line_number, link_type='fragment' )) # Also validate fragment targets if internal checking is enabled elif self.check_internal: link_issues = self._check_internal_link( document, link_url, line_number ) issues.extend(link_issues) elif link_type == 'email' and self.check_email: link_issues = self._check_email_link( link_url, line_number ) issues.extend(link_issues) return LinkValidationResult( issues=issues, links_checked=len(links), internal_links=stats['internal'], external_links=stats['external'], fragment_links=stats['fragment'], email_links=stats['email'] ) def _extract_links(self, document: 'MarkdownDocument') -> List[Dict[str, Any]]: """ Extract all links from markdown document. Args: document: Parsed markdown document Returns: List of dicts with 'url' and optional 'line_number' """ links = [] # Try to use document's link extraction if available if hasattr(document, 'extract_links'): return document.extract_links() # Fallback: Extract from raw content if hasattr(document, 'content'): content = document.content elif hasattr(document, 'raw_content'): content = document.raw_content else: return [] # Regex patterns for markdown links # [text](url) format inline_pattern = r'\[([^\]]+)\]\(([^)]+)\)' # [text][ref] and [ref]: url formats ref_pattern = r'^\[([^\]]+)\]:\s*(.+)$' line_number = 1 for line in content.split('\n'): # Find inline links for match in re.finditer(inline_pattern, line): url = match.group(2) links.append({ 'url': url.strip(), 'line_number': line_number }) # Find reference-style link definitions ref_match = re.match(ref_pattern, line.strip()) if ref_match: url = ref_match.group(2) links.append({ 'url': url.strip(), 'line_number': line_number }) line_number += 1 return links def _classify_link(self, url: str) -> str: """ Classify link type. Args: url: Link URL Returns: 'internal', 'external', 'fragment', or 'email' """ url = url.strip() # Email links if url.startswith('mailto:'): return 'email' # Fragment-only links (#section) if url.startswith('#'): return 'fragment' # External links (http/https) if url.startswith(('http://', 'https://', '//')): return 'external' # Everything else is considered internal # (relative paths, absolute paths, etc.) return 'internal' def _check_internal_link(self, document: 'MarkdownDocument', url: str, line_number: Optional[int]) -> List[LinkIssue]: """ Check internal link validity. Args: document: The document being validated url: Internal link URL line_number: Line number where link appears Returns: List of issues found """ issues = [] # Parse URL to extract path and fragment parsed = urllib.parse.urlparse(url) path = parsed.path fragment = parsed.fragment # Check if fragment points to existing section if fragment: section_found = self._check_fragment_exists(document, fragment) if not section_found: issues.append(BrokenInternalLink( link=url, severity='ERROR', message=f'Internal link target not found: #{fragment}', line_number=line_number, link_type='internal', target_section=fragment )) # Check if path points to existing file (if it's a file path) if path and not path.startswith('#'): # Try to resolve relative to document's directory if hasattr(document, 'file_path'): doc_dir = Path(document.file_path).parent target_path = (doc_dir / path).resolve() if not target_path.exists(): issues.append(BrokenInternalLink( link=url, severity='ERROR', message=f'Internal link file not found: {path}', line_number=line_number, link_type='internal', target_section=path )) return issues def _check_fragment_exists(self, document: 'MarkdownDocument', fragment: str) -> bool: """ Check if a fragment identifier exists in the document. Args: document: The document to search fragment: Fragment identifier (without #) Returns: True if fragment exists, False otherwise """ # Try to get headings from document if hasattr(document, 'get_headings_by_level'): # Check all heading levels for level in range(1, 7): headings = document.get_headings_by_level(level) for heading in headings: # Get heading text if isinstance(heading, dict): heading_text = heading.get('content', '') else: heading_text = str(heading) # Convert heading to fragment ID # (lowercase, spaces to hyphens, remove special chars) heading_id = self._heading_to_fragment_id(heading_text) if heading_id == fragment.lower(): return True return False def _heading_to_fragment_id(self, heading: str) -> str: """ Convert heading text to fragment ID. Args: heading: Heading text Returns: Fragment ID (lowercase, hyphens for spaces) """ # Lowercase fragment = heading.lower() # Remove special characters except spaces and hyphens fragment = re.sub(r'[^\w\s-]', '', fragment) # Replace spaces with hyphens fragment = re.sub(r'\s+', '-', fragment) # Remove multiple consecutive hyphens fragment = re.sub(r'-+', '-', fragment) # Strip leading/trailing hyphens fragment = fragment.strip('-') return fragment def _check_external_link(self, url: str, line_number: Optional[int]) -> List[LinkIssue]: """ Check external link validity (HTTP HEAD request). Args: url: External link URL line_number: Line number where link appears Returns: List of issues found """ issues = [] # Normalize URL (add https:// if starts with //) if url.startswith('//'): url = 'https:' + url try: # Use HEAD request for efficiency request = urllib.request.Request(url, method='HEAD') request.add_header('User-Agent', 'MarkiTect-LinkValidator/1.0') with urllib.request.urlopen(request, timeout=self.timeout) as response: # 2xx and 3xx are considered valid if response.status >= 400: issues.append(BrokenExternalLink( link=url, severity='WARNING', message=f'External link returned status {response.status}', line_number=line_number, link_type='external', status_code=response.status )) except HTTPError as e: issues.append(BrokenExternalLink( link=url, severity='WARNING', message=f'External link returned HTTP {e.code}', line_number=line_number, link_type='external', status_code=e.code )) except URLError as e: issues.append(BrokenExternalLink( link=url, severity='WARNING', message=f'External link unreachable: {e.reason}', line_number=line_number, link_type='external' )) except Exception as e: issues.append(BrokenExternalLink( link=url, severity='WARNING', message=f'External link check failed: {str(e)}', line_number=line_number, link_type='external' )) return issues def _check_email_link(self, url: str, line_number: Optional[int]) -> List[LinkIssue]: """ Check email link validity. Args: url: Email link (mailto:...) line_number: Line number where link appears Returns: List of issues found """ issues = [] # Extract email address email = url.replace('mailto:', '', 1).strip() # Basic email validation regex email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$' if not re.match(email_pattern, email): issues.append(InvalidEmail( link=url, severity='WARNING', message=f'Invalid email address format', line_number=line_number, link_type='email' )) return issues