""" Semantic Validator for markdown documents. Validates markdown documents against x-markitect schema extensions: - x-markitect-sections: Section classifications (required, recommended, etc.) - x-markitect-content-control: Content patterns and quality metrics - Link validation: Internal and external link checking Complements the existing SchemaValidator which handles structural AST validation. """ from dataclasses import dataclass from typing import List, Dict, Any, Optional from pathlib import Path import json from markitect.validators.section_validator import ( SectionValidator, SectionValidationResult ) from markitect.validators.content_validator import ( ContentValidator, ContentValidationResult ) from markitect.validators.link_validator import ( LinkValidator, LinkValidationResult ) class DocumentWrapper: """ Wrapper for document dict to provide expected interface for validators. Extracts headings from AST and provides get_headings_by_level() method. """ def __init__(self, doc_dict: Dict[str, Any]): """Initialize wrapper with document dict from DocumentManager.""" self.doc_dict = doc_dict self._headings_cache = None self._extract_headings() def _extract_headings(self): """Extract headings from AST and cache them.""" ast = self.doc_dict.get('ast', []) headings = [] # Parse AST tokens to find headings # AST format: heading_open, inline (with content), heading_close i = 0 while i < len(ast): token = ast[i] if isinstance(token, dict) and token.get('type') == 'heading_open': level_str = token.get('tag', 'h1')[1:] # 'h2' -> '2' level = int(level_str) if level_str.isdigit() else 1 # Next token should be inline with heading content if i + 1 < len(ast) and ast[i + 1].get('type') == 'inline': content = ast[i + 1].get('content', '') line_number = token.get('map', [0])[0] + 1 if token.get('map') else None headings.append({ 'content': content, 'level': level, 'line_number': line_number }) i += 1 self._headings_cache = headings def get_headings_by_level(self, level: int) -> List[Dict[str, Any]]: """ Get headings at specified level. Args: level: Heading level (1-6) Returns: List of heading dicts with 'content', 'level', 'line_number' """ if self._headings_cache is None: self._extract_headings() return [h for h in self._headings_cache if h.get('level') == level] @property def headings(self) -> List[Dict[str, Any]]: """Get all headings.""" if self._headings_cache is None: self._extract_headings() return self._headings_cache def __getitem__(self, key): """Allow dict-like access for compatibility.""" return self.doc_dict[key] def get(self, key, default=None): """Allow dict-like get for compatibility.""" return self.doc_dict.get(key, default) @dataclass class SemanticValidationReport: """ Report of semantic validation results. Combines results from section, content, and link validators. """ section_result: SectionValidationResult content_result: Optional[ContentValidationResult] = None link_result: Optional[LinkValidationResult] = None def has_errors(self) -> bool: """Check if there are any ERROR-level issues.""" errors = self.section_result.has_errors() if self.content_result and hasattr(self.content_result, 'has_errors'): errors = errors or self.content_result.has_errors() if self.link_result and hasattr(self.link_result, 'has_errors'): errors = errors or self.link_result.has_errors() return errors def has_warnings(self) -> bool: """Check if there are any WARNING-level issues.""" warnings = self.section_result.has_warnings() if self.content_result and hasattr(self.content_result, 'has_warnings'): warnings = warnings or self.content_result.has_warnings() if self.link_result and hasattr(self.link_result, 'has_warnings'): warnings = warnings or self.link_result.has_warnings() return warnings def is_valid(self) -> bool: """Check if validation passed (no errors).""" return not self.has_errors() def get_all_issues(self) -> List[Any]: """Get all issues from all validators.""" issues = list(self.section_result.issues) if self.content_result and hasattr(self.content_result, 'issues'): issues.extend(self.content_result.issues) if self.link_result and hasattr(self.link_result, 'issues'): issues.extend(self.link_result.issues) return issues def format_text(self) -> str: """Format validation report as text.""" lines = [] # Section validation lines.append("Section Validation:") if self.section_result.issues: for issue in self.section_result.issues: status = "❌" if issue.severity == 'ERROR' else "⚠️" lines.append(f" {status} {issue.section_name} - {issue.message}") else: lines.append(" ✅ All section requirements met") # Content validation if self.content_result: lines.append("") lines.append("Content Validation:") if self.content_result.issues: for issue in self.content_result.issues: status = "❌" if issue.severity == 'ERROR' else "⚠️" lines.append(f" {status} {issue.section_name} - {issue.message}") else: lines.append(" ✅ All content requirements met") # Link validation if self.link_result: lines.append("") lines.append("Link Validation:") if self.link_result.issues: for issue in self.link_result.issues: status = "❌" if issue.severity == 'ERROR' else "⚠️" lines.append(f" {status} {issue.link} - {issue.message}") else: lines.append(f" ✅ All {self.link_result.links_checked} links valid") # Summary lines.append("") lines.append("Summary:") lines.append(f" Sections checked: {self.section_result.sections_checked}") lines.append(f" Sections found: {self.section_result.sections_found}") all_errors = self.section_result.get_errors() all_warnings = self.section_result.get_warnings() if self.content_result: all_errors.extend(self.content_result.get_errors()) all_warnings.extend(self.content_result.get_warnings()) if self.link_result: all_errors.extend(self.link_result.get_errors()) all_warnings.extend(self.link_result.get_warnings()) lines.append(f" Errors: {len(all_errors)}") lines.append(f" Warnings: {len(all_warnings)}") if self.is_valid(): lines.append(" Status: PASSED ✅") else: lines.append(" Status: FAILED ❌") return "\n".join(lines) class SemanticValidator: """ Validates markdown documents against x-markitect extensions. Complements existing SchemaValidator which handles structural AST validation. This validator checks semantic aspects defined in x-markitect-* extensions. Example: >>> schema = load_schema('manpage-schema-v1.0.md') >>> validator = SemanticValidator(schema) >>> report = validator.validate('my-command.1.md') >>> if not report.is_valid(): ... print(report.format_text()) """ def __init__(self, schema: Dict[str, Any]): """ Initialize semantic validator with a schema. Args: schema: JSON schema with x-markitect-* extensions The schema can be either: - A dict loaded from JSON - A dict loaded from markdown with embedded JSON - Must contain x-markitect-sections and/or x-markitect-content-control """ self.schema = schema # Initialize sub-validators self.section_validator = SectionValidator(schema) self.content_validator = ContentValidator(schema) self.link_validator = LinkValidator(schema) def validate(self, document_path: str | Path, check_links: bool = False) -> SemanticValidationReport: """ Validate a markdown document against schema extensions. Args: document_path: Path to markdown document to validate check_links: Whether to validate links (may be slow) Returns: SemanticValidationReport with validation results Raises: FileNotFoundError: If document_path doesn't exist ValueError: If document cannot be parsed """ document_path = Path(document_path) if not document_path.exists(): raise FileNotFoundError(f"Document not found: {document_path}") # Parse document document = self._parse_document(document_path) # Run section validation section_result = self.section_validator.check(document) # Run content validation content_result = self.content_validator.check(document) # Run link validation (if enabled) if check_links: link_result = self.link_validator.check(document, check_external=True) else: # Still check internal links by default (fast) link_result = self.link_validator.check(document, check_external=False) return SemanticValidationReport( section_result=section_result, content_result=content_result, link_result=link_result ) def _parse_document(self, document_path: Path) -> 'MarkdownDocument': """ Parse markdown document into AST. Args: document_path: Path to markdown file Returns: Parsed MarkdownDocument object This uses the existing markitect markdown parser. """ # Import here to avoid circular dependency from markitect.document_manager import DocumentManager # Use DocumentManager to parse the document doc_manager = DocumentManager() doc = doc_manager.ingest_file(document_path) # Wrap in DocumentWrapper to provide expected interface return DocumentWrapper(doc) def load_schema_from_path(schema_path: str | Path) -> Dict[str, Any]: """ Load a schema from file (supports .json and .md formats). Args: schema_path: Path to schema file Returns: Schema dict with embedded JSON Raises: FileNotFoundError: If schema file doesn't exist ValueError: If schema cannot be parsed """ schema_path = Path(schema_path) if not schema_path.exists(): raise FileNotFoundError(f"Schema not found: {schema_path}") if schema_path.suffix == '.json': # Load JSON schema directly with open(schema_path, 'r', encoding='utf-8') as f: return json.load(f) elif schema_path.suffix == '.md': # Load markdown schema with embedded JSON from markitect.schema_loader import MarkdownSchemaLoader loader = MarkdownSchemaLoader() schema_data = loader.load_schema(schema_path) return schema_data['schema'] else: raise ValueError(f"Unsupported schema format: {schema_path.suffix}")