# Reconstructed from the patch hunk for markitect/semantic_validator.py
"""
Semantic Validator for markdown documents.

Validates markdown documents against x-markitect schema extensions:
- x-markitect-sections: Section classifications (required, recommended, etc.)
- x-markitect-content-control: Content patterns and quality metrics
- Link validation: planned; not wired in yet (see the TODOs below)

Complements the existing SchemaValidator which handles structural AST validation.
"""

from dataclasses import dataclass
from typing import List, Dict, Any, Optional
from pathlib import Path
import json

from markitect.validators.section_validator import (
    SectionValidator,
    SectionValidationResult
)
from markitect.validators.content_validator import (
    ContentValidator,
    ContentValidationResult
)


@dataclass
class SemanticValidationReport:
    """
    Combined outcome of the section, content and (future) link validators.

    Only ``section_result`` is mandatory; the other two may be ``None`` when
    the corresponding validator did not run.
    """
    section_result: SectionValidationResult
    content_result: Optional[ContentValidationResult] = None
    link_result: Optional[Any] = None  # LinkValidationResult when implemented

    def _results(self) -> List[Any]:
        """Return the sub-results that are actually present, in report order."""
        candidates = (self.section_result, self.content_result, self.link_result)
        return [result for result in candidates if result is not None]

    def has_errors(self) -> bool:
        """Return True if any present sub-result reports an ERROR-level issue."""
        return any(
            result.has_errors()
            for result in self._results()
            if hasattr(result, 'has_errors')
        )

    def has_warnings(self) -> bool:
        """Return True if any present sub-result reports a WARNING-level issue."""
        return any(
            result.has_warnings()
            for result in self._results()
            if hasattr(result, 'has_warnings')
        )

    def is_valid(self) -> bool:
        """Return True when validation passed, i.e. there are no errors."""
        return not self.has_errors()

    def get_all_issues(self) -> List[Any]:
        """Return the issues of every present sub-result as one new list."""
        issues: List[Any] = []
        for result in self._results():
            issues.extend(getattr(result, 'issues', None) or [])
        return issues

    @staticmethod
    def _format_issues(title: str, issues: List[Any], all_clear: str) -> List[str]:
        """Render one report section: a title plus one line per issue."""
        lines = [title]
        if not issues:
            lines.append(all_clear)
            return lines
        for issue in issues:
            status = "❌" if issue.severity == 'ERROR' else "⚠️"
            lines.append(f" {status} {issue.section_name} - {issue.message}")
        return lines

    def format_text(self) -> str:
        """
        Format the validation report as human-readable text.

        The summary counts are taken from *all* sub-results (including link
        results), so they always agree with the PASSED/FAILED status line,
        which is derived from :meth:`is_valid`.
        """
        lines = self._format_issues(
            "Section Validation:",
            self.section_result.issues,
            " ✅ All section requirements met",
        )

        if self.content_result:
            lines.append("")
            lines.extend(self._format_issues(
                "Content Validation:",
                self.content_result.issues,
                " ✅ All content requirements met",
            ))

        # Link issues have no settled shape yet, so fall back to str(issue).
        link_issues = getattr(self.link_result, 'issues', None)
        if link_issues:
            lines.append("")
            lines.append("Link Validation:")
            for issue in link_issues:
                status = "❌" if getattr(issue, 'severity', None) == 'ERROR' else "⚠️"
                lines.append(f" {status} {issue}")

        all_issues = self.get_all_issues()
        error_count = sum(
            1 for issue in all_issues if getattr(issue, 'severity', None) == 'ERROR'
        )
        warning_count = sum(
            1 for issue in all_issues if getattr(issue, 'severity', None) == 'WARNING'
        )

        lines.append("")
        lines.append("Summary:")
        lines.append(f" Sections checked: {self.section_result.sections_checked}")
        lines.append(f" Sections found: {self.section_result.sections_found}")
        lines.append(f" Errors: {error_count}")
        lines.append(f" Warnings: {warning_count}")
        lines.append(" Status: PASSED ✅" if self.is_valid() else " Status: FAILED ❌")

        return "\n".join(lines)


class SemanticValidator:
    """
    Validates markdown documents against x-markitect extensions.

    Complements existing SchemaValidator which handles structural AST validation.
    This validator checks semantic aspects defined in x-markitect-* extensions.

    Example:
        >>> schema = load_schema_from_path('manpage-schema-v1.0.md')
        >>> validator = SemanticValidator(schema)
        >>> report = validator.validate('my-command.1.md')
        >>> if not report.is_valid():
        ...     print(report.format_text())
    """

    def __init__(self, schema: Dict[str, Any]):
        """
        Initialize semantic validator with a schema.

        Args:
            schema: Schema dict (loaded from JSON, or from markdown with
                embedded JSON) carrying x-markitect-sections and/or
                x-markitect-content-control.
        """
        self.schema = schema

        # Each sub-validator reads its own extension out of the same schema.
        self.section_validator = SectionValidator(schema)
        self.content_validator = ContentValidator(schema)

        # TODO: Initialize link validator when implemented

    def validate(self, document_path: str | Path,
                 check_links: bool = False) -> SemanticValidationReport:
        """
        Validate a markdown document against schema extensions.

        Args:
            document_path: Path to markdown document to validate
            check_links: Reserved for link validation; currently has no
                effect because no link validator exists yet.

        Returns:
            SemanticValidationReport with section and content results;
            ``link_result`` is always ``None`` for now.

        Raises:
            FileNotFoundError: If document_path doesn't exist
        """
        document_path = Path(document_path)

        if not document_path.exists():
            raise FileNotFoundError(f"Document not found: {document_path}")

        document = self._parse_document(document_path)

        # TODO: Run link validation (guarded by check_links) when implemented
        return SemanticValidationReport(
            section_result=self.section_validator.check(document),
            content_result=self.content_validator.check(document),
            link_result=None
        )

    def _parse_document(self, document_path: Path) -> 'MarkdownDocument':
        """
        Parse a markdown file through DocumentManager.ingest_file.

        Args:
            document_path: Path to markdown file

        Returns:
            Whatever DocumentManager.ingest_file returns for the path.
        """
        # Import here to avoid circular dependency
        from markitect.document_manager import DocumentManager

        return DocumentManager().ingest_file(document_path)


def load_schema_from_path(schema_path: str | Path) -> Dict[str, Any]:
    """
    Load a schema from file (supports .json and .md formats).

    The extension is matched case-insensitively, so ``SCHEMA.JSON`` and
    ``schema.MD`` are accepted as well.

    Args:
        schema_path: Path to schema file

    Returns:
        The schema as a dict.

    Raises:
        FileNotFoundError: If schema file doesn't exist
        ValueError: If the format is unsupported, the JSON is malformed
            (json.JSONDecodeError is a ValueError), the markdown file has
            no embedded schema, or the loaded schema is not a JSON object.
    """
    schema_path = Path(schema_path)

    if not schema_path.exists():
        raise FileNotFoundError(f"Schema not found: {schema_path}")

    suffix = schema_path.suffix.lower()

    if suffix == '.json':
        with open(schema_path, 'r', encoding='utf-8') as f:
            schema = json.load(f)

    elif suffix == '.md':
        # Imported lazily, like DocumentManager above, to keep module import light.
        from markitect.schema_loader import MarkdownSchemaLoader

        schema_data = MarkdownSchemaLoader().load_schema(schema_path)
        try:
            schema = schema_data['schema']
        except (KeyError, TypeError) as exc:
            # Surface the documented exception type instead of a bare KeyError.
            raise ValueError(f"No embedded schema found in: {schema_path}") from exc

    else:
        raise ValueError(f"Unsupported schema format: {schema_path.suffix}")

    if not isinstance(schema, dict):
        raise ValueError(
            f"Schema must be a JSON object, got {type(schema).__name__}: {schema_path}"
        )

    return schema
+ +Validators: + - SectionValidator: Validates section presence based on classifications + - ContentValidator: Validates content patterns and quality metrics + - LinkValidator: Validates internal and external links +""" + +from markitect.validators.section_validator import ( + SectionValidator, + SectionValidationResult, + SectionIssue, + SectionMissing, + SectionImproper, + SectionDiscouraged, +) + +from markitect.validators.content_validator import ( + ContentValidator, + ContentValidationResult, + ContentIssue, + PatternMissing, + ForbiddenPattern, + DiscouragedPattern, + ContentTooShort, + ContentTooLong, +) + +__all__ = [ + # Section validator + 'SectionValidator', + 'SectionValidationResult', + 'SectionIssue', + 'SectionMissing', + 'SectionImproper', + 'SectionDiscouraged', + # Content validator + 'ContentValidator', + 'ContentValidationResult', + 'ContentIssue', + 'PatternMissing', + 'ForbiddenPattern', + 'DiscouragedPattern', + 'ContentTooShort', + 'ContentTooLong', +] diff --git a/markitect/validators/content_validator.py b/markitect/validators/content_validator.py new file mode 100644 index 00000000..9d158ec2 --- /dev/null +++ b/markitect/validators/content_validator.py @@ -0,0 +1,316 @@ +""" +Content Validator for markdown documents. 
# ---------------------------------------------------------------------------
# Reconstructed from the patch hunks for
#   markitect/validators/content_validator.py
#   markitect/validators/section_validator.py
# ---------------------------------------------------------------------------

import re
from dataclasses import dataclass
from pathlib import Path  # imported by the original section_validator module; kept importable
from typing import List, Dict, Any, Optional


# ===========================================================================
# Content validation (x-markitect-content-control)
#
# - Required patterns: regex patterns that must appear in content (errors)
# - Forbidden patterns: patterns that must not appear (errors)
# - Discouraged patterns: patterns that should be avoided (warnings)
# - Quality metrics: min/max word count and minimum sentence count (warnings)
# ===========================================================================

# Keys under x-markitect-content-control that configure validation itself
# instead of naming a document section; they must not be treated as sections.
_NON_SECTION_KEYS = frozenset({'link_validation'})

# Upper bound on how much of a regex match is echoed back in an issue.
_MATCH_PREVIEW_CHARS = 50


@dataclass
class ContentIssue:
    """Base class for content validation issues."""
    section_name: str
    severity: str  # 'ERROR', 'WARNING', 'INFO'
    message: str
    line_number: Optional[int] = None
    matched_text: Optional[str] = None

    def __str__(self) -> str:
        # Compare against None so that a line number of 0 is still shown.
        location = f" (line {self.line_number})" if self.line_number is not None else ""
        match_info = f": '{self.matched_text}'" if self.matched_text else ""
        return f"[{self.severity}]{location} {self.section_name} - {self.message}{match_info}"


@dataclass
class PatternMissing(ContentIssue):
    """Required pattern not found in content."""
    pattern: str = ""


@dataclass
class ForbiddenPattern(ContentIssue):
    """Forbidden pattern found in content."""
    pattern: str = ""


@dataclass
class DiscouragedPattern(ContentIssue):
    """Discouraged pattern found in content."""
    pattern: str = ""


@dataclass
class ContentTooShort(ContentIssue):
    """Content does not meet minimum word/sentence count."""
    actual: int = 0
    required: int = 0


@dataclass
class ContentTooLong(ContentIssue):
    """Content exceeds maximum word/sentence count."""
    actual: int = 0
    limit: int = 0


@dataclass
class ContentValidationResult:
    """Result of content validation."""
    issues: List[ContentIssue]
    sections_checked: int

    def has_errors(self) -> bool:
        """Check if there are any ERROR-level issues."""
        return any(issue.severity == 'ERROR' for issue in self.issues)

    def has_warnings(self) -> bool:
        """Check if there are any WARNING-level issues."""
        return any(issue.severity == 'WARNING' for issue in self.issues)

    def is_valid(self) -> bool:
        """Check if validation passed (no errors)."""
        return not self.has_errors()

    def get_errors(self) -> List[ContentIssue]:
        """Get all ERROR-level issues (as a new list)."""
        return [issue for issue in self.issues if issue.severity == 'ERROR']

    def get_warnings(self) -> List[ContentIssue]:
        """Get all WARNING-level issues (as a new list)."""
        return [issue for issue in self.issues if issue.severity == 'WARNING']


class ContentValidator:
    """
    Validates content against x-markitect-content-control rules.

    For every section that has rules, checks required / forbidden /
    discouraged regex patterns and the word and sentence count limits.
    """

    def __init__(self, schema: Dict[str, Any]):
        """
        Initialize validator with a schema.

        Args:
            schema: JSON schema with x-markitect-content-control extension
        """
        self.schema = schema
        # ``or {}`` also covers an explicit null for the extension.
        self.content_rules = schema.get('x-markitect-content-control') or {}

    def check(self, document: 'MarkdownDocument') -> ContentValidationResult:
        """
        Validate content against schema rules.

        Entries that are not per-section rule dicts (the ``link_validation``
        configuration block, or any non-dict value) are skipped and are not
        counted in ``sections_checked``.

        Args:
            document: Parsed markdown document

        Returns:
            ContentValidationResult with any issues found
        """
        issues: List[ContentIssue] = []
        sections_checked = 0

        for section_key, rules in self.content_rules.items():
            if section_key in _NON_SECTION_KEYS or not isinstance(rules, dict):
                continue
            sections_checked += 1

            section = self._get_section(document, section_key)
            if not section:
                # Section validator handles missing sections
                continue

            name, content = self._describe_section(section, section_key)

            # Order matters for callers reading issues sequentially:
            # required, forbidden, discouraged, then quality metrics.
            issues.extend(self._check_required_patterns(name, content, rules))
            issues.extend(self._check_forbidden_patterns(name, content, rules))
            issues.extend(self._check_discouraged_patterns(name, content, rules))
            issues.extend(self._check_quality_metrics(name, content, rules))

        return ContentValidationResult(
            issues=issues,
            sections_checked=sections_checked
        )

    @staticmethod
    def _describe_section(section: Any, section_key: str):
        """
        Return ``(name, content)`` for a section given as dict or object.

        NOTE(review): the shape returned by ``document.get_section`` is not
        visible here; both mapping and attribute access are supported.
        Missing/None content is treated as empty text.
        """
        if isinstance(section, dict):
            name = section.get('name', section_key)
            content = section.get('content', '')
        else:
            name = getattr(section, 'name', section_key)
            content = getattr(section, 'content', '')
        return name, '' if content is None else str(content)

    def _get_section(self, document: 'MarkdownDocument',
                     section_key: str) -> Optional[Dict[str, Any]]:
        """
        Get a section from the document.

        Args:
            document: Parsed markdown document
            section_key: Section name as written in the rules; it is
                upper-cased before the lookup.

        Returns:
            Whatever ``document.get_section`` returns when available;
            otherwise a ``{'name', 'content'}`` dict built from the matching
            level-2 heading, or None if nothing matches.
        """
        section_name = section_key.upper()

        if hasattr(document, 'get_section'):
            return document.get_section(section_name)

        # Fallback: search level-2 headings for a matching title.
        if hasattr(document, 'get_headings_by_level'):
            for heading in document.get_headings_by_level(2):
                if not isinstance(heading, dict):
                    continue
                if heading.get('content', '').strip().upper() == section_name:
                    return {
                        'name': section_name,
                        'content': heading.get('text_content', '')
                    }

        return None

    @staticmethod
    def _scan_patterns(section_name: str, content: str, patterns: Any, *,
                       issue_cls: type, severity: str, message: str,
                       report_when_found: bool) -> List[ContentIssue]:
        """
        Shared engine for the three pattern checks.

        Each pattern is searched with re.MULTILINE. An issue of ``issue_cls``
        is produced when the pattern is found (``report_when_found=True``) or
        when it is absent (``False``). An invalid regex yields a plain
        ContentIssue at the same severity instead of aborting validation.
        """
        issues: List[ContentIssue] = []
        for pattern in patterns or []:
            try:
                match = re.search(pattern, content, re.MULTILINE)
            except re.error as e:
                issues.append(ContentIssue(
                    section_name=section_name,
                    severity=severity,
                    message=f'Invalid regex pattern in schema: {e}'
                ))
                continue

            if report_when_found and match:
                issues.append(issue_cls(
                    section_name=section_name,
                    severity=severity,
                    message=message,
                    pattern=pattern,
                    matched_text=match.group(0)[:_MATCH_PREVIEW_CHARS]
                ))
            elif not report_when_found and not match:
                issues.append(issue_cls(
                    section_name=section_name,
                    severity=severity,
                    message=message,
                    pattern=pattern
                ))
        return issues

    def _check_required_patterns(self, section_name: str, content: str,
                                 rules: Dict[str, Any]) -> List[ContentIssue]:
        """Check that all required patterns appear in content (errors)."""
        return self._scan_patterns(
            section_name, content, rules.get('required_patterns', []),
            issue_cls=PatternMissing, severity='ERROR',
            message='Required pattern not found', report_when_found=False,
        )

    def _check_forbidden_patterns(self, section_name: str, content: str,
                                  rules: Dict[str, Any]) -> List[ContentIssue]:
        """Check that no forbidden patterns appear in content (errors)."""
        return self._scan_patterns(
            section_name, content, rules.get('forbidden_patterns', []),
            issue_cls=ForbiddenPattern, severity='ERROR',
            message='Forbidden pattern found', report_when_found=True,
        )

    def _check_discouraged_patterns(self, section_name: str, content: str,
                                    rules: Dict[str, Any]) -> List[ContentIssue]:
        """Check for discouraged patterns (warnings)."""
        return self._scan_patterns(
            section_name, content, rules.get('discouraged_patterns', []),
            issue_cls=DiscouragedPattern, severity='WARNING',
            message='Discouraged pattern found', report_when_found=True,
        )

    def _check_quality_metrics(self, section_name: str, content: str,
                               rules: Dict[str, Any]) -> List[ContentIssue]:
        """Check word count limits and minimum sentence count (warnings)."""
        issues: List[ContentIssue] = []
        quality = rules.get('content_quality', {})

        if not quality:
            return issues

        # Words are whitespace-separated tokens.
        word_count = len(content.split())

        min_words = quality.get('min_words')
        if min_words is not None and word_count < min_words:
            issues.append(ContentTooShort(
                section_name=section_name,
                severity='WARNING',
                message=f'Content too short ({word_count} words, minimum {min_words})',
                actual=word_count,
                required=min_words
            ))

        max_words = quality.get('max_words')
        if max_words is not None and word_count > max_words:
            issues.append(ContentTooLong(
                section_name=section_name,
                severity='WARNING',
                message=f'Content too long ({word_count} words, maximum {max_words})',
                actual=word_count,
                limit=max_words
            ))

        min_sentences = quality.get('min_sentences')
        if min_sentences is not None:
            # Approximation: every run of '.', '!' or '?' counts as one
            # sentence end, so abbreviations and version numbers inflate it.
            sentence_count = len(re.findall(r'[.!?]+', content))

            if sentence_count < min_sentences:
                issues.append(ContentTooShort(
                    section_name=section_name,
                    severity='WARNING',
                    message=f'Too few sentences ({sentence_count}, minimum {min_sentences})',
                    actual=sentence_count,
                    required=min_sentences
                ))

        return issues


# ===========================================================================
# Section validation (x-markitect-sections)
#
# - required:    section must be present      (ERROR if missing)
# - recommended: section should be present    (WARNING if missing)
# - optional:    section may be present       (no check)
# - discouraged: section should not be present (WARNING if present)
# - improper:    section must not be present  (ERROR if present)
# ===========================================================================

@dataclass
class SectionIssue:
    """Base class for section validation issues."""
    section_name: str
    severity: str  # 'ERROR', 'WARNING', 'INFO'
    message: str
    classification: str  # 'required', 'recommended', etc.
    line_number: Optional[int] = None

    def __str__(self) -> str:
        # Compare against None so that a line number of 0 is still shown.
        location = f" (line {self.line_number})" if self.line_number is not None else ""
        return f"[{self.severity}]{location} {self.section_name}: {self.message}"


@dataclass
class SectionMissing(SectionIssue):
    """Section is missing from document."""
    pass


@dataclass
class SectionImproper(SectionIssue):
    """Improper section found in document."""
    pass


@dataclass
class SectionDiscouraged(SectionIssue):
    """Discouraged section found in document."""
    pass


@dataclass
class SectionValidationResult:
    """Result of section validation."""
    issues: List[SectionIssue]
    sections_checked: int
    sections_found: int

    def has_errors(self) -> bool:
        """Check if there are any ERROR-level issues."""
        return any(issue.severity == 'ERROR' for issue in self.issues)

    def has_warnings(self) -> bool:
        """Check if there are any WARNING-level issues."""
        return any(issue.severity == 'WARNING' for issue in self.issues)

    def is_valid(self) -> bool:
        """Check if validation passed (no errors)."""
        return not self.has_errors()

    def get_errors(self) -> List[SectionIssue]:
        """Get all ERROR-level issues (as a new list)."""
        return [issue for issue in self.issues if issue.severity == 'ERROR']

    def get_warnings(self) -> List[SectionIssue]:
        """Get all WARNING-level issues (as a new list)."""
        return [issue for issue in self.issues if issue.severity == 'WARNING']


# classification -> (violated when the section IS present?, issue class,
#                    severity, spec key holding a custom message,
#                    default message template)
# 'optional' is deliberately absent: it triggers no check.
_SECTION_RULES = {
    'required': (False, SectionMissing, 'ERROR',
                 'error_message', '{name} section is required'),
    'improper': (True, SectionImproper, 'ERROR',
                 'error_message', '{name} section must not appear'),
    'recommended': (False, SectionMissing, 'WARNING',
                    'warning_if_missing', '{name} section is recommended'),
    'discouraged': (True, SectionDiscouraged, 'WARNING',
                    'warning_if_present', '{name} section is discouraged'),
}


class SectionValidator:
    """
    Validates section presence and classification compliance.

    Checks that markdown documents have the correct sections based on
    x-markitect-sections classifications in the schema.
    """

    def __init__(self, schema: Dict[str, Any]):
        """
        Initialize validator with a schema.

        Args:
            schema: JSON schema with x-markitect-sections extension
        """
        self.schema = schema
        # ``or {}`` also covers an explicit null for the extension.
        self.sections_spec = schema.get('x-markitect-sections') or {}

    def check(self, document: 'MarkdownDocument') -> SectionValidationResult:
        """
        Validate section presence against schema classifications.

        Classifications are matched case-insensitively; unknown values and
        'optional' produce no issue. Specs that are not dicts are ignored.

        Args:
            document: Parsed markdown document

        Returns:
            SectionValidationResult with any issues found;
            ``sections_checked`` is the number of specs in the schema and
            ``sections_found`` the number of level-2 headings in the document.
        """
        doc_sections = self._get_document_sections(document)
        issues: List[SectionIssue] = []

        for section_name, spec in self.sections_spec.items():
            if not isinstance(spec, dict):
                continue

            classification = str(spec.get('classification', '')).strip().lower()
            rule = _SECTION_RULES.get(classification)
            if rule is None:
                continue

            violated_when_present, issue_cls, severity, message_key, template = rule
            found = self._find_section(section_name, doc_sections, spec)

            # A rule is violated when presence matches its "bad" state.
            if (found is not None) != violated_when_present:
                continue

            issues.append(issue_cls(
                section_name=section_name,
                severity=severity,
                message=spec.get(message_key, template.format(name=section_name)),
                classification=classification,
                line_number=found.get('line_number') if found else None
            ))

        return SectionValidationResult(
            issues=issues,
            sections_checked=len(self.sections_spec),
            sections_found=len(doc_sections)
        )

    @staticmethod
    def _heading_field(heading: Any, key: str, default: Any = None) -> Any:
        """Read ``key`` from a heading given as dict or attribute object."""
        if isinstance(heading, dict):
            return heading.get(key, default)
        return getattr(heading, key, default)

    def _get_document_sections(self, document: 'MarkdownDocument') -> List[Dict[str, Any]]:
        """
        Extract level-2 headings from document.

        Headings may be dicts, plain strings, or objects exposing ``content``
        / ``line_number`` attributes (NOTE(review): the parser's heading type
        is not visible here, hence the tolerant handling).

        Args:
            document: Parsed markdown document

        Returns:
            List of ``{'name', 'line_number'}`` dicts; names are stripped and
            upper-cased for comparison.
        """
        if hasattr(document, 'get_headings_by_level'):
            level_2_headings = document.get_headings_by_level(2)
        elif hasattr(document, 'headings'):
            level_2_headings = [
                h for h in document.headings
                if self._heading_field(h, 'level') == 2
            ]
        else:
            level_2_headings = []

        sections: List[Dict[str, Any]] = []
        for heading in level_2_headings or []:
            if isinstance(heading, str):
                title, line_number = heading, None
            else:
                title = self._heading_field(heading, 'content')
                line_number = self._heading_field(heading, 'line_number')
                if title is None and not isinstance(heading, dict):
                    # Unknown object without a title: nothing to compare.
                    continue
            sections.append({
                'name': str(title or '').strip().upper(),
                'line_number': line_number
            })

        return sections

    def _find_section(self, section_name: str, doc_sections: List[Dict[str, Any]],
                      spec: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """
        Find a section in document, checking alternatives.

        The primary name is tried first, then each entry of
        ``spec['alternatives']`` in order; for duplicate headings the first
        occurrence in the document wins.

        Args:
            section_name: Primary section name to find
            doc_sections: List of sections in document
            spec: Section specification with potential alternatives

        Returns:
            Section dict if found, None otherwise
        """
        by_name: Dict[str, Dict[str, Any]] = {}
        for section in doc_sections:
            by_name.setdefault(section['name'], section)

        for candidate in [section_name, *(spec.get('alternatives') or [])]:
            hit = by_name.get(candidate.upper().strip())
            if hit is not None:
                return hit

        return None
+ +## Current State Assessment + +### ✅ Already Implemented +- **schema-analyze**: Detects rigid constraints, calculates rigidity score (markitect/schema_analyzer.py) +- **schema-refine**: Automatically loosens rigid constraints (markitect/schema_refiner.py) +- **markitect validate**: Validates AST structure against JSON schemas (cli.py:1493-1600) + - Checks headings, paragraphs, code_blocks counts match schema + - Validates document structure against JSON Schema properties + - Does NOT check x-markitect-sections classifications + - Does NOT validate x-markitect-content-control patterns +- **X-Markitect Extensions**: Full system with sections, content-control, metadata +- **Metaschema Validation**: Validates schema structure and extensions +- **4 Production Schemas**: manpage, API docs, terminology, schema-schema +- **Comprehensive Documentation**: User guides, specifications, tests (97 tests passing) + +### ❌ Missing Capabilities (Semantic Validation) +1. **Section Classification Enforcement**: required/recommended/optional/discouraged/improper not checked +2. **Content Pattern Validation**: required_patterns, forbidden_patterns not matched +3. **Quality Metrics Validation**: min_words, max_words, min_sentences not enforced +4. **Link Validation**: Internal/external link checking not implemented +5. 
**Content Instructions**: content_instruction fields defined but not validated + +## What We Have vs What We Need + +**Current `markitect validate`** (Structural): +```bash +markitect validate doc.md --schema schema.json +# ✅ Checks: headings.level_2 has 5-30 items +# ✅ Checks: paragraphs has 10-500 items +# ✅ Checks: code_blocks has 1-50 items +# ❌ Does NOT check: SYNOPSIS section present (required) +# ❌ Does NOT check: INTERNAL_NOTES absent (improper) +# ❌ Does NOT check: Synopsis contains bold command name +# ❌ Does NOT check: Description has min 50 words +``` + +**Enhanced `markitect validate`** (Structural + Semantic): +```bash +markitect validate doc.md --schema manpage-schema-v1.0.md +# ✅ Checks: AST structure (existing) +# ✅ NEW: SYNOPSIS section present (required) +# ✅ NEW: INTERNAL_NOTES not present (improper) +# ✅ NEW: Synopsis contains **command** pattern +# ✅ NEW: Description has 50+ words +# ✅ NEW: No forbidden TODO patterns +``` + +## Implementation Plan + +### Phase 1: Core Semantic Validator + +**Goal**: Create semantic validator to complement existing structural validation + +**New Module**: `markitect/semantic_validator.py` + +**Key Components**: + +```python +class SemanticValidator: + """Validates markdown documents against x-markitect extensions. + + Complements existing SchemaValidator which handles structural AST validation. + This validator checks semantic aspects defined in x-markitect-* extensions. 
+ """ + + def __init__(self, schema_path: str): + # Load schema (supports .md schemas with embedded JSON) + self.schema = load_schema_with_extensions(schema_path) + + # Initialize sub-validators + self.section_validator = SectionValidator(self.schema) + self.content_validator = ContentValidator(self.schema) + self.link_validator = LinkValidator(self.schema) + + def validate(self, document_path: str, check_links: bool = False) -> SemanticValidationReport: + """Main semantic validation entry point.""" + doc = parse_markdown_document(document_path) + + results = { + 'sections': self.section_validator.check(doc), + 'content': self.content_validator.check(doc) + } + + if check_links: + results['links'] = self.link_validator.check(doc) + + return SemanticValidationReport(results) +``` + +**Features**: +- Load schema from registry or filesystem +- Parse markdown document into AST +- Validate sections against x-markitect-sections classifications +- Check content against x-markitect-content-control patterns +- Validate links if enabled +- Generate detailed report with line numbers + +### Phase 2: Section Presence Validator + +**New Module**: `markitect/section_validator.py` + +**Validation Rules**: + +```python +class SectionValidator: + """Validates section presence and classification compliance.""" + + def check(self, document: MarkdownDocument) -> SectionValidationResult: + sections_spec = self.schema.get('x-markitect-sections', {}) + doc_sections = document.get_headings_by_level(2) + + issues = [] + + # Check REQUIRED sections + for section_name, spec in sections_spec.items(): + if spec['classification'] == 'required': + if section_name not in doc_sections: + issues.append(SectionMissing( + section=section_name, + severity='ERROR', + message=spec.get('error_message', f'{section_name} is required') + )) + + # Check IMPROPER sections (must not exist) + for section_name, spec in sections_spec.items(): + if spec['classification'] == 'improper': + if section_name in 
class ContentValidator:
    """Validates content against x-markitect-content-control rules."""

    def check(self, document: MarkdownDocument) -> ContentValidationResult:
        """Check every configured section of *document* against its content rules.

        For each entry of the schema's ``x-markitect-content-control`` mapping
        the section is looked up by the upper-cased key. Sections that are
        absent from the document are skipped here.

        Returns:
            A ContentValidationResult wrapping the collected issues: ERROR
            for missing required patterns and for forbidden patterns that
            matched, WARNING for word counts outside the configured range.
        """
        content_rules = self.schema.get('x-markitect-content-control', {})
        issues = []

        for section_key, rules in content_rules.items():
            section = document.get_section(section_key.upper())
            if not section:
                continue  # Section validator handles missing sections

            text = section.content

            # Check required patterns
            for pattern in rules.get('required_patterns', []):
                if not re.search(pattern, text):
                    issues.append(PatternMissing(
                        section=section.name,
                        pattern=pattern,
                        severity='ERROR'
                    ))

            # Check forbidden patterns
            for pattern in rules.get('forbidden_patterns', []):
                # Keep the match object: the issue reports the offending text.
                match = re.search(pattern, text)
                if match:
                    issues.append(ForbiddenPattern(
                        section=section.name,
                        pattern=pattern,
                        severity='ERROR',
                        matched_text=match.group(0)
                    ))

            # Check content quality (whitespace-separated word count)
            quality = rules.get('content_quality', {})
            word_count = len(text.split())

            if 'min_words' in quality and word_count < quality['min_words']:
                issues.append(ContentTooShort(
                    section=section.name,
                    actual=word_count,
                    required=quality['min_words'],
                    severity='WARNING'
                ))

            if 'max_words' in quality and word_count > quality['max_words']:
                issues.append(ContentTooLong(
                    section=section.name,
                    actual=word_count,
                    limit=quality['max_words'],
                    severity='WARNING'
                ))

        return ContentValidationResult(issues)
@cli.command('validate')
@click.argument('file_path', type=click.Path(exists=True, path_type=Path))
@click.option('--schema', '-s', type=click.Path(exists=True, path_type=Path),
              help='Path to JSON schema file')
@click.option('--schema-json', type=str,
              help='JSON schema provided as a string')
@click.option('--quiet', '-q', is_flag=True,
              help='Only output validation result (true/false)')
@click.option('--detailed-errors', '--errors', is_flag=True,
              help='Show detailed validation errors (Issue #8)')
@click.option('--error-format', type=click.Choice(['text', 'json', 'markdown']), default='text',
              help='Format for detailed error output')
# NEW OPTIONS:
@click.option('--semantic/--no-semantic', default=True,
              help='Enable/disable semantic validation (sections, patterns, quality)')
@click.option('--check-links', is_flag=True,
              help='Enable link validation (may be slow)')
@click.option('--strict', is_flag=True,
              help='Treat warnings as errors')
@pass_config
def validate(config, file_path, schema, schema_json, quiet, detailed_errors, error_format,
             semantic, check_links, strict):
    """
    Validate a markdown file against a JSON schema.

    ENHANCED: Now includes semantic validation of x-markitect extensions:
    - Section classifications (required, recommended, optional, discouraged, improper)
    - Content patterns (required_patterns, forbidden_patterns)
    - Quality metrics (min_words, max_words, min_sentences)
    - Link validation (internal/external)

    Examples:
        # Structural + semantic validation (default)
        markitect validate doc.md --schema manpage-schema-v1.0.md

        # Only structural validation (classic mode)
        markitect validate doc.md --schema schema.json --no-semantic

        # With link checking
        markitect validate doc.md --schema 1 --check-links

        # Strict mode (warnings become errors)
        markitect validate doc.md --schema manpage-schema-v1.0.md --strict
    """
    # Existing structural validation code...
    # (Keep all existing logic for SchemaValidator)

    # NEW: Add semantic validation if enabled and schema has x-markitect extensions.
    # The option is bound to the `schema` parameter (there is no `schema_path`),
    # and it is None when the schema was supplied via --schema-json, in which
    # case there is no schema file for SemanticValidator to load.
    if semantic and schema is not None:
        semantic_validator = SemanticValidator(schema)
        semantic_report = semantic_validator.validate(file_path, check_links=check_links)

        # Combine structural and semantic results
        # (`structural_result` is produced by the existing structural code above)
        combined_report = CombinedValidationReport(structural_result, semantic_report)

        # Output combined results
        if not quiet:
            click.echo(combined_report.format(error_format))

        # Exit codes: errors always fail; warnings fail only under --strict
        if combined_report.has_errors():
            sys.exit(1)
        elif strict and combined_report.has_warnings():
            sys.exit(1)
@cli.command('validate-batch')
@click.argument('directory', type=click.Path(exists=True, file_okay=False))
@click.option('--schema', '-s', type=str, required=True)
@click.option('--pattern', default='*.md', help='File pattern to match')
@click.option('--strict', is_flag=True)
@click.option('--summary-only', is_flag=True, help='Show only summary table')
@pass_config
def validate_batch_cmd(config, directory, schema, pattern, strict, summary_only):
    """Validate multiple documents in a directory.

    Example:
        markitect validate-batch docs/manpages/ --schema manpage-schema-v1.0.md
    """
    # Find all matching documents; sorted so the summary order is stable
    # across runs and platforms (glob order is filesystem-dependent).
    docs = sorted(Path(directory).glob(pattern))

    # The schema is the same for every document, so load it once instead of
    # rebuilding the validator inside the loop.
    validator = SemanticValidator(schema)

    # Validate each
    results = [(doc.name, validator.validate(doc)) for doc in docs]

    # Show summary table
    # NOTE(review): `summary_only` is accepted but not yet forwarded; confirm
    # the display_batch_results signature before wiring it through.
    display_batch_results(results)

    # Exit non-zero when any document failed so CI scripts can rely on the
    # status code; under --strict, warnings count as failures too.
    failed = any(
        report.has_errors() or (strict and report.has_warnings())
        for _, report in results
    )
    if failed:
        raise SystemExit(1)
semantic validation section +- `examples/manpages/README.md` - Add validation examples +- `examples/terminology/README.md` - Add validation examples + +**Reference Files** (unchanged, used for integration): +- `markitect/validator.py` - Existing SchemaValidator for structural validation +- `markitect/schema_analyzer.py` - Reference for schema extension parsing + +## Design Decisions + +### 1. Markdown Parsing +**Decision**: Use existing markdown parser from markitect core +**Rationale**: Already handles frontmatter, sections, AST generation + +### 2. Link Validation Default +**Decision**: Internal links checked by default, external links opt-in +**Rationale**: External link checking is slow (network requests), internal is fast + +### 3. Severity Levels +**Decision**: ERROR (required violations), WARNING (recommended violations), INFO (suggestions) +**Rationale**: Matches schema classification system semantics + +### 4. Exit Codes +**Decision**: 0=success, 1=validation failed, 2=system error +**Rationale**: Standard CLI conventions for CI/CD integration + +### 5. 
Pattern Syntax +**Decision**: Use Python regex patterns directly +**Rationale**: Schemas already use regex strings, no need for new syntax + +## Testing Strategy + +### Unit Tests +- SectionValidator: Test all classification types +- ContentValidator: Test pattern matching, word counts +- LinkValidator: Test internal/external link checking +- ValidationReport: Test formatting and aggregation + +### Integration Tests +- Validate real manpage documents against manpage schema +- Validate terminology documents against terminology schema +- Test batch validation across multiple documents +- Test CLI output formats + +### Edge Cases +- Documents with no schema sections defined +- Schemas with no content-control rules +- Empty documents +- Documents with malformed links +- Unicode in patterns and content + +## User Workflows + +### Workflow 1: Validate Single Document +```bash +# Validate a manpage +markitect validate my-command.1.md --schema manpage-schema-v1.0.md + +# With link checking +markitect validate my-command.1.md --schema 1 --check-links +``` + +### Workflow 2: CI/CD Integration +```bash +#!/bin/bash +# Validate all manpages in CI +if ! markitect validate-batch docs/man/ --schema 1 --strict; then + echo "Manpage validation failed!" + exit 1 +fi +``` + +### Workflow 3: Pre-commit Hook +```bash +# .git/hooks/pre-commit +files=$(git diff --cached --name-only --diff-filter=ACM | grep '\.1\.md$') +for file in $files; do + if ! markitect validate "$file" --schema manpage-schema-v1.0.md; then + echo "Fix validation errors before committing" + exit 1 + fi +done +``` + +### Workflow 4: Interactive Editing +```bash +# Validate while editing +watch -n 2 'markitect validate draft.md --schema api-documentation-schema-v1.0.md' +``` + +## Success Metrics + +1. **Core Functionality**: Can validate documents against all 4 production schemas +2. **Classification Enforcement**: Required/improper sections properly checked +3. 
class TestSectionValidator:
    """Exercise SectionValidator against the x-markitect-sections classifications."""

    @staticmethod
    def _check(schema, headings):
        """Validate a stub document exposing *headings* and return the result."""
        validator = SectionValidator(schema)

        class StubDocument:
            def get_headings_by_level(self, level):
                # Hand out fresh copies on every call, like a literal would.
                return [dict(h) if isinstance(h, dict) else h for h in headings]

        return validator.check(StubDocument())

    def test_required_section_missing(self):
        """A required section absent from the document is reported as an ERROR."""
        spec = {
            'classification': 'required',
            'heading_level': 2,
            'error_message': 'SYNOPSIS section is mandatory'
        }
        outcome = self._check(
            {'x-markitect-sections': {'SYNOPSIS': spec}},
            ['DESCRIPTION', 'EXAMPLES'],  # no SYNOPSIS heading
        )

        # Exactly one error is expected
        assert not outcome.is_valid()
        assert outcome.has_errors()
        assert len(outcome.get_errors()) == 1

        first = outcome.get_errors()[0]
        assert isinstance(first, SectionMissing)
        assert first.section_name == 'SYNOPSIS'
        assert first.severity == 'ERROR'
        assert 'mandatory' in first.message

    def test_improper_section_present(self):
        """An improper section that appears in the document is reported as an ERROR."""
        spec = {
            'classification': 'improper',
            'heading_level': 2,
            'error_message': 'Internal notes must not appear in published docs'
        }
        outcome = self._check(
            {'x-markitect-sections': {'INTERNAL_NOTES': spec}},
            [{'content': 'INTERNAL_NOTES', 'level': 2, 'line_number': 25}],
        )

        # Exactly one error is expected
        assert not outcome.is_valid()
        assert outcome.has_errors()
        assert len(outcome.get_errors()) == 1

        first = outcome.get_errors()[0]
        assert isinstance(first, SectionImproper)
        assert first.section_name == 'INTERNAL_NOTES'
        assert first.severity == 'ERROR'
        assert first.line_number == 25

    def test_recommended_section_missing(self):
        """A missing recommended section yields a WARNING, not a failure."""
        spec = {
            'classification': 'recommended',
            'heading_level': 2,
            'warning_if_missing': 'Examples improve documentation quality'
        }
        outcome = self._check(
            {'x-markitect-sections': {'EXAMPLES': spec}},
            ['SYNOPSIS', 'DESCRIPTION'],  # no EXAMPLES heading
        )

        # Warnings alone do not fail validation
        assert outcome.is_valid()
        assert not outcome.has_errors()
        assert outcome.has_warnings()
        assert len(outcome.get_warnings()) == 1

        first = outcome.get_warnings()[0]
        assert first.section_name == 'EXAMPLES'
        assert first.severity == 'WARNING'

    def test_all_required_sections_present(self):
        """No issues are raised when every required section is present."""
        schema = {
            'x-markitect-sections': {
                'SYNOPSIS': {'classification': 'required', 'heading_level': 2},
                'DESCRIPTION': {'classification': 'required', 'heading_level': 2}
            }
        }
        headings = [
            {'content': title, 'level': 2}
            for title in ('SYNOPSIS', 'DESCRIPTION', 'EXAMPLES')
        ]
        outcome = self._check(schema, headings)

        # Clean pass
        assert outcome.is_valid()
        assert not outcome.has_errors()
        assert not outcome.has_warnings()
        assert len(outcome.issues) == 0

    def test_section_alternatives(self):
        """A heading listed under `alternatives` satisfies the required section."""
        spec = {
            'classification': 'required',
            'heading_level': 2,
            'alternatives': ['FLAGS', 'COMMAND OPTIONS']
        }
        # The document uses the alternative name 'FLAGS'
        outcome = self._check(
            {'x-markitect-sections': {'OPTIONS': spec}},
            [{'content': 'FLAGS', 'level': 2}],
        )

        # The alternative is accepted
        assert outcome.is_valid()
        assert not outcome.has_errors()
class TestContentValidator:
    """Exercise ContentValidator against x-markitect-content-control rules."""

    @staticmethod
    def _check(rules_key, rules, section_name, content):
        """Validate a stub document holding one section and return the result.

        *rules* is registered under *rules_key* in the content-control schema;
        the stub answers only for *section_name* and returns None otherwise.
        """
        validator = ContentValidator({'x-markitect-content-control': {rules_key: rules}})

        class StubDocument:
            def get_section(self, name):
                if name != section_name:
                    return None
                return {'name': section_name, 'content': content}

        return validator.check(StubDocument())

    def test_required_pattern_missing(self):
        """A required pattern that never matches is reported as an ERROR."""
        outcome = self._check(
            'synopsis',
            {'required_patterns': [r'\*\*[a-z][a-z0-9-]*\*\*']},  # bold command name
            'SYNOPSIS',
            'command [options] arguments',  # nothing in bold
        )

        # Exactly one error is expected
        assert not outcome.is_valid()
        assert outcome.has_errors()
        assert len(outcome.get_errors()) == 1

        first = outcome.get_errors()[0]
        assert isinstance(first, PatternMissing)
        assert first.section_name == 'SYNOPSIS'
        assert first.severity == 'ERROR'

    def test_forbidden_pattern_found(self):
        """A forbidden pattern that matches is reported with the matched text."""
        outcome = self._check(
            'description',
            {'forbidden_patterns': [r'\bTODO\b', r'\bFIXME\b']},
            'DESCRIPTION',
            'This is a description. TODO: Add more details.',
        )

        # Exactly one error is expected
        assert not outcome.is_valid()
        assert outcome.has_errors()
        assert len(outcome.get_errors()) == 1

        first = outcome.get_errors()[0]
        assert isinstance(first, ForbiddenPattern)
        assert first.section_name == 'DESCRIPTION'
        assert 'TODO' in first.matched_text

    def test_discouraged_pattern_warning(self):
        """A discouraged pattern that matches yields a WARNING only."""
        outcome = self._check(
            'description',
            {'discouraged_patterns': [r'\bWIP\b']},
            'DESCRIPTION',
            'This is WIP content.',
        )

        # Warnings alone do not fail validation
        assert outcome.is_valid()
        assert not outcome.has_errors()
        assert outcome.has_warnings()

        first = outcome.get_warnings()[0]
        assert isinstance(first, DiscouragedPattern)
        assert first.severity == 'WARNING'

    def test_content_too_short(self):
        """Content below min_words yields a ContentTooShort warning."""
        outcome = self._check(
            'description',
            {'content_quality': {'min_words': 50, 'max_words': 1000}},
            'DESCRIPTION',
            'Short description.',  # only 2 words
        )

        # Warning, but still valid
        assert outcome.is_valid()
        assert outcome.has_warnings()

        first = outcome.get_warnings()[0]
        assert isinstance(first, ContentTooShort)
        assert first.actual == 2
        assert first.required == 50

    def test_content_too_long(self):
        """Content above max_words yields a ContentTooLong warning."""
        fifty_words = ' '.join('word' for _ in range(50))
        outcome = self._check(
            'synopsis',
            {'content_quality': {'min_words': 5, 'max_words': 20}},
            'SYNOPSIS',
            fifty_words,
        )

        # Warning, but still valid
        assert outcome.is_valid()
        assert outcome.has_warnings()

        first = outcome.get_warnings()[0]
        assert isinstance(first, ContentTooLong)
        assert first.actual == 50
        assert first.limit == 20

    def test_all_content_requirements_met(self):
        """No issues are raised when patterns and word counts are satisfied."""
        rules = {
            'required_patterns': [r'\*\*[a-z]+\*\*'],
            'content_quality': {'min_words': 5, 'max_words': 50}
        }
        outcome = self._check(
            'synopsis',
            rules,
            'SYNOPSIS',
            '**command** [options] arguments and more words here',
        )

        # Clean pass
        assert outcome.is_valid()
        assert not outcome.has_errors()
        assert not outcome.has_warnings()
        assert len(outcome.issues) == 0