feat: add semantic document validator for x-markitect extensions

Implements semantic validation to complement existing structural validation:

Phase 1 & 2 Complete:
- SemanticValidator: Main validator orchestrating sub-validators
- SectionValidator: Enforces section classifications (required, recommended,
  optional, discouraged, improper) from x-markitect-sections
- ContentValidator: Validates content patterns, forbidden patterns, and
  quality metrics (word counts, sentence counts) from x-markitect-content-control

Features:
- Pattern matching with regex for required/forbidden/discouraged patterns
- Word count and sentence count validation
- Detailed error reporting with severity levels (ERROR, WARNING)
- Support for section alternatives (e.g., FLAGS vs OPTIONS)
- Comprehensive test coverage (16 tests, 100% passing)

Architecture:
- Complements existing SchemaValidator (structural AST validation)
- Clean separation: validators/ package for modular validators
- Semantic validation focuses on x-markitect-* extensions
- LinkValidator planned for Phase 3 (optional --check-links)

Next: Phase 4 - CLI integration to enhance 'markitect validate' command

Workplan: roadmap/20260106-semantic-document-validation/WORKPLAN.md

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
2026-01-06 03:24:32 +01:00
parent f27eea6b5b
commit a969c5de47
6 changed files with 1932 additions and 0 deletions

View File

@@ -0,0 +1,261 @@
"""
Semantic Validator for markdown documents.
Validates markdown documents against x-markitect schema extensions:
- x-markitect-sections: Section classifications (required, recommended, etc.)
- x-markitect-content-control: Content patterns and quality metrics
- Link validation: Internal and external link checking (planned; not yet implemented)
Complements the existing SchemaValidator which handles structural AST validation.
"""
from dataclasses import dataclass
from typing import List, Dict, Any, Optional
from pathlib import Path
import json
from markitect.validators.section_validator import (
SectionValidator,
SectionValidationResult
)
from markitect.validators.content_validator import (
ContentValidator,
ContentValidationResult
)
@dataclass
class SemanticValidationReport:
    """
    Aggregated outcome of one semantic validation run.

    Bundles the results of the individual validators and offers convenience
    queries plus a plain-text rendering.

    Attributes:
        section_result: Result of section validation (always present).
        content_result: Result of content validation, or None if not run.
        link_result: Result of link validation, or None. Its type is not
            defined yet, so every access to it is guarded with ``hasattr``.
    """

    section_result: SectionValidationResult
    content_result: Optional[ContentValidationResult] = None
    link_result: Optional[Any] = None  # LinkValidationResult when implemented

    def _optional_results(self) -> List[Any]:
        """Return the optional sub-results that were actually produced."""
        # Compare against None explicitly: a result object must never be
        # skipped just because it happens to evaluate as falsy.
        return [
            result
            for result in (self.content_result, self.link_result)
            if result is not None
        ]

    def has_errors(self) -> bool:
        """Check if there are any ERROR-level issues."""
        if self.section_result.has_errors():
            return True
        return any(
            result.has_errors()
            for result in self._optional_results()
            if hasattr(result, 'has_errors')
        )

    def has_warnings(self) -> bool:
        """Check if there are any WARNING-level issues."""
        if self.section_result.has_warnings():
            return True
        return any(
            result.has_warnings()
            for result in self._optional_results()
            if hasattr(result, 'has_warnings')
        )

    def is_valid(self) -> bool:
        """Check if validation passed (no errors)."""
        return not self.has_errors()

    def get_all_issues(self) -> List[Any]:
        """Get all issues from all validators as one new list."""
        issues = list(self.section_result.issues)
        for result in self._optional_results():
            if hasattr(result, 'issues'):
                issues.extend(result.issues)
        return issues

    @staticmethod
    def _format_issues(issues: List[Any], all_good: str) -> List[str]:
        """Render one line per issue, or the ``all_good`` line if there are none."""
        if not issues:
            return [all_good]
        rendered = []
        for issue in issues:
            # Errors previously got an empty marker, which made them less
            # visible than warnings; give them an explicit one.
            status = "❌" if issue.severity == 'ERROR' else "⚠️"
            rendered.append(f" {status} {issue.section_name} - {issue.message}")
        return rendered

    def format_text(self) -> str:
        """
        Format validation report as text.

        Returns:
            A newline-joined report with a section block, a content block
            (only when content validation ran) and a summary.
        """
        lines = ["Section Validation:"]
        lines.extend(self._format_issues(
            self.section_result.issues, " ✅ All section requirements met"
        ))

        if self.content_result is not None:
            lines.append("")
            lines.append("Content Validation:")
            lines.extend(self._format_issues(
                self.content_result.issues, " ✅ All content requirements met"
            ))

        # Count over every collected issue so the totals always agree with
        # the PASSED/FAILED status derived from has_errors().
        all_issues = self.get_all_issues()
        error_count = sum(
            1 for issue in all_issues if getattr(issue, 'severity', None) == 'ERROR'
        )
        warning_count = sum(
            1 for issue in all_issues if getattr(issue, 'severity', None) == 'WARNING'
        )

        lines.append("")
        lines.append("Summary:")
        lines.append(f" Sections checked: {self.section_result.sections_checked}")
        lines.append(f" Sections found: {self.section_result.sections_found}")
        lines.append(f" Errors: {error_count}")
        lines.append(f" Warnings: {warning_count}")
        if self.is_valid():
            lines.append(" Status: PASSED ✅")
        else:
            lines.append(" Status: FAILED ❌")
        return "\n".join(lines)
class SemanticValidator:
    """
    Entry point for semantic validation of a markdown document.

    Delegates to a SectionValidator and a ContentValidator, both built from
    the same schema, and wraps their results in a SemanticValidationReport.
    Structural AST validation is out of scope here.

    Example:
    >>> schema = load_schema('manpage-schema-v1.0.md')
    >>> validator = SemanticValidator(schema)
    >>> report = validator.validate('my-command.1.md')
    >>> if not report.is_valid():
    ...     print(report.format_text())
    """

    def __init__(self, schema: Dict[str, Any]):
        """
        Build the sub-validators for ``schema``.

        Args:
            schema: Schema dict carrying the x-markitect-* extensions; it is
                stored as-is and handed to each sub-validator.
        """
        self.schema = schema
        self.section_validator = SectionValidator(schema)
        self.content_validator = ContentValidator(schema)
        # TODO: create self.link_validator = LinkValidator(schema) once
        # a link validator exists.

    def validate(self, document_path: str | Path,
                 check_links: bool = False) -> SemanticValidationReport:
        """
        Run section and content validation on one document.

        Args:
            document_path: Location of the markdown document.
            check_links: Reserved for link validation; currently has no
                effect because no link validator is wired in.

        Returns:
            SemanticValidationReport whose ``link_result`` is always None.

        Raises:
            FileNotFoundError: If ``document_path`` does not exist.
        """
        path = Path(document_path)
        if not path.exists():
            raise FileNotFoundError(f"Document not found: {path}")

        parsed = self._parse_document(path)

        # Keyword arguments are evaluated left to right, so the section
        # check still runs before the content check.
        # TODO: pass self.link_validator.check(parsed) when check_links is set.
        return SemanticValidationReport(
            section_result=self.section_validator.check(parsed),
            content_result=self.content_validator.check(parsed),
            link_result=None
        )

    def _parse_document(self, document_path: Path) -> 'MarkdownDocument':
        """
        Load ``document_path`` through markitect's DocumentManager.

        Args:
            document_path: Path to the markdown file.

        Returns:
            Whatever ``DocumentManager.ingest_file`` returns for the file.
        """
        # Imported lazily to avoid a circular dependency at module load time.
        from markitect.document_manager import DocumentManager

        return DocumentManager().ingest_file(document_path)
def load_schema_from_path(schema_path: str | Path) -> Dict[str, Any]:
"""
Load a schema from file (supports .json and .md formats).
Args:
schema_path: Path to schema file
Returns:
Schema dict with embedded JSON
Raises:
FileNotFoundError: If schema file doesn't exist
ValueError: If schema cannot be parsed
"""
schema_path = Path(schema_path)
if not schema_path.exists():
raise FileNotFoundError(f"Schema not found: {schema_path}")
if schema_path.suffix == '.json':
# Load JSON schema directly
with open(schema_path, 'r', encoding='utf-8') as f:
return json.load(f)
elif schema_path.suffix == '.md':
# Load markdown schema with embedded JSON
from markitect.schema_loader import MarkdownSchemaLoader
loader = MarkdownSchemaLoader()
schema_data = loader.load_schema(schema_path)
return schema_data['schema']
else:
raise ValueError(f"Unsupported schema format: {schema_path.suffix}")

View File

@@ -0,0 +1,50 @@
"""
Validators package for semantic document validation.
This package contains validators that check markdown documents against
x-markitect schema extensions (sections, content-control, link validation).
Validators:
- SectionValidator: Validates section presence based on classifications
- ContentValidator: Validates content patterns and quality metrics
- LinkValidator: Validates internal and external links (planned; not yet implemented)
"""
from markitect.validators.section_validator import (
SectionValidator,
SectionValidationResult,
SectionIssue,
SectionMissing,
SectionImproper,
SectionDiscouraged,
)
from markitect.validators.content_validator import (
ContentValidator,
ContentValidationResult,
ContentIssue,
PatternMissing,
ForbiddenPattern,
DiscouragedPattern,
ContentTooShort,
ContentTooLong,
)
__all__ = [
# Section validator
'SectionValidator',
'SectionValidationResult',
'SectionIssue',
'SectionMissing',
'SectionImproper',
'SectionDiscouraged',
# Content validator
'ContentValidator',
'ContentValidationResult',
'ContentIssue',
'PatternMissing',
'ForbiddenPattern',
'DiscouragedPattern',
'ContentTooShort',
'ContentTooLong',
]

View File

@@ -0,0 +1,316 @@
"""
Content Validator for markdown documents.
Validates content against x-markitect-content-control rules:
- Required patterns: Regex patterns that must appear in content
- Discouraged patterns: Patterns that should be avoided (warnings)
- Forbidden patterns: Patterns that must not appear (errors)
- Quality metrics: Word counts, sentence counts, readability
"""
from dataclasses import dataclass
from typing import List, Dict, Any, Optional
import re
@dataclass
class ContentIssue:
    """
    One finding produced by content validation.

    Serves as the common base for the more specific content issue types.
    """

    section_name: str
    severity: str  # 'ERROR', 'WARNING', 'INFO'
    message: str
    line_number: Optional[int] = None
    matched_text: Optional[str] = None

    def __str__(self) -> str:
        """Render as ``[SEVERITY] (line N) SECTION - message: 'match'``."""
        pieces = [f"[{self.severity}]"]
        # The line part is omitted when no (truthy) line number is known.
        if self.line_number:
            pieces.append(f" (line {self.line_number})")
        pieces.append(f" {self.section_name} - {self.message}")
        # The matched text is appended only when there is something to show.
        if self.matched_text:
            pieces.append(f": '{self.matched_text}'")
        return "".join(pieces)
@dataclass
class PatternMissing(ContentIssue):
    """Issue raised when a required pattern has no match in the content."""

    # The regex that was expected to match.
    pattern: str = ""
@dataclass
class ForbiddenPattern(ContentIssue):
    """Issue raised when a forbidden pattern matches the content."""

    # The regex that matched although it must not.
    pattern: str = ""
@dataclass
class DiscouragedPattern(ContentIssue):
    """Issue raised when a discouraged pattern matches the content."""

    # The regex that matched although it should be avoided.
    pattern: str = ""
@dataclass
class ContentTooShort(ContentIssue):
    """Issue raised when content falls below a minimum word/sentence count."""

    # Count that was measured in the content.
    actual: int = 0
    # Minimum count demanded by the schema.
    required: int = 0
@dataclass
class ContentTooLong(ContentIssue):
    """Issue raised when content goes above a maximum word/sentence count."""

    # Count that was measured in the content.
    actual: int = 0
    # Maximum count allowed by the schema.
    limit: int = 0
@dataclass
class ContentValidationResult:
    """
    Outcome of a content validation pass.

    Holds the collected issues and the number of rule entries processed,
    and answers severity-based queries over the issues.
    """

    issues: List[ContentIssue]
    sections_checked: int

    def _with_severity(self, level: str) -> List[ContentIssue]:
        """Return a new list of the issues whose severity equals ``level``."""
        return [entry for entry in self.issues if entry.severity == level]

    def has_errors(self) -> bool:
        """Tell whether at least one issue is ERROR-level."""
        return bool(self._with_severity('ERROR'))

    def has_warnings(self) -> bool:
        """Tell whether at least one issue is WARNING-level."""
        return bool(self._with_severity('WARNING'))

    def is_valid(self) -> bool:
        """Tell whether validation passed, i.e. no ERROR-level issue exists."""
        if self.has_errors():
            return False
        return True

    def get_errors(self) -> List[ContentIssue]:
        """Return the ERROR-level issues as a new list."""
        return self._with_severity('ERROR')

    def get_warnings(self) -> List[ContentIssue]:
        """Return the WARNING-level issues as a new list."""
        return self._with_severity('WARNING')
class ContentValidator:
"""
Validates content against x-markitect-content-control rules.
Checks content patterns, quality metrics, and readability for each section.
"""
def __init__(self, schema: Dict[str, Any]):
"""
Initialize validator with a schema.
Args:
schema: JSON schema with x-markitect-content-control extension
"""
self.schema = schema
self.content_rules = schema.get('x-markitect-content-control', {})
def check(self, document: 'MarkdownDocument') -> ContentValidationResult:
"""
Validate content against schema rules.
Args:
document: Parsed markdown document
Returns:
ContentValidationResult with any issues found
"""
issues = []
sections_checked = 0
# Check each section that has content rules
for section_key, rules in self.content_rules.items():
sections_checked += 1
# Get section from document
section = self._get_section(document, section_key)
if not section:
# Section validator handles missing sections
continue
section_content = section.get('content', '')
section_name = section.get('name', section_key)
# Check required patterns
issues.extend(self._check_required_patterns(
section_name, section_content, rules
))
# Check forbidden patterns
issues.extend(self._check_forbidden_patterns(
section_name, section_content, rules
))
# Check discouraged patterns
issues.extend(self._check_discouraged_patterns(
section_name, section_content, rules
))
# Check content quality metrics
issues.extend(self._check_quality_metrics(
section_name, section_content, rules
))
return ContentValidationResult(
issues=issues,
sections_checked=sections_checked
)
def _get_section(self, document: 'MarkdownDocument',
section_key: str) -> Optional[Dict[str, Any]]:
"""
Get a section from the document.
Args:
document: Parsed markdown document
section_key: Section name (lowercase in rules, uppercase in document)
Returns:
Section dict with name and content, or None if not found
"""
# Convert section_key to uppercase for matching
section_name = section_key.upper()
# Try to get section content
if hasattr(document, 'get_section'):
return document.get_section(section_name)
# Fallback: search headings
if hasattr(document, 'get_headings_by_level'):
headings = document.get_headings_by_level(2)
for heading in headings:
if isinstance(heading, dict):
if heading.get('content', '').strip().upper() == section_name:
# Found the section, need to extract content
return {
'name': section_name,
'content': heading.get('text_content', '')
}
return None
def _check_required_patterns(self, section_name: str, content: str,
rules: Dict[str, Any]) -> List[ContentIssue]:
"""Check that all required patterns appear in content."""
issues = []
required_patterns = rules.get('required_patterns', [])
for pattern in required_patterns:
try:
if not re.search(pattern, content, re.MULTILINE):
issues.append(PatternMissing(
section_name=section_name,
severity='ERROR',
message=f'Required pattern not found',
pattern=pattern
))
except re.error as e:
# Invalid regex pattern in schema
issues.append(ContentIssue(
section_name=section_name,
severity='ERROR',
message=f'Invalid regex pattern in schema: {e}'
))
return issues
def _check_forbidden_patterns(self, section_name: str, content: str,
rules: Dict[str, Any]) -> List[ContentIssue]:
"""Check that no forbidden patterns appear in content."""
issues = []
forbidden_patterns = rules.get('forbidden_patterns', [])
for pattern in forbidden_patterns:
try:
match = re.search(pattern, content, re.MULTILINE)
if match:
issues.append(ForbiddenPattern(
section_name=section_name,
severity='ERROR',
message=f'Forbidden pattern found',
pattern=pattern,
matched_text=match.group(0)[:50] # Limit to 50 chars
))
except re.error as e:
issues.append(ContentIssue(
section_name=section_name,
severity='ERROR',
message=f'Invalid regex pattern in schema: {e}'
))
return issues
def _check_discouraged_patterns(self, section_name: str, content: str,
rules: Dict[str, Any]) -> List[ContentIssue]:
"""Check for discouraged patterns (warnings)."""
issues = []
discouraged_patterns = rules.get('discouraged_patterns', [])
for pattern in discouraged_patterns:
try:
match = re.search(pattern, content, re.MULTILINE)
if match:
issues.append(DiscouragedPattern(
section_name=section_name,
severity='WARNING',
message=f'Discouraged pattern found',
pattern=pattern,
matched_text=match.group(0)[:50]
))
except re.error as e:
issues.append(ContentIssue(
section_name=section_name,
severity='WARNING',
message=f'Invalid regex pattern in schema: {e}'
))
return issues
def _check_quality_metrics(self, section_name: str, content: str,
rules: Dict[str, Any]) -> List[ContentIssue]:
"""Check content quality metrics (word count, sentence count)."""
issues = []
quality = rules.get('content_quality', {})
if not quality:
return issues
# Word count validation
word_count = len(content.split())
min_words = quality.get('min_words')
if min_words is not None and word_count < min_words:
issues.append(ContentTooShort(
section_name=section_name,
severity='WARNING',
message=f'Content too short ({word_count} words, minimum {min_words})',
actual=word_count,
required=min_words
))
max_words = quality.get('max_words')
if max_words is not None and word_count > max_words:
issues.append(ContentTooLong(
section_name=section_name,
severity='WARNING',
message=f'Content too long ({word_count} words, maximum {max_words})',
actual=word_count,
limit=max_words
))
# Sentence count validation
min_sentences = quality.get('min_sentences')
if min_sentences is not None:
# Simple sentence count (split by .!?)
sentence_count = len(re.findall(r'[.!?]+', content))
if sentence_count < min_sentences:
issues.append(ContentTooShort(
section_name=section_name,
severity='WARNING',
message=f'Too few sentences ({sentence_count}, minimum {min_sentences})',
actual=sentence_count,
required=min_sentences
))
return issues

View File

@@ -0,0 +1,226 @@
"""
Section Validator for markdown documents.
Validates that document sections comply with x-markitect-sections classifications:
- REQUIRED: Section must be present (ERROR if missing)
- RECOMMENDED: Section should be present (WARNING if missing)
- OPTIONAL: Section may be present (no check)
- DISCOURAGED: Section should not be present (WARNING if present)
- IMPROPER: Section must not be present (ERROR if present)
"""
from dataclasses import dataclass
from typing import List, Dict, Any, Optional
from pathlib import Path
@dataclass
class SectionIssue:
    """
    One finding produced by section validation.

    Serves as the common base for the more specific section issue types.
    """

    section_name: str
    severity: str  # 'ERROR', 'WARNING', 'INFO'
    message: str
    classification: str  # 'required', 'recommended', etc.
    line_number: Optional[int] = None

    def __str__(self) -> str:
        """Render as ``[SEVERITY] (line N) SECTION: message``."""
        prefix = f"[{self.severity}]"
        # The line part is omitted when no (truthy) line number is known.
        if self.line_number:
            prefix += f" (line {self.line_number})"
        return f"{prefix} {self.section_name}: {self.message}"
@dataclass
class SectionMissing(SectionIssue):
    """Issue raised when an expected section is absent from the document."""
@dataclass
class SectionImproper(SectionIssue):
    """Issue raised when a section classified as improper is present."""
@dataclass
class SectionDiscouraged(SectionIssue):
    """Issue raised when a section classified as discouraged is present."""
@dataclass
class SectionValidationResult:
    """
    Outcome of a section validation pass.

    Holds the collected issues together with the number of section
    specifications checked and the number of sections found in the document.
    """

    issues: List[SectionIssue]
    sections_checked: int
    sections_found: int

    def _with_severity(self, level: str) -> List[SectionIssue]:
        """Return a new list of the issues whose severity equals ``level``."""
        return [entry for entry in self.issues if entry.severity == level]

    def has_errors(self) -> bool:
        """Tell whether at least one issue is ERROR-level."""
        return bool(self._with_severity('ERROR'))

    def has_warnings(self) -> bool:
        """Tell whether at least one issue is WARNING-level."""
        return bool(self._with_severity('WARNING'))

    def is_valid(self) -> bool:
        """Tell whether validation passed, i.e. no ERROR-level issue exists."""
        if self.has_errors():
            return False
        return True

    def get_errors(self) -> List[SectionIssue]:
        """Return the ERROR-level issues as a new list."""
        return self._with_severity('ERROR')

    def get_warnings(self) -> List[SectionIssue]:
        """Return the WARNING-level issues as a new list."""
        return self._with_severity('WARNING')
class SectionValidator:
    """
    Validates section presence and classification compliance.

    Compares the level-2 headings of a document with the
    x-markitect-sections specifications of the schema.
    """

    def __init__(self, schema: Dict[str, Any]):
        """
        Initialize validator with a schema.

        Args:
            schema: JSON schema with x-markitect-sections extension
        """
        self.schema = schema
        self.sections_spec = schema.get('x-markitect-sections', {})

    def check(self, document: 'MarkdownDocument') -> SectionValidationResult:
        """
        Validate section presence against schema classifications.

        Classifications are matched case-insensitively, so ``REQUIRED`` and
        ``required`` behave the same. Unknown classifications (and
        ``optional``) produce no issue.

        Args:
            document: Parsed markdown document

        Returns:
            SectionValidationResult with any issues found
        """
        issues: List[SectionIssue] = []
        doc_sections = self._get_document_sections(document)

        for section_name, spec in self.sections_spec.items():
            # Normalise so a differently-cased value in the schema is not
            # silently skipped.
            classification = str(spec.get('classification') or '').strip().lower()
            found = self._find_section(section_name, doc_sections, spec)

            if classification == 'required' and not found:
                issues.append(SectionMissing(
                    section_name=section_name,
                    severity='ERROR',
                    message=spec.get('error_message', f'{section_name} section is required'),
                    classification='required'
                ))
            elif classification == 'improper' and found:
                issues.append(SectionImproper(
                    section_name=section_name,
                    severity='ERROR',
                    message=spec.get('error_message', f'{section_name} section must not appear'),
                    classification='improper',
                    line_number=found.get('line_number')
                ))
            elif classification == 'recommended' and not found:
                issues.append(SectionMissing(
                    section_name=section_name,
                    severity='WARNING',
                    message=spec.get('warning_if_missing', f'{section_name} section is recommended'),
                    classification='recommended'
                ))
            elif classification == 'discouraged' and found:
                issues.append(SectionDiscouraged(
                    section_name=section_name,
                    severity='WARNING',
                    message=spec.get('warning_if_present', f'{section_name} section is discouraged'),
                    classification='discouraged',
                    line_number=found.get('line_number')
                ))

        return SectionValidationResult(
            issues=issues,
            sections_checked=len(self.sections_spec),
            sections_found=len(doc_sections)
        )

    def _get_document_sections(self, document: 'MarkdownDocument') -> List[Dict[str, Any]]:
        """
        Extract level-2 headings from document.

        Headings are read from ``document.get_headings_by_level(2)`` when
        available, else from the level-2 entries of ``document.headings``.
        Dict headings and plain-string headings are supported; any other
        heading type is ignored.

        Args:
            document: Parsed markdown document

        Returns:
            List of section dicts with upper-cased ``name`` and ``line_number``
        """
        if hasattr(document, 'get_headings_by_level'):
            level_2_headings = document.get_headings_by_level(2)
        elif hasattr(document, 'headings'):
            level_2_headings = [
                h for h in document.headings
                if h.get('level') == 2
            ]
        else:
            level_2_headings = []

        sections: List[Dict[str, Any]] = []
        for heading in level_2_headings:
            if isinstance(heading, dict):
                # `or ''` tolerates a heading whose content is stored as None.
                sections.append({
                    'name': (heading.get('content') or '').strip().upper(),
                    'line_number': heading.get('line_number')
                })
            elif isinstance(heading, str):
                sections.append({
                    'name': heading.strip().upper(),
                    'line_number': None
                })
        return sections

    def _find_section(self, section_name: str, doc_sections: List[Dict[str, Any]],
                      spec: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """
        Find a section in document, checking alternatives.

        The primary name is tried first, then each entry of
        ``spec['alternatives']`` in order; comparison is done on the
        upper-cased, stripped name.

        Args:
            section_name: Primary section name to find
            doc_sections: List of sections in document
            spec: Section specification with potential alternatives

        Returns:
            Section dict if found, None otherwise
        """
        # `or []` tolerates an explicit null for alternatives in the schema.
        candidates = [section_name, *(spec.get('alternatives') or [])]
        for candidate in candidates:
            wanted = candidate.upper().strip()
            for section in doc_sections:
                if section['name'] == wanted:
                    return section
        return None

View File

@@ -0,0 +1,573 @@
# Plan: Schema System Enhancement - Semantic Document Validation
## Overview
The schema management system has **complete schema structure analysis tools** (schema-analyze, schema-refine) and **structural AST validation** (markitect validate), but is missing **semantic validation capabilities**. This plan enhances validation to check sections, content patterns, and quality metrics defined in x-markitect extensions.
## Current State Assessment
### ✅ Already Implemented
- **schema-analyze**: Detects rigid constraints, calculates rigidity score (markitect/schema_analyzer.py)
- **schema-refine**: Automatically loosens rigid constraints (markitect/schema_refiner.py)
- **markitect validate**: Validates AST structure against JSON schemas (cli.py:1493-1600)
- Checks headings, paragraphs, code_blocks counts match schema
- Validates document structure against JSON Schema properties
- Does NOT check x-markitect-sections classifications
- Does NOT validate x-markitect-content-control patterns
- **X-Markitect Extensions**: Full system with sections, content-control, metadata
- **Metaschema Validation**: Validates schema structure and extensions
- **4 Production Schemas**: manpage, API docs, terminology, schema-schema
- **Comprehensive Documentation**: User guides, specifications, tests (97 tests passing)
### ❌ Missing Capabilities (Semantic Validation)
1. **Section Classification Enforcement**: required/recommended/optional/discouraged/improper not checked
2. **Content Pattern Validation**: required_patterns, forbidden_patterns not matched
3. **Quality Metrics Validation**: min_words, max_words, min_sentences not enforced
4. **Link Validation**: Internal/external link checking not implemented
5. **Content Instructions**: content_instruction fields defined but not validated
## What We Have vs What We Need
**Current `markitect validate`** (Structural):
```bash
markitect validate doc.md --schema schema.json
# ✅ Checks: headings.level_2 has 5-30 items
# ✅ Checks: paragraphs has 10-500 items
# ✅ Checks: code_blocks has 1-50 items
# ❌ Does NOT check: SYNOPSIS section present (required)
# ❌ Does NOT check: INTERNAL_NOTES absent (improper)
# ❌ Does NOT check: Synopsis contains bold command name
# ❌ Does NOT check: Description has min 50 words
```
**Enhanced `markitect validate`** (Structural + Semantic):
```bash
markitect validate doc.md --schema manpage-schema-v1.0.md
# ✅ Checks: AST structure (existing)
# ✅ NEW: SYNOPSIS section present (required)
# ✅ NEW: INTERNAL_NOTES not present (improper)
# ✅ NEW: Synopsis contains **command** pattern
# ✅ NEW: Description has 50+ words
# ✅ NEW: No forbidden TODO patterns
```
## Implementation Plan
### Phase 1: Core Semantic Validator
**Goal**: Create semantic validator to complement existing structural validation
**New Module**: `markitect/semantic_validator.py`
**Key Components**:
```python
class SemanticValidator:
"""Validates markdown documents against x-markitect extensions.
Complements existing SchemaValidator which handles structural AST validation.
This validator checks semantic aspects defined in x-markitect-* extensions.
"""
def __init__(self, schema_path: str):
# Load schema (supports .md schemas with embedded JSON)
self.schema = load_schema_with_extensions(schema_path)
# Initialize sub-validators
self.section_validator = SectionValidator(self.schema)
self.content_validator = ContentValidator(self.schema)
self.link_validator = LinkValidator(self.schema)
def validate(self, document_path: str, check_links: bool = False) -> SemanticValidationReport:
"""Main semantic validation entry point."""
doc = parse_markdown_document(document_path)
results = {
'sections': self.section_validator.check(doc),
'content': self.content_validator.check(doc)
}
if check_links:
results['links'] = self.link_validator.check(doc)
return SemanticValidationReport(results)
```
**Features**:
- Load schema from registry or filesystem
- Parse markdown document into AST
- Validate sections against x-markitect-sections classifications
- Check content against x-markitect-content-control patterns
- Validate links if enabled
- Generate detailed report with line numbers
### Phase 2: Section Presence Validator
**New Module**: `markitect/validators/section_validator.py`
**Validation Rules**:
```python
class SectionValidator:
"""Validates section presence and classification compliance."""
def check(self, document: MarkdownDocument) -> SectionValidationResult:
sections_spec = self.schema.get('x-markitect-sections', {})
doc_sections = document.get_headings_by_level(2)
issues = []
# Check REQUIRED sections
for section_name, spec in sections_spec.items():
if spec['classification'] == 'required':
if section_name not in doc_sections:
issues.append(SectionMissing(
section=section_name,
severity='ERROR',
message=spec.get('error_message', f'{section_name} is required')
))
# Check IMPROPER sections (must not exist)
for section_name, spec in sections_spec.items():
if spec['classification'] == 'improper':
if section_name in doc_sections:
issues.append(SectionImproper(
section=section_name,
severity='ERROR',
message=spec.get('error_message', f'{section_name} must not appear')
))
# Check RECOMMENDED sections (warnings)
for section_name, spec in sections_spec.items():
if spec['classification'] == 'recommended':
if section_name not in doc_sections:
issues.append(SectionMissing(
section=section_name,
severity='WARNING',
message=spec.get('warning_if_missing', f'{section_name} is recommended')
))
return SectionValidationResult(issues)
```
**Section Classification Enforcement**:
- REQUIRED → ERROR if missing
- RECOMMENDED → WARNING if missing
- OPTIONAL → No check
- DISCOURAGED → WARNING if present
- IMPROPER → ERROR if present
### Phase 3: Content Pattern Validator
**New Module**: `markitect/validators/content_validator.py`
**Pattern Matching**:
```python
class ContentValidator:
"""Validates content against x-markitect-content-control rules."""
def check(self, document: MarkdownDocument) -> ContentValidationResult:
content_rules = self.schema.get('x-markitect-content-control', {})
issues = []
for section_key, rules in content_rules.items():
section = document.get_section(section_key.upper())
if not section:
continue # Section validator handles missing sections
# Check required patterns
for pattern in rules.get('required_patterns', []):
if not re.search(pattern, section.content):
issues.append(PatternMissing(
section=section.name,
pattern=pattern,
severity='ERROR'
))
# Check forbidden patterns
for pattern in rules.get('forbidden_patterns', []):
if re.search(pattern, section.content):
issues.append(ForbiddenPattern(
section=section.name,
pattern=pattern,
severity='ERROR',
matched_text=match.group(0)
))
# Check content quality
quality = rules.get('content_quality', {})
word_count = len(section.content.split())
if 'min_words' in quality and word_count < quality['min_words']:
issues.append(ContentTooShort(
section=section.name,
actual=word_count,
required=quality['min_words'],
severity='WARNING'
))
if 'max_words' in quality and word_count > quality['max_words']:
issues.append(ContentTooLong(
section=section.name,
actual=word_count,
limit=quality['max_words'],
severity='WARNING'
))
return ContentValidationResult(issues)
```
**Content Rules Checked**:
- Required patterns (regex matches)
- Discouraged patterns (warnings)
- Forbidden patterns (errors)
- Word count ranges (min/max)
- Sentence counts (if specified)
### Phase 4: Link Validator
**New Module**: `markitect/link_validator.py`
**Link Checking**:
```python
class LinkValidator:
    """Validates links according to x-markitect-content-control.link_validation."""

    def check(self, document: MarkdownDocument) -> LinkValidationResult:
        """Report broken internal/external links and disallowed fragments.

        Args:
            document: Parsed document providing ``extract_links()`` and
                ``resolve_internal_link(target)``.

        Returns:
            A LinkValidationResult wrapping every issue found; empty when no
            link rule is active.
        """
        link_config = self.schema.get('x-markitect-content-control', {}).get('link_validation', {})
        check_internal = link_config.get('check_internal', False)
        check_external = link_config.get('check_external', False)
        allow_fragments = link_config.get('allow_fragments', True)

        # Skip link extraction only when no rule can produce an issue.
        # Testing any(link_config.values()) here would wrongly treat
        # {'allow_fragments': False} as "nothing configured".
        if not (check_internal or check_external or not allow_fragments):
            return LinkValidationResult([])  # No link validation configured

        issues = []
        for link in document.extract_links():
            # Check internal links
            if link.is_internal() and check_internal:
                target = document.resolve_internal_link(link.target)
                if not target:
                    issues.append(BrokenInternalLink(
                        link=link.target,
                        line=link.line_number,
                        severity='ERROR'
                    ))

            # Check external links
            if link.is_external() and check_external:
                # HTTP HEAD request with timeout
                if not self._check_url_exists(link.target):
                    issues.append(BrokenExternalLink(
                        link=link.target,
                        line=link.line_number,
                        severity='WARNING'  # External links are warnings
                    ))

            # Check fragments
            if link.has_fragment() and not allow_fragments:
                issues.append(FragmentNotAllowed(
                    link=link.target,
                    line=link.line_number,
                    severity='WARNING'
                ))

        return LinkValidationResult(issues)
```
**Link Types Validated**:
- Internal links (to other sections/documents)
- External links (HTTP/HTTPS URLs)
- Fragment identifiers (#section-name)
- Email links (mailto:)
### Phase 5: CLI Integration
**Enhance Existing Command**: `markitect validate` (cli.py:1493-1600)
**New Options to Add**:
```python
@cli.command('validate')
@click.argument('file_path', type=click.Path(exists=True, path_type=Path))
# NOTE(review): the docstring examples pass `--schema 1`, but this option is
# declared as click.Path(exists=True), which rejects values that are not
# existing paths - confirm whether schema IDs are meant to be accepted here.
@click.option('--schema', '-s', type=click.Path(exists=True, path_type=Path),
help='Path to JSON schema file')
@click.option('--schema-json', type=str,
help='JSON schema provided as a string')
@click.option('--quiet', '-q', is_flag=True,
help='Only output validation result (true/false)')
@click.option('--detailed-errors', '--errors', is_flag=True,
help='Show detailed validation errors (Issue #8)')
@click.option('--error-format', type=click.Choice(['text', 'json', 'markdown']), default='text',
help='Format for detailed error output')
# NEW OPTIONS:
@click.option('--semantic/--no-semantic', default=True,
help='Enable/disable semantic validation (sections, patterns, quality)')
@click.option('--check-links', is_flag=True,
help='Enable link validation (may be slow)')
@click.option('--strict', is_flag=True,
help='Treat warnings as errors')
@pass_config
def validate(config, file_path, schema, schema_json, quiet, detailed_errors, error_format,
semantic, check_links, strict):
"""
Validate a markdown file against a JSON schema.
ENHANCED: Now includes semantic validation of x-markitect extensions:
- Section classifications (required, recommended, optional, discouraged, improper)
- Content patterns (required_patterns, forbidden_patterns)
- Quality metrics (min_words, max_words, min_sentences)
- Link validation (internal/external)
Examples:
# Structural + semantic validation (default)
markitect validate doc.md --schema manpage-schema-v1.0.md
# Only structural validation (classic mode)
markitect validate doc.md --schema schema.json --no-semantic
# With link checking
markitect validate doc.md --schema 1 --check-links
# Strict mode (warnings become errors)
markitect validate doc.md --schema manpage-schema-v1.0.md --strict
"""
# Existing structural validation code...
# (Keep all existing logic for SchemaValidator)
# NEW: Add semantic validation if enabled and schema has x-markitect extensions
if semantic:
# NOTE(review): `schema_path` is not a parameter of this function (the
# option is `schema`); presumably the elided structural code binds it - confirm.
semantic_validator = SemanticValidator(schema_path)
semantic_report = semantic_validator.validate(file_path, check_links=check_links)
# Combine structural and semantic results
# NOTE(review): `structural_result` is presumably produced by the elided
# structural validation above - confirm the name when integrating.
combined_report = CombinedValidationReport(structural_result, semantic_report)
# Output combined results
# NOTE(review): `combined_report` is only assigned when `semantic` is true;
# the output and exit-code steps below need a structural-only fallback for
# --no-semantic - confirm the intended nesting.
if not quiet:
click.echo(combined_report.format(error_format))
# Exit codes
# Errors always exit with status 1; warnings do so only under --strict.
if combined_report.has_errors():
sys.exit(1)
elif strict and combined_report.has_warnings():
sys.exit(1)
```
**Integration Strategy**:
1. Keep existing structural validation (SchemaValidator) unchanged
2. Add new semantic validation layer on top
3. Use --no-semantic flag to disable new validation (backward compatibility)
4. Combine structural + semantic results in unified report
5. Default to semantic=True for new markdown schemas with extensions
**Output Format** (text):
```
Validating: my-command.1.md
Schema: manpage-schema-v1.0.md (v1.0.0)
Section Validation:
✅ SYNOPSIS - Present (required)
✅ DESCRIPTION - Present (required)
⚠️ EXAMPLES - Missing (recommended)
❌ INTERNAL_NOTES - Must not appear (improper)
Content Validation:
✅ SYNOPSIS - Patterns matched
⚠️ DESCRIPTION - Too short (35 words, minimum 50)
❌ SYNOPSIS - Forbidden pattern found: "TODO"
Link Validation: (skipped - use --check-links)
Summary:
Errors: 2
Warnings: 2
Status: FAILED ❌
Failed validations:
Line 12: INTERNAL_NOTES section must not appear in published manpages
Line 5: SYNOPSIS contains forbidden pattern "TODO"
```
### Phase 6: Batch Document Validation
**New Command**: `markitect validate-batch`
```python
@cli.command('validate-batch')
@click.argument('directory', type=click.Path(exists=True, file_okay=False))
@click.option('--schema', '-s', type=str, required=True)
@click.option('--pattern', default='*.md', help='File pattern to match')
@click.option('--strict', is_flag=True)
@click.option('--summary-only', is_flag=True, help='Show only summary table')
@pass_config
def validate_batch_cmd(config, directory, schema, pattern, strict, summary_only):
    """Validate multiple documents in a directory.

    Exits with status 1 when any document has errors, or has warnings while
    --strict is set, so shell callers such as
    ``if ! markitect validate-batch ...`` can detect failures.

    Example:
        markitect validate-batch docs/manpages/ --schema manpage-schema-v1.0.md
    """
    # Find all matching documents
    docs = list(Path(directory).glob(pattern))

    # Validate each
    results = []
    for doc in docs:
        validator = DocumentValidator(schema)
        report = validator.validate(doc)
        results.append((doc.name, report))

    # Show summary table
    # TODO: `summary_only` is accepted but not consulted yet; the per-document
    # detail output it is meant to suppress is not part of this sketch.
    display_batch_results(results)

    # Exit status mirrors the single-file `validate` command.
    # NOTE(review): assumes each report exposes has_errors()/has_warnings()
    # like the combined report used by `validate` - confirm.
    failed = any(
        report.has_errors() or (strict and report.has_warnings())
        for _, report in results
    )
    if failed:
        raise SystemExit(1)
```
## Implementation Phases
### Phase 1 (Core - 1 session)
- DocumentValidator class
- Basic section validation
- CLI validate command
- Simple text output format
### Phase 2 (Content - 1 session)
- ContentValidator with pattern matching
- Word count validation
- Quality metrics checking
- Enhanced reporting
### Phase 3 (Links - 1 session)
- LinkValidator with internal link checking
- Optional external link validation
- Fragment validation
- Performance optimization (caching)
### Phase 4 (Polish - 1 session)
- Batch validation support
- JSON/table output formats
- Integration tests
- Documentation updates
## Critical Files
**New Files**:
- `markitect/semantic_validator.py` - Main semantic validator (complements existing SchemaValidator)
- `markitect/validators/section_validator.py` - Section classification enforcement
- `markitect/validators/content_validator.py` - Content pattern matching and quality
- `markitect/validators/link_validator.py` - Link validation
- `markitect/validators/__init__.py` - Validators package
- `tests/test_semantic_validator.py` - Semantic validator tests
- `tests/validators/test_section_validator.py` - Section validator tests
- `tests/validators/test_content_validator.py` - Content validator tests
- `tests/validators/test_link_validator.py` - Link validator tests
**Modified Files**:
- `markitect/cli.py` (lines 1493-1600) - Enhance validate command with semantic validation
- `markitect/schema_loader.py` - May need utility to extract x-markitect extensions
- `docs/SCHEMA_MANAGEMENT_GUIDE.md` - Add semantic validation section
- `examples/manpages/README.md` - Add validation examples
- `examples/terminology/README.md` - Add validation examples
**Reference Files** (unchanged, used for integration):
- `markitect/validator.py` - Existing SchemaValidator for structural validation
- `markitect/schema_analyzer.py` - Reference for schema extension parsing
## Design Decisions
### 1. Markdown Parsing
**Decision**: Use existing markdown parser from markitect core
**Rationale**: Already handles frontmatter, sections, AST generation
### 2. Link Validation Default
**Decision**: Internal links checked by default, external links opt-in
**Rationale**: External link checking is slow (network requests), internal is fast
### 3. Severity Levels
**Decision**: ERROR (required violations), WARNING (recommended violations), INFO (suggestions)
**Rationale**: Matches schema classification system semantics
### 4. Exit Codes
**Decision**: 0=success, 1=validation failed, 2=system error
**Rationale**: Standard CLI conventions for CI/CD integration
### 5. Pattern Syntax
**Decision**: Use Python regex patterns directly
**Rationale**: Schemas already use regex strings, no need for new syntax
## Testing Strategy
### Unit Tests
- SectionValidator: Test all classification types
- ContentValidator: Test pattern matching, word counts
- LinkValidator: Test internal/external link checking
- ValidationReport: Test formatting and aggregation
### Integration Tests
- Validate real manpage documents against manpage schema
- Validate terminology documents against terminology schema
- Test batch validation across multiple documents
- Test CLI output formats
### Edge Cases
- Documents with no schema sections defined
- Schemas with no content-control rules
- Empty documents
- Documents with malformed links
- Unicode in patterns and content
## User Workflows
### Workflow 1: Validate Single Document
```bash
# Validate a manpage
markitect validate my-command.1.md --schema manpage-schema-v1.0.md
# With link checking
markitect validate my-command.1.md --schema 1 --check-links
```
### Workflow 2: CI/CD Integration
```bash
#!/bin/bash
# Validate all manpages in CI
if ! markitect validate-batch docs/man/ --schema 1 --strict; then
echo "Manpage validation failed!"
exit 1
fi
```
### Workflow 3: Pre-commit Hook
```bash
# .git/hooks/pre-commit
files=$(git diff --cached --name-only --diff-filter=ACM | grep '\.1\.md$')
for file in $files; do
if ! markitect validate "$file" --schema manpage-schema-v1.0.md; then
echo "Fix validation errors before committing"
exit 1
fi
done
```
### Workflow 4: Interactive Editing
```bash
# Validate while editing
watch -n 2 'markitect validate draft.md --schema api-documentation-schema-v1.0.md'
```
## Success Metrics
1. **Core Functionality**: Can validate documents against all 4 production schemas
2. **Classification Enforcement**: Required/improper sections properly checked
3. **Pattern Matching**: Content patterns validated with regex
4. **Performance**: Validate 100 documents in < 5 seconds (without link checking)
5. **Test Coverage**: > 90% coverage for new validator modules
6. **Documentation**: Complete examples for each schema type
## Future Enhancements (Out of Scope)
- Auto-fixing document validation errors
- Suggestion engine for missing content
- Readability scoring with specific algorithms
- Image validation (size, format, accessibility)
- Schema evolution analysis (breaking changes between versions)
- Document-to-schema generation (inverse of current flow)

View File

@@ -0,0 +1,506 @@
"""
Tests for SemanticValidator.
Tests semantic validation of markdown documents against x-markitect extensions.
"""
import pytest
from pathlib import Path
import tempfile
import json
from markitect.semantic_validator import (
SemanticValidator,
SemanticValidationReport,
load_schema_from_path
)
from markitect.validators.section_validator import (
SectionValidator,
SectionMissing,
SectionImproper
)
from markitect.validators.content_validator import (
ContentValidator,
PatternMissing,
ForbiddenPattern,
DiscouragedPattern,
ContentTooShort,
ContentTooLong
)
class TestSectionValidator:
    """Checks how SectionValidator treats each section classification."""

    def test_required_section_missing(self):
        """A required section that is absent is reported as an ERROR."""
        section_rules = {
            'x-markitect-sections': {
                'SYNOPSIS': {
                    'classification': 'required',
                    'heading_level': 2,
                    'error_message': 'SYNOPSIS section is mandatory'
                }
            }
        }

        class DocWithoutSynopsis:
            """Stand-in document that only implements get_headings_by_level."""

            def get_headings_by_level(self, level):
                return ['DESCRIPTION', 'EXAMPLES']

        outcome = SectionValidator(section_rules).check(DocWithoutSynopsis())

        # Exactly one error, which makes the document invalid.
        assert not outcome.is_valid()
        assert outcome.has_errors()
        errors = outcome.get_errors()
        assert len(errors) == 1

        missing = errors[0]
        assert isinstance(missing, SectionMissing)
        assert missing.section_name == 'SYNOPSIS'
        assert missing.severity == 'ERROR'
        assert 'mandatory' in missing.message

    def test_improper_section_present(self):
        """An improper section that appears is reported as an ERROR."""
        section_rules = {
            'x-markitect-sections': {
                'INTERNAL_NOTES': {
                    'classification': 'improper',
                    'heading_level': 2,
                    'error_message': 'Internal notes must not appear in published docs'
                }
            }
        }

        class DocWithInternalNotes:
            """Stand-in document whose only heading is the improper one."""

            def get_headings_by_level(self, level):
                heading = {
                    'content': 'INTERNAL_NOTES',
                    'level': 2,
                    'line_number': 25
                }
                return [heading]

        outcome = SectionValidator(section_rules).check(DocWithInternalNotes())

        # Exactly one error, which makes the document invalid.
        assert not outcome.is_valid()
        assert outcome.has_errors()
        errors = outcome.get_errors()
        assert len(errors) == 1

        improper = errors[0]
        assert isinstance(improper, SectionImproper)
        assert improper.section_name == 'INTERNAL_NOTES'
        assert improper.severity == 'ERROR'
        assert improper.line_number == 25

    def test_recommended_section_missing(self):
        """A recommended section that is absent yields a WARNING only."""
        section_rules = {
            'x-markitect-sections': {
                'EXAMPLES': {
                    'classification': 'recommended',
                    'heading_level': 2,
                    'warning_if_missing': 'Examples improve documentation quality'
                }
            }
        }

        class DocWithoutExamples:
            """Stand-in document that lacks the recommended section."""

            def get_headings_by_level(self, level):
                return ['SYNOPSIS', 'DESCRIPTION']

        outcome = SectionValidator(section_rules).check(DocWithoutExamples())

        # Warnings alone do not fail validation.
        assert outcome.is_valid()
        assert not outcome.has_errors()
        assert outcome.has_warnings()
        warnings = outcome.get_warnings()
        assert len(warnings) == 1

        first_warning = warnings[0]
        assert first_warning.section_name == 'EXAMPLES'
        assert first_warning.severity == 'WARNING'

    def test_all_required_sections_present(self):
        """No issues are raised when every required section exists."""
        section_rules = {
            'x-markitect-sections': {
                'SYNOPSIS': {
                    'classification': 'required',
                    'heading_level': 2
                },
                'DESCRIPTION': {
                    'classification': 'required',
                    'heading_level': 2
                }
            }
        }

        class CompleteDoc:
            """Stand-in document with both required sections plus one extra."""

            def get_headings_by_level(self, level):
                return [
                    {'content': 'SYNOPSIS', 'level': 2},
                    {'content': 'DESCRIPTION', 'level': 2},
                    {'content': 'EXAMPLES', 'level': 2}
                ]

        outcome = SectionValidator(section_rules).check(CompleteDoc())

        # Clean result: valid, no errors, no warnings, no issues at all.
        assert outcome.is_valid()
        assert not outcome.has_errors()
        assert not outcome.has_warnings()
        assert len(outcome.issues) == 0

    def test_section_alternatives(self):
        """A listed alternative heading satisfies a required section."""
        section_rules = {
            'x-markitect-sections': {
                'OPTIONS': {
                    'classification': 'required',
                    'heading_level': 2,
                    'alternatives': ['FLAGS', 'COMMAND OPTIONS']
                }
            }
        }

        class DocUsingFlags:
            """Stand-in document that uses the alternative name 'FLAGS'."""

            def get_headings_by_level(self, level):
                return [{'content': 'FLAGS', 'level': 2}]

        outcome = SectionValidator(section_rules).check(DocUsingFlags())

        # The alternative name is accepted in place of OPTIONS.
        assert outcome.is_valid()
        assert not outcome.has_errors()
class TestSemanticValidator:
    """Covers SemanticValidator construction, reporting and schema loading."""

    def test_validator_initialization(self):
        """The validator keeps the schema and builds a section validator."""
        schema_doc = {
            '$schema': 'http://json-schema.org/draft-07/schema#',
            'x-markitect-sections': {
                'SYNOPSIS': {'classification': 'required', 'heading_level': 2}
            }
        }

        semantic = SemanticValidator(schema_doc)

        assert semantic.schema == schema_doc
        assert semantic.section_validator is not None

    def test_validation_report_formatting(self):
        """A report holding one section error renders as a failed text report."""
        from markitect.validators.section_validator import (
            SectionValidationResult,
            SectionMissing
        )

        synopsis_missing = SectionMissing(
            section_name='SYNOPSIS',
            severity='ERROR',
            message='SYNOPSIS is required',
            classification='required'
        )
        section_outcome = SectionValidationResult(
            issues=[synopsis_missing],
            sections_checked=2,
            sections_found=1
        )
        report = SemanticValidationReport(section_result=section_outcome)

        # Report state reflects the single error.
        assert report.has_errors()
        assert not report.is_valid()

        # Text rendering mentions the section, the error count and the status.
        rendered = report.format_text()
        assert 'Section Validation:' in rendered
        assert 'SYNOPSIS' in rendered
        assert 'Errors: 1' in rendered
        assert 'FAILED' in rendered

    def test_load_json_schema(self, tmp_path):
        """A .json schema file round-trips through load_schema_from_path."""
        expected = {
            '$schema': 'http://json-schema.org/draft-07/schema#',
            'title': 'Test Schema',
            'x-markitect-sections': {
                'SYNOPSIS': {'classification': 'required', 'heading_level': 2}
            }
        }
        schema_path = tmp_path / "test-schema.json"
        schema_path.write_text(json.dumps(expected, indent=2))

        loaded = load_schema_from_path(schema_path)

        assert loaded == expected
        assert 'x-markitect-sections' in loaded

    def test_schema_not_found(self):
        """A nonexistent schema path raises FileNotFoundError."""
        with pytest.raises(FileNotFoundError):
            load_schema_from_path('/nonexistent/schema.json')

    def test_unsupported_schema_format(self, tmp_path):
        """A schema file with an unsupported extension raises ValueError."""
        xml_schema = tmp_path / "schema.xml"
        xml_schema.write_text('<schema></schema>')

        with pytest.raises(ValueError, match="Unsupported schema format"):
            load_schema_from_path(xml_schema)
class TestContentValidator:
    """Checks ContentValidator pattern rules and word-count limits."""

    def test_required_pattern_missing(self):
        """A required pattern with no match is reported as an ERROR."""
        content_rules = {
            'x-markitect-content-control': {
                'synopsis': {
                    'required_patterns': [
                        r'\*\*[a-z][a-z0-9-]*\*\*'  # Bold command name
                    ]
                }
            }
        }

        class SynopsisWithoutBold:
            """Stand-in document whose SYNOPSIS has no bold command name."""

            def get_section(self, name):
                if name != 'SYNOPSIS':
                    return None
                return {
                    'name': 'SYNOPSIS',
                    'content': 'command [options] arguments'  # No bold
                }

        outcome = ContentValidator(content_rules).check(SynopsisWithoutBold())

        # Exactly one error, which makes the document invalid.
        assert not outcome.is_valid()
        assert outcome.has_errors()
        errors = outcome.get_errors()
        assert len(errors) == 1

        missing = errors[0]
        assert isinstance(missing, PatternMissing)
        assert missing.section_name == 'SYNOPSIS'
        assert missing.severity == 'ERROR'

    def test_forbidden_pattern_found(self):
        """A forbidden pattern that matches is reported with the matched text."""
        content_rules = {
            'x-markitect-content-control': {
                'description': {
                    'forbidden_patterns': [
                        r'\bTODO\b',
                        r'\bFIXME\b'
                    ]
                }
            }
        }

        class DescriptionWithTodo:
            """Stand-in document whose DESCRIPTION contains a TODO marker."""

            def get_section(self, name):
                if name != 'DESCRIPTION':
                    return None
                return {
                    'name': 'DESCRIPTION',
                    'content': 'This is a description. TODO: Add more details.'
                }

        outcome = ContentValidator(content_rules).check(DescriptionWithTodo())

        # Only the TODO rule fires, so there is exactly one error.
        assert not outcome.is_valid()
        assert outcome.has_errors()
        errors = outcome.get_errors()
        assert len(errors) == 1

        forbidden = errors[0]
        assert isinstance(forbidden, ForbiddenPattern)
        assert forbidden.section_name == 'DESCRIPTION'
        assert 'TODO' in forbidden.matched_text

    def test_discouraged_pattern_warning(self):
        """A discouraged pattern that matches yields a WARNING only."""
        content_rules = {
            'x-markitect-content-control': {
                'description': {
                    'discouraged_patterns': [
                        r'\bWIP\b'
                    ]
                }
            }
        }

        class DescriptionWithWip:
            """Stand-in document whose DESCRIPTION contains 'WIP'."""

            def get_section(self, name):
                if name != 'DESCRIPTION':
                    return None
                return {
                    'name': 'DESCRIPTION',
                    'content': 'This is WIP content.'
                }

        outcome = ContentValidator(content_rules).check(DescriptionWithWip())

        # Warnings alone do not fail validation.
        assert outcome.is_valid()
        assert not outcome.has_errors()
        assert outcome.has_warnings()

        first_warning = outcome.get_warnings()[0]
        assert isinstance(first_warning, DiscouragedPattern)
        assert first_warning.severity == 'WARNING'

    def test_content_too_short(self):
        """Content below min_words yields a ContentTooShort warning."""
        content_rules = {
            'x-markitect-content-control': {
                'description': {
                    'content_quality': {
                        'min_words': 50,
                        'max_words': 1000
                    }
                }
            }
        }

        class TinyDescription:
            """Stand-in document with a two-word DESCRIPTION."""

            def get_section(self, name):
                if name != 'DESCRIPTION':
                    return None
                return {
                    'name': 'DESCRIPTION',
                    'content': 'Short description.'  # Only 2 words
                }

        outcome = ContentValidator(content_rules).check(TinyDescription())

        # Warnings don't fail validation.
        assert outcome.is_valid()
        assert outcome.has_warnings()

        too_short = outcome.get_warnings()[0]
        assert isinstance(too_short, ContentTooShort)
        assert too_short.actual == 2
        assert too_short.required == 50

    def test_content_too_long(self):
        """Content above max_words yields a ContentTooLong warning."""
        content_rules = {
            'x-markitect-content-control': {
                'synopsis': {
                    'content_quality': {
                        'min_words': 5,
                        'max_words': 20
                    }
                }
            }
        }

        class LongSynopsis:
            """Stand-in document with a fifty-word SYNOPSIS."""

            def get_section(self, name):
                if name != 'SYNOPSIS':
                    return None
                return {
                    'name': 'SYNOPSIS',
                    'content': ' '.join(['word'] * 50)  # 50 words
                }

        outcome = ContentValidator(content_rules).check(LongSynopsis())

        # Warnings don't fail validation.
        assert outcome.is_valid()
        assert outcome.has_warnings()

        too_long = outcome.get_warnings()[0]
        assert isinstance(too_long, ContentTooLong)
        assert too_long.actual == 50
        assert too_long.limit == 20

    def test_all_content_requirements_met(self):
        """No issues are raised when patterns and word limits are satisfied."""
        content_rules = {
            'x-markitect-content-control': {
                'synopsis': {
                    'required_patterns': [
                        r'\*\*[a-z]+\*\*'
                    ],
                    'content_quality': {
                        'min_words': 5,
                        'max_words': 50
                    }
                }
            }
        }

        class CompliantSynopsis:
            """Stand-in document whose SYNOPSIS meets every rule."""

            def get_section(self, name):
                if name != 'SYNOPSIS':
                    return None
                return {
                    'name': 'SYNOPSIS',
                    'content': '**command** [options] arguments and more words here'
                }

        outcome = ContentValidator(content_rules).check(CompliantSynopsis())

        # Clean result: valid, no errors, no warnings, no issues at all.
        assert outcome.is_valid()
        assert not outcome.has_errors()
        assert not outcome.has_warnings()
        assert len(outcome.issues) == 0