feat: add semantic document validator for x-markitect extensions
Implements semantic validation to complement existing structural validation: Phase 1 & 2 Complete: - SemanticValidator: Main validator orchestrating sub-validators - SectionValidator: Enforces section classifications (required, recommended, optional, discouraged, improper) from x-markitect-sections - ContentValidator: Validates content patterns, forbidden patterns, and quality metrics (word counts, sentence counts) from x-markitect-content-control Features: - Pattern matching with regex for required/forbidden/discouraged patterns - Word count and sentence count validation - Detailed error reporting with severity levels (ERROR, WARNING) - Support for section alternatives (e.g., FLAGS vs OPTIONS) - Comprehensive test coverage (16 tests, 100% passing) Architecture: - Complements existing SchemaValidator (structural AST validation) - Clean separation: validators/ package for modular validators - Semantic validation focuses on x-markitect-* extensions - LinkValidator planned for Phase 3 (optional --check-links) Next: Phase 4 - CLI integration to enhance 'markitect validate' command Workplan: roadmap/20260106-semantic-document-validation/WORKPLAN.md 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
261
markitect/semantic_validator.py
Normal file
261
markitect/semantic_validator.py
Normal file
@@ -0,0 +1,261 @@
|
|||||||
|
"""
|
||||||
|
Semantic Validator for markdown documents.
|
||||||
|
|
||||||
|
Validates markdown documents against x-markitect schema extensions:
|
||||||
|
- x-markitect-sections: Section classifications (required, recommended, etc.)
|
||||||
|
- x-markitect-content-control: Content patterns and quality metrics
|
||||||
|
- Link validation: Internal and external link checking (planned; not yet implemented)
|
||||||
|
|
||||||
|
Complements the existing SchemaValidator which handles structural AST validation.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import List, Dict, Any, Optional
|
||||||
|
from pathlib import Path
|
||||||
|
import json
|
||||||
|
|
||||||
|
from markitect.validators.section_validator import (
|
||||||
|
SectionValidator,
|
||||||
|
SectionValidationResult
|
||||||
|
)
|
||||||
|
from markitect.validators.content_validator import (
|
||||||
|
ContentValidator,
|
||||||
|
ContentValidationResult
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class SemanticValidationReport:
    """
    Aggregated outcome of one semantic validation run.

    Bundles the section result with the optional content and link results
    and offers combined queries plus a plain-text rendering.

    Optional results are duck-typed: a result only contributes to a query
    when it is truthy and exposes the attribute that query needs
    (``has_errors``, ``has_warnings`` or ``issues``).
    """
    # Result of section validation; always present.
    section_result: SectionValidationResult
    # Result of content validation, or None when it was not run.
    content_result: Optional[ContentValidationResult] = None
    link_result: Optional[Any] = None  # LinkValidationResult when implemented

    def _optional_results(self) -> List[Any]:
        """Return the optional results that are set, in reporting order."""
        return [
            result
            for result in (self.content_result, self.link_result)
            if result
        ]

    def has_errors(self) -> bool:
        """Check if any validator reported an ERROR-level issue."""
        errors = self.section_result.has_errors()
        for result in self._optional_results():
            if hasattr(result, 'has_errors'):
                errors = errors or result.has_errors()
        return errors

    def has_warnings(self) -> bool:
        """Check if any validator reported a WARNING-level issue."""
        warnings = self.section_result.has_warnings()
        for result in self._optional_results():
            if hasattr(result, 'has_warnings'):
                warnings = warnings or result.has_warnings()
        return warnings

    def is_valid(self) -> bool:
        """Check if validation passed (no errors)."""
        return not self.has_errors()

    def get_all_issues(self) -> List[Any]:
        """Return a new list with the issues of every validator, sections first."""
        issues = list(self.section_result.issues)
        for result in self._optional_results():
            if hasattr(result, 'issues'):
                issues.extend(result.issues)
        return issues

    @staticmethod
    def _format_issue(issue: Any) -> str:
        """Render one issue as a report line (cross for ERROR, warning sign otherwise)."""
        status = "❌" if getattr(issue, 'severity', None) == 'ERROR' else "⚠️"
        section_name = getattr(issue, 'section_name', None)
        message = getattr(issue, 'message', None)
        if section_name is None or message is None:
            # Issue type without the section/message fields: fall back to str().
            return f" {status} {issue}"
        return f" {status} {section_name} - {message}"

    def _format_issue_lines(self, issues: List[Any], all_clear: str) -> List[str]:
        """Render a group of issues, or the ``all_clear`` line when there are none."""
        if not issues:
            return [all_clear]
        return [self._format_issue(issue) for issue in issues]

    def format_text(self) -> str:
        """
        Format the validation report as human-readable text.

        The error and warning totals are derived from ``get_all_issues()``
        so they always cover the same validators that ``has_errors()`` /
        ``is_valid()`` consult (including the link result when present).

        Returns:
            Multi-line string with one block per validator and a summary.
        """
        lines = ["Section Validation:"]
        lines.extend(self._format_issue_lines(
            self.section_result.issues,
            " ✅ All section requirements met",
        ))

        if self.content_result:
            lines.extend(["", "Content Validation:"])
            lines.extend(self._format_issue_lines(
                getattr(self.content_result, 'issues', []),
                " ✅ All content requirements met",
            ))

        if self.link_result and hasattr(self.link_result, 'issues'):
            lines.extend(["", "Link Validation:"])
            lines.extend(self._format_issue_lines(
                self.link_result.issues,
                " ✅ All link requirements met",
            ))

        all_issues = self.get_all_issues()
        error_count = sum(
            1 for issue in all_issues
            if getattr(issue, 'severity', None) == 'ERROR'
        )
        warning_count = sum(
            1 for issue in all_issues
            if getattr(issue, 'severity', None) == 'WARNING'
        )

        lines.extend([
            "",
            "Summary:",
            f" Sections checked: {self.section_result.sections_checked}",
            f" Sections found: {self.section_result.sections_found}",
            f" Errors: {error_count}",
            f" Warnings: {warning_count}",
            " Status: PASSED ✅" if self.is_valid() else " Status: FAILED ❌",
        ])

        return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
|
class SemanticValidator:
    """
    Orchestrates semantic validation of a markdown document.

    Delegates to a SectionValidator and a ContentValidator built from the
    same schema and bundles their results in a SemanticValidationReport.
    Structural AST validation is the job of the existing SchemaValidator;
    this class only looks at the x-markitect-* extensions.

    Example:
        >>> schema = load_schema_from_path('manpage-schema-v1.0.md')
        >>> validator = SemanticValidator(schema)
        >>> report = validator.validate('my-command.1.md')
        >>> if not report.is_valid():
        ...     print(report.format_text())
    """

    def __init__(self, schema: Dict[str, Any]):
        """
        Build the sub-validators for ``schema``.

        Args:
            schema: Schema dict (loaded from JSON or from markdown with
                embedded JSON) carrying x-markitect-sections and/or
                x-markitect-content-control.
        """
        self.schema = schema
        self.section_validator = SectionValidator(schema)
        self.content_validator = ContentValidator(schema)
        # TODO: create a LinkValidator here once it is implemented.

    def validate(self, document_path: str | Path,
                 check_links: bool = False) -> SemanticValidationReport:
        """
        Validate one markdown document against the schema extensions.

        Args:
            document_path: Location of the markdown document.
            check_links: Reserved for link validation. Currently ignored:
                the report's ``link_result`` is always None.

        Returns:
            SemanticValidationReport with the section and content results.

        Raises:
            FileNotFoundError: If ``document_path`` does not exist.
            Errors raised while parsing the document propagate unchanged.
        """
        path = Path(document_path)
        if not path.exists():
            raise FileNotFoundError(f"Document not found: {path}")

        parsed = self._parse_document(path)

        # Sections are checked before content, matching the report layout.
        sections = self.section_validator.check(parsed)
        content = self.content_validator.check(parsed)

        # TODO: run the link validator when check_links is set, once it exists.
        return SemanticValidationReport(
            section_result=sections,
            content_result=content,
            link_result=None,
        )

    def _parse_document(self, document_path: Path) -> 'MarkdownDocument':
        """
        Parse the markdown file with markitect's DocumentManager.

        Args:
            document_path: Path to the markdown file.

        Returns:
            Whatever ``DocumentManager.ingest_file`` returns for the file.
        """
        # Imported lazily to avoid a circular dependency.
        from markitect.document_manager import DocumentManager

        return DocumentManager().ingest_file(document_path)
|
||||||
|
|
||||||
|
|
||||||
|
def load_schema_from_path(schema_path: str | Path) -> Dict[str, Any]:
    """
    Load a schema from file (supports .json and .md formats).

    The file extension is matched case-insensitively, so ``schema.JSON``
    and ``schema.MD`` are accepted as well.

    Args:
        schema_path: Path to schema file

    Returns:
        The schema dict: the parsed JSON document for ``.json`` files, or
        the ``'schema'`` entry of what MarkdownSchemaLoader returns for
        ``.md`` files.

    Raises:
        FileNotFoundError: If schema file doesn't exist
        ValueError: If the extension is unsupported or the JSON is
            malformed (``json.JSONDecodeError`` is a ``ValueError``)
    """
    schema_path = Path(schema_path)

    if not schema_path.exists():
        raise FileNotFoundError(f"Schema not found: {schema_path}")

    # Normalize once so '.JSON' / '.Md' are not rejected as unsupported.
    suffix = schema_path.suffix.lower()

    if suffix == '.json':
        # Load JSON schema directly
        with open(schema_path, 'r', encoding='utf-8') as f:
            return json.load(f)

    if suffix == '.md':
        # Load markdown schema with embedded JSON (lazy project import)
        from markitect.schema_loader import MarkdownSchemaLoader

        loader = MarkdownSchemaLoader()
        schema_data = loader.load_schema(schema_path)
        return schema_data['schema']

    # Report the suffix exactly as given so the message matches the file name.
    raise ValueError(f"Unsupported schema format: {schema_path.suffix}")
|
||||||
50
markitect/validators/__init__.py
Normal file
50
markitect/validators/__init__.py
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
"""
|
||||||
|
Validators package for semantic document validation.
|
||||||
|
|
||||||
|
This package contains validators that check markdown documents against
|
||||||
|
x-markitect schema extensions (sections, content-control, link validation).
|
||||||
|
|
||||||
|
Validators:
|
||||||
|
- SectionValidator: Validates section presence based on classifications
|
||||||
|
- ContentValidator: Validates content patterns and quality metrics
|
||||||
|
- LinkValidator: Validates internal and external links (planned; not yet implemented or exported)
|
||||||
|
"""
|
||||||
|
|
||||||
|
from markitect.validators.section_validator import (
|
||||||
|
SectionValidator,
|
||||||
|
SectionValidationResult,
|
||||||
|
SectionIssue,
|
||||||
|
SectionMissing,
|
||||||
|
SectionImproper,
|
||||||
|
SectionDiscouraged,
|
||||||
|
)
|
||||||
|
|
||||||
|
from markitect.validators.content_validator import (
|
||||||
|
ContentValidator,
|
||||||
|
ContentValidationResult,
|
||||||
|
ContentIssue,
|
||||||
|
PatternMissing,
|
||||||
|
ForbiddenPattern,
|
||||||
|
DiscouragedPattern,
|
||||||
|
ContentTooShort,
|
||||||
|
ContentTooLong,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
# Section validator
|
||||||
|
'SectionValidator',
|
||||||
|
'SectionValidationResult',
|
||||||
|
'SectionIssue',
|
||||||
|
'SectionMissing',
|
||||||
|
'SectionImproper',
|
||||||
|
'SectionDiscouraged',
|
||||||
|
# Content validator
|
||||||
|
'ContentValidator',
|
||||||
|
'ContentValidationResult',
|
||||||
|
'ContentIssue',
|
||||||
|
'PatternMissing',
|
||||||
|
'ForbiddenPattern',
|
||||||
|
'DiscouragedPattern',
|
||||||
|
'ContentTooShort',
|
||||||
|
'ContentTooLong',
|
||||||
|
]
|
||||||
316
markitect/validators/content_validator.py
Normal file
316
markitect/validators/content_validator.py
Normal file
@@ -0,0 +1,316 @@
|
|||||||
|
"""
|
||||||
|
Content Validator for markdown documents.
|
||||||
|
|
||||||
|
Validates content against x-markitect-content-control rules:
|
||||||
|
- Required patterns: Regex patterns that must appear in content
|
||||||
|
- Discouraged patterns: Patterns that should be avoided (warnings)
|
||||||
|
- Forbidden patterns: Patterns that must not appear (errors)
|
||||||
|
- Quality metrics: Word counts and sentence counts (readability checks are not implemented yet)
|
||||||
|
"""
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import List, Dict, Any, Optional
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ContentIssue:
    """
    A single finding produced by content validation.

    Serves as the common base for the more specific issue types and is
    also used directly (for example for invalid schema patterns).
    """
    section_name: str
    severity: str  # 'ERROR', 'WARNING', 'INFO'
    message: str
    line_number: Optional[int] = None
    matched_text: Optional[str] = None

    def __str__(self) -> str:
        """Render as ``[SEVERITY] (line N) SECTION - message: 'match'``.

        The line and match parts are omitted when the field is unset/empty.
        """
        rendered = f"[{self.severity}]"
        if self.line_number:
            rendered += f" (line {self.line_number})"
        rendered += f" {self.section_name} - {self.message}"
        if self.matched_text:
            rendered += f": '{self.matched_text}'"
        return rendered
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class PatternMissing(ContentIssue):
    """Issue for a required regex pattern that has no match in the content."""
    pattern: str = ""  # the required regex that was not found
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ForbiddenPattern(ContentIssue):
    """Issue for a forbidden regex pattern that matched the content."""
    pattern: str = ""  # the forbidden regex that matched
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class DiscouragedPattern(ContentIssue):
    """Issue for a discouraged regex pattern that matched the content."""
    pattern: str = ""  # the discouraged regex that matched
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ContentTooShort(ContentIssue):
    """Issue for content below a minimum word or sentence count."""
    actual: int = 0    # count measured in the content
    required: int = 0  # minimum demanded by the schema
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ContentTooLong(ContentIssue):
    """Issue for content above a maximum word or sentence count."""
    actual: int = 0  # count measured in the content
    limit: int = 0   # maximum allowed by the schema
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ContentValidationResult:
    """Outcome of a content validation run: the issues plus a section tally."""
    issues: List[ContentIssue]
    sections_checked: int

    def has_errors(self) -> bool:
        """Return True when at least one issue has severity 'ERROR'."""
        for entry in self.issues:
            if entry.severity == 'ERROR':
                return True
        return False

    def has_warnings(self) -> bool:
        """Return True when at least one issue has severity 'WARNING'."""
        for entry in self.issues:
            if entry.severity == 'WARNING':
                return True
        return False

    def is_valid(self) -> bool:
        """Return True when no ERROR-level issue was recorded."""
        if self.has_errors():
            return False
        return True

    def get_errors(self) -> List[ContentIssue]:
        """Return a new list holding only the ERROR-level issues."""
        return list(filter(lambda entry: entry.severity == 'ERROR', self.issues))

    def get_warnings(self) -> List[ContentIssue]:
        """Return a new list holding only the WARNING-level issues."""
        return list(filter(lambda entry: entry.severity == 'WARNING', self.issues))
|
||||||
|
|
||||||
|
|
||||||
|
class ContentValidator:
|
||||||
|
"""
|
||||||
|
Validates content against x-markitect-content-control rules.
|
||||||
|
|
||||||
|
Checks content patterns, quality metrics, and readability for each section.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, schema: Dict[str, Any]):
|
||||||
|
"""
|
||||||
|
Initialize validator with a schema.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
schema: JSON schema with x-markitect-content-control extension
|
||||||
|
"""
|
||||||
|
self.schema = schema
|
||||||
|
self.content_rules = schema.get('x-markitect-content-control', {})
|
||||||
|
|
||||||
|
def check(self, document: 'MarkdownDocument') -> ContentValidationResult:
|
||||||
|
"""
|
||||||
|
Validate content against schema rules.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
document: Parsed markdown document
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
ContentValidationResult with any issues found
|
||||||
|
"""
|
||||||
|
issues = []
|
||||||
|
sections_checked = 0
|
||||||
|
|
||||||
|
# Check each section that has content rules
|
||||||
|
for section_key, rules in self.content_rules.items():
|
||||||
|
sections_checked += 1
|
||||||
|
|
||||||
|
# Get section from document
|
||||||
|
section = self._get_section(document, section_key)
|
||||||
|
|
||||||
|
if not section:
|
||||||
|
# Section validator handles missing sections
|
||||||
|
continue
|
||||||
|
|
||||||
|
section_content = section.get('content', '')
|
||||||
|
section_name = section.get('name', section_key)
|
||||||
|
|
||||||
|
# Check required patterns
|
||||||
|
issues.extend(self._check_required_patterns(
|
||||||
|
section_name, section_content, rules
|
||||||
|
))
|
||||||
|
|
||||||
|
# Check forbidden patterns
|
||||||
|
issues.extend(self._check_forbidden_patterns(
|
||||||
|
section_name, section_content, rules
|
||||||
|
))
|
||||||
|
|
||||||
|
# Check discouraged patterns
|
||||||
|
issues.extend(self._check_discouraged_patterns(
|
||||||
|
section_name, section_content, rules
|
||||||
|
))
|
||||||
|
|
||||||
|
# Check content quality metrics
|
||||||
|
issues.extend(self._check_quality_metrics(
|
||||||
|
section_name, section_content, rules
|
||||||
|
))
|
||||||
|
|
||||||
|
return ContentValidationResult(
|
||||||
|
issues=issues,
|
||||||
|
sections_checked=sections_checked
|
||||||
|
)
|
||||||
|
|
||||||
|
def _get_section(self, document: 'MarkdownDocument',
|
||||||
|
section_key: str) -> Optional[Dict[str, Any]]:
|
||||||
|
"""
|
||||||
|
Get a section from the document.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
document: Parsed markdown document
|
||||||
|
section_key: Section name (lowercase in rules, uppercase in document)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Section dict with name and content, or None if not found
|
||||||
|
"""
|
||||||
|
# Convert section_key to uppercase for matching
|
||||||
|
section_name = section_key.upper()
|
||||||
|
|
||||||
|
# Try to get section content
|
||||||
|
if hasattr(document, 'get_section'):
|
||||||
|
return document.get_section(section_name)
|
||||||
|
|
||||||
|
# Fallback: search headings
|
||||||
|
if hasattr(document, 'get_headings_by_level'):
|
||||||
|
headings = document.get_headings_by_level(2)
|
||||||
|
for heading in headings:
|
||||||
|
if isinstance(heading, dict):
|
||||||
|
if heading.get('content', '').strip().upper() == section_name:
|
||||||
|
# Found the section, need to extract content
|
||||||
|
return {
|
||||||
|
'name': section_name,
|
||||||
|
'content': heading.get('text_content', '')
|
||||||
|
}
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _check_required_patterns(self, section_name: str, content: str,
|
||||||
|
rules: Dict[str, Any]) -> List[ContentIssue]:
|
||||||
|
"""Check that all required patterns appear in content."""
|
||||||
|
issues = []
|
||||||
|
required_patterns = rules.get('required_patterns', [])
|
||||||
|
|
||||||
|
for pattern in required_patterns:
|
||||||
|
try:
|
||||||
|
if not re.search(pattern, content, re.MULTILINE):
|
||||||
|
issues.append(PatternMissing(
|
||||||
|
section_name=section_name,
|
||||||
|
severity='ERROR',
|
||||||
|
message=f'Required pattern not found',
|
||||||
|
pattern=pattern
|
||||||
|
))
|
||||||
|
except re.error as e:
|
||||||
|
# Invalid regex pattern in schema
|
||||||
|
issues.append(ContentIssue(
|
||||||
|
section_name=section_name,
|
||||||
|
severity='ERROR',
|
||||||
|
message=f'Invalid regex pattern in schema: {e}'
|
||||||
|
))
|
||||||
|
|
||||||
|
return issues
|
||||||
|
|
||||||
|
def _check_forbidden_patterns(self, section_name: str, content: str,
|
||||||
|
rules: Dict[str, Any]) -> List[ContentIssue]:
|
||||||
|
"""Check that no forbidden patterns appear in content."""
|
||||||
|
issues = []
|
||||||
|
forbidden_patterns = rules.get('forbidden_patterns', [])
|
||||||
|
|
||||||
|
for pattern in forbidden_patterns:
|
||||||
|
try:
|
||||||
|
match = re.search(pattern, content, re.MULTILINE)
|
||||||
|
if match:
|
||||||
|
issues.append(ForbiddenPattern(
|
||||||
|
section_name=section_name,
|
||||||
|
severity='ERROR',
|
||||||
|
message=f'Forbidden pattern found',
|
||||||
|
pattern=pattern,
|
||||||
|
matched_text=match.group(0)[:50] # Limit to 50 chars
|
||||||
|
))
|
||||||
|
except re.error as e:
|
||||||
|
issues.append(ContentIssue(
|
||||||
|
section_name=section_name,
|
||||||
|
severity='ERROR',
|
||||||
|
message=f'Invalid regex pattern in schema: {e}'
|
||||||
|
))
|
||||||
|
|
||||||
|
return issues
|
||||||
|
|
||||||
|
def _check_discouraged_patterns(self, section_name: str, content: str,
|
||||||
|
rules: Dict[str, Any]) -> List[ContentIssue]:
|
||||||
|
"""Check for discouraged patterns (warnings)."""
|
||||||
|
issues = []
|
||||||
|
discouraged_patterns = rules.get('discouraged_patterns', [])
|
||||||
|
|
||||||
|
for pattern in discouraged_patterns:
|
||||||
|
try:
|
||||||
|
match = re.search(pattern, content, re.MULTILINE)
|
||||||
|
if match:
|
||||||
|
issues.append(DiscouragedPattern(
|
||||||
|
section_name=section_name,
|
||||||
|
severity='WARNING',
|
||||||
|
message=f'Discouraged pattern found',
|
||||||
|
pattern=pattern,
|
||||||
|
matched_text=match.group(0)[:50]
|
||||||
|
))
|
||||||
|
except re.error as e:
|
||||||
|
issues.append(ContentIssue(
|
||||||
|
section_name=section_name,
|
||||||
|
severity='WARNING',
|
||||||
|
message=f'Invalid regex pattern in schema: {e}'
|
||||||
|
))
|
||||||
|
|
||||||
|
return issues
|
||||||
|
|
||||||
|
def _check_quality_metrics(self, section_name: str, content: str,
|
||||||
|
rules: Dict[str, Any]) -> List[ContentIssue]:
|
||||||
|
"""Check content quality metrics (word count, sentence count)."""
|
||||||
|
issues = []
|
||||||
|
quality = rules.get('content_quality', {})
|
||||||
|
|
||||||
|
if not quality:
|
||||||
|
return issues
|
||||||
|
|
||||||
|
# Word count validation
|
||||||
|
word_count = len(content.split())
|
||||||
|
|
||||||
|
min_words = quality.get('min_words')
|
||||||
|
if min_words is not None and word_count < min_words:
|
||||||
|
issues.append(ContentTooShort(
|
||||||
|
section_name=section_name,
|
||||||
|
severity='WARNING',
|
||||||
|
message=f'Content too short ({word_count} words, minimum {min_words})',
|
||||||
|
actual=word_count,
|
||||||
|
required=min_words
|
||||||
|
))
|
||||||
|
|
||||||
|
max_words = quality.get('max_words')
|
||||||
|
if max_words is not None and word_count > max_words:
|
||||||
|
issues.append(ContentTooLong(
|
||||||
|
section_name=section_name,
|
||||||
|
severity='WARNING',
|
||||||
|
message=f'Content too long ({word_count} words, maximum {max_words})',
|
||||||
|
actual=word_count,
|
||||||
|
limit=max_words
|
||||||
|
))
|
||||||
|
|
||||||
|
# Sentence count validation
|
||||||
|
min_sentences = quality.get('min_sentences')
|
||||||
|
if min_sentences is not None:
|
||||||
|
# Simple sentence count (split by .!?)
|
||||||
|
sentence_count = len(re.findall(r'[.!?]+', content))
|
||||||
|
|
||||||
|
if sentence_count < min_sentences:
|
||||||
|
issues.append(ContentTooShort(
|
||||||
|
section_name=section_name,
|
||||||
|
severity='WARNING',
|
||||||
|
message=f'Too few sentences ({sentence_count}, minimum {min_sentences})',
|
||||||
|
actual=sentence_count,
|
||||||
|
required=min_sentences
|
||||||
|
))
|
||||||
|
|
||||||
|
return issues
|
||||||
226
markitect/validators/section_validator.py
Normal file
226
markitect/validators/section_validator.py
Normal file
@@ -0,0 +1,226 @@
|
|||||||
|
"""
|
||||||
|
Section Validator for markdown documents.
|
||||||
|
|
||||||
|
Validates that document sections comply with x-markitect-sections classifications:
|
||||||
|
- REQUIRED: Section must be present (ERROR if missing)
|
||||||
|
- RECOMMENDED: Section should be present (WARNING if missing)
|
||||||
|
- OPTIONAL: Section may be present (no check)
|
||||||
|
- DISCOURAGED: Section should not be present (WARNING if present)
|
||||||
|
- IMPROPER: Section must not be present (ERROR if present)
|
||||||
|
"""
|
||||||
|
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import List, Dict, Any, Optional
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class SectionIssue:
    """
    A single finding produced by section validation.

    Common base for the specific section issue types.
    """
    section_name: str
    severity: str  # 'ERROR', 'WARNING', 'INFO'
    message: str
    classification: str  # 'required', 'recommended', etc.
    line_number: Optional[int] = None

    def __str__(self) -> str:
        """Render as ``[SEVERITY] (line N) SECTION: message``.

        The line part is omitted when no line number is set.
        """
        head = f"[{self.severity}]"
        if self.line_number:
            head = f"{head} (line {self.line_number})"
        return f"{head} {self.section_name}: {self.message}"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class SectionMissing(SectionIssue):
    """Issue for an expected section that the document does not contain."""
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class SectionImproper(SectionIssue):
    """Issue for a section classified as improper that is present in the document."""
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class SectionDiscouraged(SectionIssue):
    """Issue for a section classified as discouraged that is present in the document."""
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class SectionValidationResult:
    """Outcome of a section validation run: the issues plus two tallies."""
    issues: List[SectionIssue]
    sections_checked: int
    sections_found: int

    def _with_severity(self, level: str) -> List[SectionIssue]:
        """Return a new list of the issues whose severity equals ``level``."""
        return [entry for entry in self.issues if entry.severity == level]

    def has_errors(self) -> bool:
        """Return True when at least one issue has severity 'ERROR'."""
        return len(self._with_severity('ERROR')) > 0

    def has_warnings(self) -> bool:
        """Return True when at least one issue has severity 'WARNING'."""
        return len(self._with_severity('WARNING')) > 0

    def is_valid(self) -> bool:
        """Return True when no ERROR-level issue was recorded."""
        return len(self._with_severity('ERROR')) == 0

    def get_errors(self) -> List[SectionIssue]:
        """Return a new list holding only the ERROR-level issues."""
        return self._with_severity('ERROR')

    def get_warnings(self) -> List[SectionIssue]:
        """Return a new list holding only the WARNING-level issues."""
        return self._with_severity('WARNING')
|
||||||
|
|
||||||
|
|
||||||
|
class SectionValidator:
    """
    Validates section presence and classification compliance.

    Checks that markdown documents have the correct sections based on
    x-markitect-sections classifications in the schema. Classification
    values are matched case-insensitively ('REQUIRED' == 'required');
    unknown or missing classifications produce no issue.
    """

    def __init__(self, schema: Dict[str, Any]):
        """
        Initialize validator with a schema.

        Args:
            schema: JSON schema with x-markitect-sections extension
        """
        self.schema = schema
        self.sections_spec = schema.get('x-markitect-sections', {})

    @staticmethod
    def _classification_of(spec: Dict[str, Any]) -> str:
        """Return the spec's classification lower-cased and stripped, or ''."""
        value = spec.get('classification')
        if not isinstance(value, str):
            return ''
        return value.strip().lower()

    def check(self, document: 'MarkdownDocument') -> SectionValidationResult:
        """
        Validate section presence against schema classifications.

        Args:
            document: Parsed markdown document

        Returns:
            SectionValidationResult with any issues found;
            ``sections_checked`` is the number of section specs and
            ``sections_found`` the number of level-2 headings collected.
        """
        issues: List[SectionIssue] = []

        # Get level-2 headings (main sections) from document
        doc_sections = self._get_document_sections(document)

        for section_name, spec in self.sections_spec.items():
            classification = self._classification_of(spec)
            section_in_doc = self._find_section(section_name, doc_sections, spec)

            if classification == 'required':
                if not section_in_doc:
                    issues.append(SectionMissing(
                        section_name=section_name,
                        severity='ERROR',
                        message=spec.get('error_message', f'{section_name} section is required'),
                        classification='required'
                    ))

            elif classification == 'improper':
                if section_in_doc:
                    issues.append(SectionImproper(
                        section_name=section_name,
                        severity='ERROR',
                        message=spec.get('error_message', f'{section_name} section must not appear'),
                        classification='improper',
                        line_number=section_in_doc.get('line_number')
                    ))

            elif classification == 'recommended':
                if not section_in_doc:
                    issues.append(SectionMissing(
                        section_name=section_name,
                        severity='WARNING',
                        message=spec.get('warning_if_missing', f'{section_name} section is recommended'),
                        classification='recommended'
                    ))

            elif classification == 'discouraged':
                if section_in_doc:
                    issues.append(SectionDiscouraged(
                        section_name=section_name,
                        severity='WARNING',
                        message=spec.get('warning_if_present', f'{section_name} section is discouraged'),
                        classification='discouraged',
                        line_number=section_in_doc.get('line_number')
                    ))

        return SectionValidationResult(
            issues=issues,
            sections_checked=len(self.sections_spec),
            sections_found=len(doc_sections)
        )

    def _get_document_sections(self, document: 'MarkdownDocument') -> List[Dict[str, Any]]:
        """
        Extract level-2 headings from document.

        Uses ``document.get_headings_by_level(2)`` when available, else
        filters ``document.headings`` by ``level == 2``. Headings that are
        neither dicts nor strings are ignored.

        Args:
            document: Parsed markdown document

        Returns:
            List of section dicts with upper-cased ``name`` and
            ``line_number`` (None for plain-string headings)
        """
        if hasattr(document, 'get_headings_by_level'):
            level_2_headings = document.get_headings_by_level(2)
        elif hasattr(document, 'headings'):
            level_2_headings = [
                h for h in document.headings
                if h.get('level') == 2
            ]
        else:
            # No known heading accessor: treat the document as having no sections.
            level_2_headings = []

        sections: List[Dict[str, Any]] = []
        for heading in level_2_headings:
            if isinstance(heading, dict):
                sections.append({
                    'name': heading.get('content', '').strip().upper(),
                    'line_number': heading.get('line_number')
                })
            elif isinstance(heading, str):
                sections.append({
                    'name': heading.strip().upper(),
                    'line_number': None
                })

        return sections

    def _find_section(self, section_name: str, doc_sections: List[Dict[str, Any]],
                      spec: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """
        Find a section in document, checking alternatives.

        The primary name is tried first, then each alternative in order;
        comparison is on the upper-cased, stripped name.

        Args:
            section_name: Primary section name to find
            doc_sections: List of sections in document
            spec: Section specification; ``alternatives`` may be a list of
                names or a single name string

        Returns:
            Section dict if found, None otherwise
        """
        alternatives = spec.get('alternatives') or []
        if isinstance(alternatives, str):
            # A bare string would otherwise be iterated character by character.
            alternatives = [alternatives]

        for candidate in (section_name, *alternatives):
            wanted = candidate.upper().strip()
            for section in doc_sections:
                if section['name'] == wanted:
                    return section

        return None
|
||||||
573
roadmap/20260106-semantic-document-validation/WORKPLAN.md
Normal file
573
roadmap/20260106-semantic-document-validation/WORKPLAN.md
Normal file
@@ -0,0 +1,573 @@
|
|||||||
|
# Plan: Schema System Enhancement - Semantic Document Validation
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
The schema management system has **complete schema structure analysis tools** (schema-analyze, schema-refine) and **structural AST validation** (markitect validate), but is missing **semantic validation capabilities**. This plan enhances validation to check sections, content patterns, and quality metrics defined in x-markitect extensions.
|
||||||
|
|
||||||
|
## Current State Assessment
|
||||||
|
|
||||||
|
### ✅ Already Implemented
|
||||||
|
- **schema-analyze**: Detects rigid constraints, calculates rigidity score (markitect/schema_analyzer.py)
|
||||||
|
- **schema-refine**: Automatically loosens rigid constraints (markitect/schema_refiner.py)
|
||||||
|
- **markitect validate**: Validates AST structure against JSON schemas (cli.py:1493-1600)
|
||||||
|
- Checks headings, paragraphs, code_blocks counts match schema
|
||||||
|
- Validates document structure against JSON Schema properties
|
||||||
|
- Does NOT check x-markitect-sections classifications
|
||||||
|
- Does NOT validate x-markitect-content-control patterns
|
||||||
|
- **X-Markitect Extensions**: Full system with sections, content-control, metadata
|
||||||
|
- **Metaschema Validation**: Validates schema structure and extensions
|
||||||
|
- **4 Production Schemas**: manpage, API docs, terminology, schema-schema
|
||||||
|
- **Comprehensive Documentation**: User guides, specifications, tests (97 tests passing)
|
||||||
|
|
||||||
|
### ❌ Missing Capabilities (Semantic Validation)
|
||||||
|
1. **Section Classification Enforcement**: required/recommended/optional/discouraged/improper not checked
|
||||||
|
2. **Content Pattern Validation**: required_patterns, forbidden_patterns not matched
|
||||||
|
3. **Quality Metrics Validation**: min_words, max_words, min_sentences not enforced
|
||||||
|
4. **Link Validation**: Internal/external link checking not implemented
|
||||||
|
5. **Content Instructions**: content_instruction fields defined but not validated
|
||||||
|
|
||||||
|
## What We Have vs What We Need
|
||||||
|
|
||||||
|
**Current `markitect validate`** (Structural):
|
||||||
|
```bash
|
||||||
|
markitect validate doc.md --schema schema.json
|
||||||
|
# ✅ Checks: headings.level_2 has 5-30 items
|
||||||
|
# ✅ Checks: paragraphs has 10-500 items
|
||||||
|
# ✅ Checks: code_blocks has 1-50 items
|
||||||
|
# ❌ Does NOT check: SYNOPSIS section present (required)
|
||||||
|
# ❌ Does NOT check: INTERNAL_NOTES absent (improper)
|
||||||
|
# ❌ Does NOT check: Synopsis contains bold command name
|
||||||
|
# ❌ Does NOT check: Description has min 50 words
|
||||||
|
```
|
||||||
|
|
||||||
|
**Enhanced `markitect validate`** (Structural + Semantic):
|
||||||
|
```bash
|
||||||
|
markitect validate doc.md --schema manpage-schema-v1.0.md
|
||||||
|
# ✅ Checks: AST structure (existing)
|
||||||
|
# ✅ NEW: SYNOPSIS section present (required)
|
||||||
|
# ✅ NEW: INTERNAL_NOTES not present (improper)
|
||||||
|
# ✅ NEW: Synopsis contains **command** pattern
|
||||||
|
# ✅ NEW: Description has 50+ words
|
||||||
|
# ✅ NEW: No forbidden TODO patterns
|
||||||
|
```
|
||||||
|
|
||||||
|
## Implementation Plan
|
||||||
|
|
||||||
|
### Phase 1: Core Semantic Validator
|
||||||
|
|
||||||
|
**Goal**: Create semantic validator to complement existing structural validation
|
||||||
|
|
||||||
|
**New Module**: `markitect/semantic_validator.py`
|
||||||
|
|
||||||
|
**Key Components**:
|
||||||
|
|
||||||
|
```python
|
||||||
|
class SemanticValidator:
|
||||||
|
"""Validates markdown documents against x-markitect extensions.
|
||||||
|
|
||||||
|
Complements existing SchemaValidator which handles structural AST validation.
|
||||||
|
This validator checks semantic aspects defined in x-markitect-* extensions.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, schema_path: str):
|
||||||
|
# Load schema (supports .md schemas with embedded JSON)
|
||||||
|
self.schema = load_schema_with_extensions(schema_path)
|
||||||
|
|
||||||
|
# Initialize sub-validators
|
||||||
|
self.section_validator = SectionValidator(self.schema)
|
||||||
|
self.content_validator = ContentValidator(self.schema)
|
||||||
|
self.link_validator = LinkValidator(self.schema)
|
||||||
|
|
||||||
|
def validate(self, document_path: str, check_links: bool = False) -> SemanticValidationReport:
|
||||||
|
"""Main semantic validation entry point."""
|
||||||
|
doc = parse_markdown_document(document_path)
|
||||||
|
|
||||||
|
results = {
|
||||||
|
'sections': self.section_validator.check(doc),
|
||||||
|
'content': self.content_validator.check(doc)
|
||||||
|
}
|
||||||
|
|
||||||
|
if check_links:
|
||||||
|
results['links'] = self.link_validator.check(doc)
|
||||||
|
|
||||||
|
return SemanticValidationReport(results)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Features**:
|
||||||
|
- Load schema from registry or filesystem
|
||||||
|
- Parse markdown document into AST
|
||||||
|
- Validate sections against x-markitect-sections classifications
|
||||||
|
- Check content against x-markitect-content-control patterns
|
||||||
|
- Validate links if enabled
|
||||||
|
- Generate detailed report with line numbers
|
||||||
|
|
||||||
|
### Phase 2: Section Presence Validator
|
||||||
|
|
||||||
|
**New Module**: `markitect/validators/section_validator.py`
|
||||||
|
|
||||||
|
**Validation Rules**:
|
||||||
|
|
||||||
|
```python
|
||||||
|
class SectionValidator:
|
||||||
|
"""Validates section presence and classification compliance."""
|
||||||
|
|
||||||
|
def check(self, document: MarkdownDocument) -> SectionValidationResult:
|
||||||
|
sections_spec = self.schema.get('x-markitect-sections', {})
|
||||||
|
doc_sections = document.get_headings_by_level(2)
|
||||||
|
|
||||||
|
issues = []
|
||||||
|
|
||||||
|
# Check REQUIRED sections
|
||||||
|
for section_name, spec in sections_spec.items():
|
||||||
|
if spec['classification'] == 'required':
|
||||||
|
if section_name not in doc_sections:
|
||||||
|
issues.append(SectionMissing(
|
||||||
|
section=section_name,
|
||||||
|
severity='ERROR',
|
||||||
|
message=spec.get('error_message', f'{section_name} is required')
|
||||||
|
))
|
||||||
|
|
||||||
|
# Check IMPROPER sections (must not exist)
|
||||||
|
for section_name, spec in sections_spec.items():
|
||||||
|
if spec['classification'] == 'improper':
|
||||||
|
if section_name in doc_sections:
|
||||||
|
issues.append(SectionImproper(
|
||||||
|
section=section_name,
|
||||||
|
severity='ERROR',
|
||||||
|
message=spec.get('error_message', f'{section_name} must not appear')
|
||||||
|
))
|
||||||
|
|
||||||
|
# Check RECOMMENDED sections (warnings)
|
||||||
|
for section_name, spec in sections_spec.items():
|
||||||
|
if spec['classification'] == 'recommended':
|
||||||
|
if section_name not in doc_sections:
|
||||||
|
issues.append(SectionMissing(
|
||||||
|
section=section_name,
|
||||||
|
severity='WARNING',
|
||||||
|
message=spec.get('warning_if_missing', f'{section_name} is recommended')
|
||||||
|
))
|
||||||
|
|
||||||
|
return SectionValidationResult(issues)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Section Classification Enforcement**:
|
||||||
|
- REQUIRED → ERROR if missing
|
||||||
|
- RECOMMENDED → WARNING if missing
|
||||||
|
- OPTIONAL → No check
|
||||||
|
- DISCOURAGED → WARNING if present
|
||||||
|
- IMPROPER → ERROR if present
|
||||||
|
|
||||||
|
### Phase 3: Content Pattern Validator
|
||||||
|
|
||||||
|
**New Module**: `markitect/validators/content_validator.py`
|
||||||
|
|
||||||
|
**Pattern Matching**:
|
||||||
|
|
||||||
|
```python
|
||||||
|
class ContentValidator:
|
||||||
|
"""Validates content against x-markitect-content-control rules."""
|
||||||
|
|
||||||
|
def check(self, document: MarkdownDocument) -> ContentValidationResult:
|
||||||
|
content_rules = self.schema.get('x-markitect-content-control', {})
|
||||||
|
issues = []
|
||||||
|
|
||||||
|
for section_key, rules in content_rules.items():
|
||||||
|
section = document.get_section(section_key.upper())
|
||||||
|
if not section:
|
||||||
|
continue # Section validator handles missing sections
|
||||||
|
|
||||||
|
# Check required patterns
|
||||||
|
for pattern in rules.get('required_patterns', []):
|
||||||
|
if not re.search(pattern, section.content):
|
||||||
|
issues.append(PatternMissing(
|
||||||
|
section=section.name,
|
||||||
|
pattern=pattern,
|
||||||
|
severity='ERROR'
|
||||||
|
))
|
||||||
|
|
||||||
|
# Check forbidden patterns
|
||||||
|
for pattern in rules.get('forbidden_patterns', []):
|
||||||
|
if (match := re.search(pattern, section.content)):
|
||||||
|
issues.append(ForbiddenPattern(
|
||||||
|
section=section.name,
|
||||||
|
pattern=pattern,
|
||||||
|
severity='ERROR',
|
||||||
|
matched_text=match.group(0)
|
||||||
|
))
|
||||||
|
|
||||||
|
# Check content quality
|
||||||
|
quality = rules.get('content_quality', {})
|
||||||
|
word_count = len(section.content.split())
|
||||||
|
|
||||||
|
if 'min_words' in quality and word_count < quality['min_words']:
|
||||||
|
issues.append(ContentTooShort(
|
||||||
|
section=section.name,
|
||||||
|
actual=word_count,
|
||||||
|
required=quality['min_words'],
|
||||||
|
severity='WARNING'
|
||||||
|
))
|
||||||
|
|
||||||
|
if 'max_words' in quality and word_count > quality['max_words']:
|
||||||
|
issues.append(ContentTooLong(
|
||||||
|
section=section.name,
|
||||||
|
actual=word_count,
|
||||||
|
limit=quality['max_words'],
|
||||||
|
severity='WARNING'
|
||||||
|
))
|
||||||
|
|
||||||
|
return ContentValidationResult(issues)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Content Rules Checked**:
|
||||||
|
- Required patterns (regex matches)
|
||||||
|
- Discouraged patterns (warnings)
|
||||||
|
- Forbidden patterns (errors)
|
||||||
|
- Word count ranges (min/max)
|
||||||
|
- Sentence counts (if specified)
|
||||||
|
|
||||||
|
### Phase 4: Link Validator
|
||||||
|
|
||||||
|
**New Module**: `markitect/validators/link_validator.py`
|
||||||
|
|
||||||
|
**Link Checking**:
|
||||||
|
|
||||||
|
```python
|
||||||
|
class LinkValidator:
|
||||||
|
"""Validates links according to x-markitect-content-control.link_validation."""
|
||||||
|
|
||||||
|
def check(self, document: MarkdownDocument) -> LinkValidationResult:
|
||||||
|
link_config = self.schema.get('x-markitect-content-control', {}).get('link_validation', {})
|
||||||
|
|
||||||
|
if not any(link_config.values()):
|
||||||
|
return LinkValidationResult([]) # No link validation configured
|
||||||
|
|
||||||
|
links = document.extract_links()
|
||||||
|
issues = []
|
||||||
|
|
||||||
|
for link in links:
|
||||||
|
# Check internal links
|
||||||
|
if link.is_internal() and link_config.get('check_internal', False):
|
||||||
|
target = document.resolve_internal_link(link.target)
|
||||||
|
if not target:
|
||||||
|
issues.append(BrokenInternalLink(
|
||||||
|
link=link.target,
|
||||||
|
line=link.line_number,
|
||||||
|
severity='ERROR'
|
||||||
|
))
|
||||||
|
|
||||||
|
# Check external links
|
||||||
|
if link.is_external() and link_config.get('check_external', False):
|
||||||
|
# HTTP HEAD request with timeout
|
||||||
|
if not self._check_url_exists(link.target):
|
||||||
|
issues.append(BrokenExternalLink(
|
||||||
|
link=link.target,
|
||||||
|
line=link.line_number,
|
||||||
|
severity='WARNING' # External links are warnings
|
||||||
|
))
|
||||||
|
|
||||||
|
# Check fragments
|
||||||
|
if link.has_fragment() and not link_config.get('allow_fragments', True):
|
||||||
|
issues.append(FragmentNotAllowed(
|
||||||
|
link=link.target,
|
||||||
|
line=link.line_number,
|
||||||
|
severity='WARNING'
|
||||||
|
))
|
||||||
|
|
||||||
|
return LinkValidationResult(issues)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Link Types Validated**:
|
||||||
|
- Internal links (to other sections/documents)
|
||||||
|
- External links (HTTP/HTTPS URLs)
|
||||||
|
- Fragment identifiers (#section-name)
|
||||||
|
- Email links (mailto:)
|
||||||
|
|
||||||
|
### Phase 5: CLI Integration
|
||||||
|
|
||||||
|
**Enhance Existing Command**: `markitect validate` (cli.py:1493-1600)
|
||||||
|
|
||||||
|
**New Options to Add**:
|
||||||
|
|
||||||
|
```python
|
||||||
|
@cli.command('validate')
|
||||||
|
@click.argument('file_path', type=click.Path(exists=True, path_type=Path))
|
||||||
|
@click.option('--schema', '-s', type=click.Path(exists=True, path_type=Path),
|
||||||
|
help='Path to JSON schema file')
|
||||||
|
@click.option('--schema-json', type=str,
|
||||||
|
help='JSON schema provided as a string')
|
||||||
|
@click.option('--quiet', '-q', is_flag=True,
|
||||||
|
help='Only output validation result (true/false)')
|
||||||
|
@click.option('--detailed-errors', '--errors', is_flag=True,
|
||||||
|
help='Show detailed validation errors (Issue #8)')
|
||||||
|
@click.option('--error-format', type=click.Choice(['text', 'json', 'markdown']), default='text',
|
||||||
|
help='Format for detailed error output')
|
||||||
|
# NEW OPTIONS:
|
||||||
|
@click.option('--semantic/--no-semantic', default=True,
|
||||||
|
help='Enable/disable semantic validation (sections, patterns, quality)')
|
||||||
|
@click.option('--check-links', is_flag=True,
|
||||||
|
help='Enable link validation (may be slow)')
|
||||||
|
@click.option('--strict', is_flag=True,
|
||||||
|
help='Treat warnings as errors')
|
||||||
|
@pass_config
|
||||||
|
def validate(config, file_path, schema, schema_json, quiet, detailed_errors, error_format,
|
||||||
|
semantic, check_links, strict):
|
||||||
|
"""
|
||||||
|
Validate a markdown file against a JSON schema.
|
||||||
|
|
||||||
|
ENHANCED: Now includes semantic validation of x-markitect extensions:
|
||||||
|
- Section classifications (required, recommended, optional, discouraged, improper)
|
||||||
|
- Content patterns (required_patterns, forbidden_patterns)
|
||||||
|
- Quality metrics (min_words, max_words, min_sentences)
|
||||||
|
- Link validation (internal/external)
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
# Structural + semantic validation (default)
|
||||||
|
markitect validate doc.md --schema manpage-schema-v1.0.md
|
||||||
|
|
||||||
|
# Only structural validation (classic mode)
|
||||||
|
markitect validate doc.md --schema schema.json --no-semantic
|
||||||
|
|
||||||
|
# With link checking
|
||||||
|
markitect validate doc.md --schema 1 --check-links
|
||||||
|
|
||||||
|
# Strict mode (warnings become errors)
|
||||||
|
markitect validate doc.md --schema manpage-schema-v1.0.md --strict
|
||||||
|
"""
|
||||||
|
# Existing structural validation code...
|
||||||
|
# (Keep all existing logic for SchemaValidator)
|
||||||
|
|
||||||
|
# NEW: Add semantic validation if enabled and schema has x-markitect extensions
|
||||||
|
if semantic:
|
||||||
|
semantic_validator = SemanticValidator(schema)
|
||||||
|
semantic_report = semantic_validator.validate(file_path, check_links=check_links)
|
||||||
|
|
||||||
|
# Combine structural and semantic results
|
||||||
|
combined_report = CombinedValidationReport(structural_result, semantic_report)
|
||||||
|
|
||||||
|
# Output combined results
|
||||||
|
if not quiet:
|
||||||
|
click.echo(combined_report.format(error_format))
|
||||||
|
|
||||||
|
# Exit codes
|
||||||
|
if combined_report.has_errors():
|
||||||
|
sys.exit(1)
|
||||||
|
elif strict and combined_report.has_warnings():
|
||||||
|
sys.exit(1)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Integration Strategy**:
|
||||||
|
1. Keep existing structural validation (SchemaValidator) unchanged
|
||||||
|
2. Add new semantic validation layer on top
|
||||||
|
3. Use --no-semantic flag to disable new validation (backward compatibility)
|
||||||
|
4. Combine structural + semantic results in unified report
|
||||||
|
5. Default to semantic=True for new markdown schemas with extensions
|
||||||
|
|
||||||
|
**Output Format** (text):
|
||||||
|
```
|
||||||
|
Validating: my-command.1.md
|
||||||
|
Schema: manpage-schema-v1.0.md (v1.0.0)
|
||||||
|
|
||||||
|
Section Validation:
|
||||||
|
✅ SYNOPSIS - Present (required)
|
||||||
|
✅ DESCRIPTION - Present (required)
|
||||||
|
⚠️ EXAMPLES - Missing (recommended)
|
||||||
|
❌ INTERNAL_NOTES - Must not appear (improper)
|
||||||
|
|
||||||
|
Content Validation:
|
||||||
|
✅ SYNOPSIS - Patterns matched
|
||||||
|
⚠️ DESCRIPTION - Too short (35 words, minimum 50)
|
||||||
|
❌ SYNOPSIS - Forbidden pattern found: "TODO"
|
||||||
|
|
||||||
|
Link Validation: (skipped - use --check-links)
|
||||||
|
|
||||||
|
Summary:
|
||||||
|
Errors: 2
|
||||||
|
Warnings: 2
|
||||||
|
Status: FAILED ❌
|
||||||
|
|
||||||
|
Failed validations:
|
||||||
|
Line 12: INTERNAL_NOTES section must not appear in published manpages
|
||||||
|
Line 5: SYNOPSIS contains forbidden pattern "TODO"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Phase 6: Batch Document Validation
|
||||||
|
|
||||||
|
**New Command**: `markitect validate-batch`
|
||||||
|
|
||||||
|
```python
|
||||||
|
@cli.command('validate-batch')
|
||||||
|
@click.argument('directory', type=click.Path(exists=True, file_okay=False))
|
||||||
|
@click.option('--schema', '-s', type=str, required=True)
|
||||||
|
@click.option('--pattern', default='*.md', help='File pattern to match')
|
||||||
|
@click.option('--strict', is_flag=True)
|
||||||
|
@click.option('--summary-only', is_flag=True, help='Show only summary table')
|
||||||
|
@pass_config
|
||||||
|
def validate_batch_cmd(config, directory, schema, pattern, strict, summary_only):
|
||||||
|
"""Validate multiple documents in a directory.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
markitect validate-batch docs/manpages/ --schema manpage-schema-v1.0.md
|
||||||
|
"""
|
||||||
|
# Find all matching documents
|
||||||
|
docs = list(Path(directory).glob(pattern))
|
||||||
|
|
||||||
|
# Validate each
|
||||||
|
results = []
|
||||||
|
for doc in docs:
|
||||||
|
validator = SemanticValidator(schema)
|
||||||
|
report = validator.validate(doc)
|
||||||
|
results.append((doc.name, report))
|
||||||
|
|
||||||
|
# Show summary table
|
||||||
|
display_batch_results(results)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Implementation Phases
|
||||||
|
|
||||||
|
### Phase 1 (Core - 1 session)
|
||||||
|
- SemanticValidator class
|
||||||
|
- Basic section validation
|
||||||
|
- CLI validate command
|
||||||
|
- Simple text output format
|
||||||
|
|
||||||
|
### Phase 2 (Content - 1 session)
|
||||||
|
- ContentValidator with pattern matching
|
||||||
|
- Word count validation
|
||||||
|
- Quality metrics checking
|
||||||
|
- Enhanced reporting
|
||||||
|
|
||||||
|
### Phase 3 (Links - 1 session)
|
||||||
|
- LinkValidator with internal link checking
|
||||||
|
- Optional external link validation
|
||||||
|
- Fragment validation
|
||||||
|
- Performance optimization (caching)
|
||||||
|
|
||||||
|
### Phase 4 (Polish - 1 session)
|
||||||
|
- Batch validation support
|
||||||
|
- JSON/table output formats
|
||||||
|
- Integration tests
|
||||||
|
- Documentation updates
|
||||||
|
|
||||||
|
## Critical Files
|
||||||
|
|
||||||
|
**New Files**:
|
||||||
|
- `markitect/semantic_validator.py` - Main semantic validator (complements existing SchemaValidator)
|
||||||
|
- `markitect/validators/section_validator.py` - Section classification enforcement
|
||||||
|
- `markitect/validators/content_validator.py` - Content pattern matching and quality
|
||||||
|
- `markitect/validators/link_validator.py` - Link validation
|
||||||
|
- `markitect/validators/__init__.py` - Validators package
|
||||||
|
- `tests/test_semantic_validator.py` - Semantic validator tests
|
||||||
|
- `tests/validators/test_section_validator.py` - Section validator tests
|
||||||
|
- `tests/validators/test_content_validator.py` - Content validator tests
|
||||||
|
- `tests/validators/test_link_validator.py` - Link validator tests
|
||||||
|
|
||||||
|
**Modified Files**:
|
||||||
|
- `markitect/cli.py` (lines 1493-1600) - Enhance validate command with semantic validation
|
||||||
|
- `markitect/schema_loader.py` - May need utility to extract x-markitect extensions
|
||||||
|
- `docs/SCHEMA_MANAGEMENT_GUIDE.md` - Add semantic validation section
|
||||||
|
- `examples/manpages/README.md` - Add validation examples
|
||||||
|
- `examples/terminology/README.md` - Add validation examples
|
||||||
|
|
||||||
|
**Reference Files** (unchanged, used for integration):
|
||||||
|
- `markitect/validator.py` - Existing SchemaValidator for structural validation
|
||||||
|
- `markitect/schema_analyzer.py` - Reference for schema extension parsing
|
||||||
|
|
||||||
|
## Design Decisions
|
||||||
|
|
||||||
|
### 1. Markdown Parsing
|
||||||
|
**Decision**: Use existing markdown parser from markitect core
|
||||||
|
**Rationale**: Already handles frontmatter, sections, AST generation
|
||||||
|
|
||||||
|
### 2. Link Validation Default
|
||||||
|
**Decision**: Internal links checked by default, external links opt-in
|
||||||
|
**Rationale**: External link checking is slow (network requests), internal is fast
|
||||||
|
|
||||||
|
### 3. Severity Levels
|
||||||
|
**Decision**: ERROR (required violations), WARNING (recommended violations), INFO (suggestions)
|
||||||
|
**Rationale**: Matches schema classification system semantics
|
||||||
|
|
||||||
|
### 4. Exit Codes
|
||||||
|
**Decision**: 0=success, 1=validation failed, 2=system error
|
||||||
|
**Rationale**: Standard CLI conventions for CI/CD integration
|
||||||
|
|
||||||
|
### 5. Pattern Syntax
|
||||||
|
**Decision**: Use Python regex patterns directly
|
||||||
|
**Rationale**: Schemas already use regex strings, no need for new syntax
|
||||||
|
|
||||||
|
## Testing Strategy
|
||||||
|
|
||||||
|
### Unit Tests
|
||||||
|
- SectionValidator: Test all classification types
|
||||||
|
- ContentValidator: Test pattern matching, word counts
|
||||||
|
- LinkValidator: Test internal/external link checking
|
||||||
|
- ValidationReport: Test formatting and aggregation
|
||||||
|
|
||||||
|
### Integration Tests
|
||||||
|
- Validate real manpage documents against manpage schema
|
||||||
|
- Validate terminology documents against terminology schema
|
||||||
|
- Test batch validation across multiple documents
|
||||||
|
- Test CLI output formats
|
||||||
|
|
||||||
|
### Edge Cases
|
||||||
|
- Documents with no schema sections defined
|
||||||
|
- Schemas with no content-control rules
|
||||||
|
- Empty documents
|
||||||
|
- Documents with malformed links
|
||||||
|
- Unicode in patterns and content
|
||||||
|
|
||||||
|
## User Workflows
|
||||||
|
|
||||||
|
### Workflow 1: Validate Single Document
|
||||||
|
```bash
|
||||||
|
# Validate a manpage
|
||||||
|
markitect validate my-command.1.md --schema manpage-schema-v1.0.md
|
||||||
|
|
||||||
|
# With link checking
|
||||||
|
markitect validate my-command.1.md --schema 1 --check-links
|
||||||
|
```
|
||||||
|
|
||||||
|
### Workflow 2: CI/CD Integration
|
||||||
|
```bash
|
||||||
|
#!/bin/bash
|
||||||
|
# Validate all manpages in CI
|
||||||
|
if ! markitect validate-batch docs/man/ --schema 1 --strict; then
|
||||||
|
echo "Manpage validation failed!"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
```
|
||||||
|
|
||||||
|
### Workflow 3: Pre-commit Hook
|
||||||
|
```bash
|
||||||
|
# .git/hooks/pre-commit
|
||||||
|
files=$(git diff --cached --name-only --diff-filter=ACM | grep '\.1\.md$')
|
||||||
|
for file in $files; do
|
||||||
|
if ! markitect validate "$file" --schema manpage-schema-v1.0.md; then
|
||||||
|
echo "Fix validation errors before committing"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
done
|
||||||
|
```
|
||||||
|
|
||||||
|
### Workflow 4: Interactive Editing
|
||||||
|
```bash
|
||||||
|
# Validate while editing
|
||||||
|
watch -n 2 'markitect validate draft.md --schema api-documentation-schema-v1.0.md'
|
||||||
|
```
|
||||||
|
|
||||||
|
## Success Metrics
|
||||||
|
|
||||||
|
1. **Core Functionality**: Can validate documents against all 4 production schemas
|
||||||
|
2. **Classification Enforcement**: Required/improper sections properly checked
|
||||||
|
3. **Pattern Matching**: Content patterns validated with regex
|
||||||
|
4. **Performance**: Validate 100 documents in < 5 seconds (without link checking)
|
||||||
|
5. **Test Coverage**: > 90% coverage for new validator modules
|
||||||
|
6. **Documentation**: Complete examples for each schema type
|
||||||
|
|
||||||
|
## Future Enhancements (Out of Scope)
|
||||||
|
|
||||||
|
- Auto-fixing document validation errors
|
||||||
|
- Suggestion engine for missing content
|
||||||
|
- Readability scoring with specific algorithms
|
||||||
|
- Image validation (size, format, accessibility)
|
||||||
|
- Schema evolution analysis (breaking changes between versions)
|
||||||
|
- Document-to-schema generation (inverse of current flow)
|
||||||
506
tests/test_semantic_validator.py
Normal file
506
tests/test_semantic_validator.py
Normal file
@@ -0,0 +1,506 @@
|
|||||||
|
"""
|
||||||
|
Tests for SemanticValidator.
|
||||||
|
|
||||||
|
Tests semantic validation of markdown documents against x-markitect extensions.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from pathlib import Path
|
||||||
|
import tempfile
|
||||||
|
import json
|
||||||
|
|
||||||
|
from markitect.semantic_validator import (
|
||||||
|
SemanticValidator,
|
||||||
|
SemanticValidationReport,
|
||||||
|
load_schema_from_path
|
||||||
|
)
|
||||||
|
from markitect.validators.section_validator import (
|
||||||
|
SectionValidator,
|
||||||
|
SectionMissing,
|
||||||
|
SectionImproper
|
||||||
|
)
|
||||||
|
from markitect.validators.content_validator import (
|
||||||
|
ContentValidator,
|
||||||
|
PatternMissing,
|
||||||
|
ForbiddenPattern,
|
||||||
|
DiscouragedPattern,
|
||||||
|
ContentTooShort,
|
||||||
|
ContentTooLong
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestSectionValidator:
|
||||||
|
"""Test section validation functionality."""
|
||||||
|
|
||||||
|
def test_required_section_missing(self):
|
||||||
|
"""Test that missing required sections are detected as errors."""
|
||||||
|
schema = {
|
||||||
|
'x-markitect-sections': {
|
||||||
|
'SYNOPSIS': {
|
||||||
|
'classification': 'required',
|
||||||
|
'heading_level': 2,
|
||||||
|
'error_message': 'SYNOPSIS section is mandatory'
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
validator = SectionValidator(schema)
|
||||||
|
|
||||||
|
# Create a mock document without SYNOPSIS
|
||||||
|
class MockDocument:
|
||||||
|
def get_headings_by_level(self, level):
|
||||||
|
return ['DESCRIPTION', 'EXAMPLES']
|
||||||
|
|
||||||
|
doc = MockDocument()
|
||||||
|
result = validator.check(doc)
|
||||||
|
|
||||||
|
# Should have one error
|
||||||
|
assert not result.is_valid()
|
||||||
|
assert result.has_errors()
|
||||||
|
assert len(result.get_errors()) == 1
|
||||||
|
|
||||||
|
error = result.get_errors()[0]
|
||||||
|
assert isinstance(error, SectionMissing)
|
||||||
|
assert error.section_name == 'SYNOPSIS'
|
||||||
|
assert error.severity == 'ERROR'
|
||||||
|
assert 'mandatory' in error.message
|
||||||
|
|
||||||
|
def test_improper_section_present(self):
|
||||||
|
"""Test that improper sections are detected as errors."""
|
||||||
|
schema = {
|
||||||
|
'x-markitect-sections': {
|
||||||
|
'INTERNAL_NOTES': {
|
||||||
|
'classification': 'improper',
|
||||||
|
'heading_level': 2,
|
||||||
|
'error_message': 'Internal notes must not appear in published docs'
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
validator = SectionValidator(schema)
|
||||||
|
|
||||||
|
# Create a mock document with INTERNAL_NOTES
|
||||||
|
class MockDocument:
|
||||||
|
def get_headings_by_level(self, level):
|
||||||
|
return [
|
||||||
|
{
|
||||||
|
'content': 'INTERNAL_NOTES',
|
||||||
|
'level': 2,
|
||||||
|
'line_number': 25
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
doc = MockDocument()
|
||||||
|
result = validator.check(doc)
|
||||||
|
|
||||||
|
# Should have one error
|
||||||
|
assert not result.is_valid()
|
||||||
|
assert result.has_errors()
|
||||||
|
assert len(result.get_errors()) == 1
|
||||||
|
|
||||||
|
error = result.get_errors()[0]
|
||||||
|
assert isinstance(error, SectionImproper)
|
||||||
|
assert error.section_name == 'INTERNAL_NOTES'
|
||||||
|
assert error.severity == 'ERROR'
|
||||||
|
assert error.line_number == 25
|
||||||
|
|
||||||
|
def test_recommended_section_missing(self):
|
||||||
|
"""Test that missing recommended sections generate warnings."""
|
||||||
|
schema = {
|
||||||
|
'x-markitect-sections': {
|
||||||
|
'EXAMPLES': {
|
||||||
|
'classification': 'recommended',
|
||||||
|
'heading_level': 2,
|
||||||
|
'warning_if_missing': 'Examples improve documentation quality'
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
validator = SectionValidator(schema)
|
||||||
|
|
||||||
|
# Create a mock document without EXAMPLES
|
||||||
|
class MockDocument:
|
||||||
|
def get_headings_by_level(self, level):
|
||||||
|
return ['SYNOPSIS', 'DESCRIPTION']
|
||||||
|
|
||||||
|
doc = MockDocument()
|
||||||
|
result = validator.check(doc)
|
||||||
|
|
||||||
|
# Should pass validation (warnings don't fail)
|
||||||
|
assert result.is_valid()
|
||||||
|
assert not result.has_errors()
|
||||||
|
assert result.has_warnings()
|
||||||
|
assert len(result.get_warnings()) == 1
|
||||||
|
|
||||||
|
warning = result.get_warnings()[0]
|
||||||
|
assert warning.section_name == 'EXAMPLES'
|
||||||
|
assert warning.severity == 'WARNING'
|
||||||
|
|
||||||
|
def test_all_required_sections_present(self):
|
||||||
|
"""Test that validation passes when all required sections present."""
|
||||||
|
schema = {
|
||||||
|
'x-markitect-sections': {
|
||||||
|
'SYNOPSIS': {
|
||||||
|
'classification': 'required',
|
||||||
|
'heading_level': 2
|
||||||
|
},
|
||||||
|
'DESCRIPTION': {
|
||||||
|
'classification': 'required',
|
||||||
|
'heading_level': 2
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
validator = SectionValidator(schema)
|
||||||
|
|
||||||
|
# Create a mock document with all required sections
|
||||||
|
class MockDocument:
|
||||||
|
def get_headings_by_level(self, level):
|
||||||
|
return [
|
||||||
|
{'content': 'SYNOPSIS', 'level': 2},
|
||||||
|
{'content': 'DESCRIPTION', 'level': 2},
|
||||||
|
{'content': 'EXAMPLES', 'level': 2}
|
||||||
|
]
|
||||||
|
|
||||||
|
doc = MockDocument()
|
||||||
|
result = validator.check(doc)
|
||||||
|
|
||||||
|
# Should pass
|
||||||
|
assert result.is_valid()
|
||||||
|
assert not result.has_errors()
|
||||||
|
assert not result.has_warnings()
|
||||||
|
assert len(result.issues) == 0
|
||||||
|
|
||||||
|
def test_section_alternatives(self):
    """A heading listed under ``alternatives`` satisfies a required section.

    OPTIONS is required, but the stub document only contains FLAGS, which
    the schema names as an accepted alternative. The result must be valid
    and free of errors.
    """
    options_rule = {
        'classification': 'required',
        'heading_level': 2,
        'alternatives': ['FLAGS', 'COMMAND OPTIONS'],
    }
    section_schema = {'x-markitect-sections': {'OPTIONS': options_rule}}

    class StubDocument:
        """Stand-in document whose only heading is the alternative name."""

        def get_headings_by_level(self, level):
            # Only the alternative name is present, never 'OPTIONS' itself.
            flags_heading = {'content': 'FLAGS', 'level': 2}
            return [flags_heading]

    section_validator = SectionValidator(section_schema)
    outcome = section_validator.check(StubDocument())

    # The alternative heading counts as the required section.
    assert outcome.is_valid()
    assert not outcome.has_errors()
|
||||||
|
|
||||||
|
|
||||||
|
class TestSemanticValidator:
    """Tests for the semantic validator, its report, and schema loading."""

    def test_validator_initialization(self):
        """The validator keeps the schema it was given and builds a section validator."""
        synopsis_rule = {'classification': 'required', 'heading_level': 2}
        schema = {
            '$schema': 'http://json-schema.org/draft-07/schema#',
            'x-markitect-sections': {'SYNOPSIS': synopsis_rule},
        }

        semantic_validator = SemanticValidator(schema)

        assert semantic_validator.schema == schema
        assert semantic_validator.section_validator is not None

    def test_validation_report_formatting(self):
        """A report wrapping one section error is invalid and renders the failure."""
        from markitect.validators.section_validator import (
            SectionMissing,
            SectionValidationResult,
        )

        missing_synopsis = SectionMissing(
            section_name='SYNOPSIS',
            severity='ERROR',
            message='SYNOPSIS is required',
            classification='required',
        )
        section_outcome = SectionValidationResult(
            issues=[missing_synopsis],
            sections_checked=2,
            sections_found=1,
        )

        report = SemanticValidationReport(section_result=section_outcome)

        # Status flags on the report itself.
        assert report.has_errors()
        assert not report.is_valid()

        # The rendered text must contain each of these fragments.
        rendered = report.format_text()
        for fragment in ('Section Validation:', 'SYNOPSIS', 'Errors: 1', 'FAILED'):
            assert fragment in rendered

    def test_load_json_schema(self, tmp_path):
        """A schema written to a ``.json`` file is loaded back unchanged."""
        expected = {
            '$schema': 'http://json-schema.org/draft-07/schema#',
            'title': 'Test Schema',
            'x-markitect-sections': {
                'SYNOPSIS': {'classification': 'required', 'heading_level': 2}
            },
        }
        schema_path = tmp_path / "test-schema.json"
        schema_path.write_text(json.dumps(expected, indent=2))

        loaded = load_schema_from_path(schema_path)

        assert loaded == expected
        assert 'x-markitect-sections' in loaded

    def test_schema_not_found(self):
        """Loading from a path that does not exist raises ``FileNotFoundError``."""
        missing_path = '/nonexistent/schema.json'
        with pytest.raises(FileNotFoundError):
            load_schema_from_path(missing_path)

    def test_unsupported_schema_format(self, tmp_path):
        """A schema file with an ``.xml`` suffix is rejected with ``ValueError``."""
        xml_path = tmp_path / "schema.xml"
        xml_path.write_text('<schema></schema>')

        with pytest.raises(ValueError, match="Unsupported schema format"):
            load_schema_from_path(xml_path)
|
||||||
|
|
||||||
|
|
||||||
|
class TestContentValidator:
    """Tests for pattern and word-count checks from x-markitect-content-control."""

    @staticmethod
    def _document_with_section(section_name, content):
        """Build a stub document that exposes exactly one named section.

        The stub's ``get_section`` returns a ``{'name', 'content'}`` dict for
        *section_name* and ``None`` for any other name.
        """

        class StubDocument:
            def get_section(self, name):
                if name != section_name:
                    return None
                return {'name': section_name, 'content': content}

        return StubDocument()

    def test_required_pattern_missing(self):
        """A section lacking a required pattern yields one PatternMissing error."""
        bold_command = r'\*\*[a-z][a-z0-9-]*\*\*'  # Bold command name
        content_schema = {
            'x-markitect-content-control': {
                'synopsis': {'required_patterns': [bold_command]}
            }
        }
        # The synopsis text contains no bold command name.
        document = self._document_with_section(
            'SYNOPSIS', 'command [options] arguments'
        )

        outcome = ContentValidator(content_schema).check(document)

        assert not outcome.is_valid()
        assert outcome.has_errors()
        assert len(outcome.get_errors()) == 1

        first_error = outcome.get_errors()[0]
        assert isinstance(first_error, PatternMissing)
        assert first_error.section_name == 'SYNOPSIS'
        assert first_error.severity == 'ERROR'

    def test_forbidden_pattern_found(self):
        """A forbidden pattern in the section text yields one ForbiddenPattern error."""
        content_schema = {
            'x-markitect-content-control': {
                'description': {
                    'forbidden_patterns': [r'\bTODO\b', r'\bFIXME\b']
                }
            }
        }
        # Only TODO occurs in the text; FIXME does not.
        document = self._document_with_section(
            'DESCRIPTION', 'This is a description. TODO: Add more details.'
        )

        outcome = ContentValidator(content_schema).check(document)

        assert not outcome.is_valid()
        assert outcome.has_errors()
        assert len(outcome.get_errors()) == 1

        first_error = outcome.get_errors()[0]
        assert isinstance(first_error, ForbiddenPattern)
        assert first_error.section_name == 'DESCRIPTION'
        assert 'TODO' in first_error.matched_text

    def test_discouraged_pattern_warning(self):
        """A discouraged pattern produces a warning and leaves the result valid."""
        content_schema = {
            'x-markitect-content-control': {
                'description': {'discouraged_patterns': [r'\bWIP\b']}
            }
        }
        document = self._document_with_section(
            'DESCRIPTION', 'This is WIP content.'
        )

        outcome = ContentValidator(content_schema).check(document)

        # Warnings alone do not fail validation.
        assert outcome.is_valid()
        assert not outcome.has_errors()
        assert outcome.has_warnings()

        first_warning = outcome.get_warnings()[0]
        assert isinstance(first_warning, DiscouragedPattern)
        assert first_warning.severity == 'WARNING'

    def test_content_too_short(self):
        """Text below ``min_words`` produces a ContentTooShort warning."""
        word_limits = {'min_words': 50, 'max_words': 1000}
        content_schema = {
            'x-markitect-content-control': {
                'description': {'content_quality': word_limits}
            }
        }
        # Two words against a minimum of fifty.
        document = self._document_with_section(
            'DESCRIPTION', 'Short description.'
        )

        outcome = ContentValidator(content_schema).check(document)

        assert outcome.is_valid()  # Warnings don't fail
        assert outcome.has_warnings()

        first_warning = outcome.get_warnings()[0]
        assert isinstance(first_warning, ContentTooShort)
        assert first_warning.actual == 2
        assert first_warning.required == 50

    def test_content_too_long(self):
        """Text above ``max_words`` produces a ContentTooLong warning."""
        word_limits = {'min_words': 5, 'max_words': 20}
        content_schema = {
            'x-markitect-content-control': {
                'synopsis': {'content_quality': word_limits}
            }
        }
        # Fifty words against a maximum of twenty.
        fifty_words = ' '.join(['word'] * 50)
        document = self._document_with_section('SYNOPSIS', fifty_words)

        outcome = ContentValidator(content_schema).check(document)

        assert outcome.is_valid()
        assert outcome.has_warnings()

        first_warning = outcome.get_warnings()[0]
        assert isinstance(first_warning, ContentTooLong)
        assert first_warning.actual == 50
        assert first_warning.limit == 20

    def test_all_content_requirements_met(self):
        """A section meeting its pattern and word-count rules has no issues."""
        content_schema = {
            'x-markitect-content-control': {
                'synopsis': {
                    'required_patterns': [r'\*\*[a-z]+\*\*'],
                    'content_quality': {'min_words': 5, 'max_words': 50},
                }
            }
        }
        document = self._document_with_section(
            'SYNOPSIS', '**command** [options] arguments and more words here'
        )

        outcome = ContentValidator(content_schema).check(document)

        assert outcome.is_valid()
        assert not outcome.has_errors()
        assert not outcome.has_warnings()
        assert len(outcome.issues) == 0
|
||||||
Reference in New Issue
Block a user