feat: Complete Issue #52 - Capture actual heading text in schemas

Implement comprehensive heading text capture functionality that allows schemas to
enforce specific heading text requirements through enum constraints:

• New CLI option: --capture-heading-text flag for exact text constraints
• Schema generation with heading text as enum constraints (not just structure)
• Advanced validation engine that enforces heading text requirements
• Metaschema extension: x-markitect-heading-text-capture marker
• Full integration with Issue #51 outline mode capabilities
• Comprehensive error reporting for heading text mismatches
• Complete backward compatibility with existing schema generation

Technical implementation:
- Extended SchemaGenerator with capture_heading_text parameter
- Enhanced validation system to check enum constraints on heading content
- Added _validate_heading_text_constraints_with_errors for detailed reporting
- Integrated with existing metaschema validation from Issue #50
- Preserved document order of headings in enum constraints

Key features:
- Schemas can now specify required heading text via enum constraints
- Validation rejects documents with incorrect heading text
- Detailed error messages show expected vs actual heading text
- Works seamlessly with outline mode depth controls
- Maintains 100% compatibility with 513 existing tests

Usage examples:
  markitect schema-generate --capture-heading-text document.md
  markitect schema-generate --mode outline --capture-heading-text --depth 2 document.md

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2025-10-01 08:03:11 +02:00
parent b5f510f9c7
commit 0f37900222
4 changed files with 534 additions and 10 deletions

View File

@@ -68,8 +68,13 @@ class SchemaValidator:
except Exception as e:
raise SchemaValidationError(f"Failed to generate document schema: {e}") from e
# Compare the document's structure against the expected schema
return self._compare_structures(document_schema, schema)
# Check if the expected schema has heading text constraints
if self._has_heading_text_constraints(schema):
# For heading text validation, we need to extract actual content and compare against enum constraints
return self._validate_with_heading_text_constraints(file_path, schema, document_schema)
else:
# Use standard structure comparison for backward compatibility
return self._compare_structures(document_schema, schema)
def validate_file_against_schema_string(self, file_path: Path, schema_json: str) -> bool:
"""
@@ -314,7 +319,13 @@ class SchemaValidator:
return error_collector
# Compare the document's structure against the expected schema and collect errors
self._compare_structures_with_errors(document_schema, schema, error_collector)
if self._has_heading_text_constraints(schema):
# For heading text validation, we need to handle enum constraints specially
self._compare_structures_with_errors(document_schema, schema, error_collector)
self._validate_heading_text_constraints_with_errors(file_path, schema, error_collector)
else:
# Use standard structure comparison for backward compatibility
self._compare_structures_with_errors(document_schema, schema, error_collector)
return error_collector
@@ -562,4 +573,110 @@ class SchemaValidator:
expected=f"At most {expected_max} {element_description}",
actual=f"{actual_count} {element_description}",
suggestion=f"Remove {actual_count - expected_max} {element_description}"
)
)
def _has_heading_text_constraints(self, schema: Dict[str, Any]) -> bool:
"""
Check if the schema has heading text constraints (enum values on heading content).
Args:
schema: JSON schema to check
Returns:
True if schema has heading text constraints
"""
headings_props = schema.get('properties', {}).get('headings', {}).get('properties', {})
for level_props in headings_props.values():
items = level_props.get('items', {})
content_prop = items.get('properties', {}).get('content', {})
if 'enum' in content_prop:
return True
return False
def _validate_with_heading_text_constraints(
self,
file_path: Path,
expected_schema: Dict[str, Any],
document_schema: Dict[str, Any]
) -> bool:
"""
Validate document with heading text constraints by comparing actual content against enum values.
Args:
file_path: Path to the markdown file
expected_schema: Schema with heading text constraints
document_schema: Generated schema from the actual document
Returns:
True if document meets all constraints including heading text
"""
# First check standard structure compliance
if not self._compare_structures(document_schema, expected_schema):
return False
# Then check heading text constraints
expected_headings = expected_schema.get('properties', {}).get('headings', {}).get('properties', {})
# Generate document analysis with actual heading content
from .parser import parse_markdown_to_ast
content = file_path.read_text(encoding='utf-8')
ast_tokens = parse_markdown_to_ast(content)
structure_analysis = self.schema_generator._analyze_ast_structure(ast_tokens, None)
for level_key, expected_level_spec in expected_headings.items():
content_constraints = expected_level_spec.get('items', {}).get('properties', {}).get('content', {})
if 'enum' in content_constraints:
allowed_texts = content_constraints['enum']
actual_headings = structure_analysis['headings'].get(level_key, [])
for heading in actual_headings:
actual_text = heading['content']
if actual_text not in allowed_texts:
return False
return True
def _validate_heading_text_constraints_with_errors(
self,
file_path: Path,
expected_schema: Dict[str, Any],
error_collector: ValidationErrorCollector
) -> None:
"""
Validate heading text constraints and collect detailed errors.
Args:
file_path: Path to the markdown file
expected_schema: Schema with heading text constraints
error_collector: Collector for validation errors
"""
expected_headings = expected_schema.get('properties', {}).get('headings', {}).get('properties', {})
# Generate document analysis with actual heading content
from .parser import parse_markdown_to_ast
content = file_path.read_text(encoding='utf-8')
ast_tokens = parse_markdown_to_ast(content)
structure_analysis = self.schema_generator._analyze_ast_structure(ast_tokens, None)
for level_key, expected_level_spec in expected_headings.items():
content_constraints = expected_level_spec.get('items', {}).get('properties', {}).get('content', {})
if 'enum' in content_constraints:
allowed_texts = content_constraints['enum']
actual_headings = structure_analysis['headings'].get(level_key, [])
for i, heading in enumerate(actual_headings):
actual_text = heading['content']
if actual_text not in allowed_texts:
# Add detailed error about heading text mismatch
error_collector.add_error(
ValidationErrorType.HEADING_COUNT_MISMATCH,
f"Heading text mismatch at {level_key.replace('_', ' ')} #{i+1}: expected one of {allowed_texts}, found '{actual_text}'",
f"headings.{level_key}[{i}].content",
expected=f"One of: {allowed_texts}",
actual=actual_text,
suggestion=f"Change heading text to one of the allowed values: {', '.join(allowed_texts)}"
)