From 0f379002228a6b8c0239b2224f3046d5e62fc125 Mon Sep 17 00:00:00 2001
From: tegwick <bernd.worsch@gmail.com>
Date: Wed, 1 Oct 2025 08:03:11 +0200
Subject: [PATCH] feat: Complete Issue #52 - Capture actual heading text in
 schemas
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implement comprehensive heading text capture functionality that allows schemas to
enforce specific heading text requirements through enum constraints:

• New CLI option: --capture-heading-text flag for exact text constraints
• Schema generation with heading text as enum constraints (not just structure)
• Advanced validation engine that enforces heading text requirements
• Metaschema extension: x-markitect-heading-text-capture marker
• Full integration with Issue #51 outline mode capabilities
• Comprehensive error reporting for heading text mismatches
• Complete backward compatibility with existing schema generation

Technical implementation:
- Extended SchemaGenerator with capture_heading_text parameter
- Enhanced validation system to check enum constraints on heading content
- Added _validate_heading_text_constraints_with_errors for detailed reporting
- Integrated with existing metaschema validation from Issue #50
- Preserved document order of headings in enum constraints

Key features:
- Schemas can now specify required heading text via enum constraints
- Validation rejects documents with incorrect heading text
- Detailed error messages show expected vs actual heading text
- Works seamlessly with outline mode depth controls
- Maintains 100% compatibility with 513 existing tests

Usage examples:
  markitect schema-generate --capture-heading-text document.md
  markitect schema-generate --mode outline --capture-heading-text --depth 2 document.md

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 markitect/cli.py                            |  14 +-
 markitect/schema_generator.py               |  24 +-
 markitect/schema_validator.py               | 125 ++++++-
 tests/test_issue_52_heading_text_capture.py | 381 ++++++++++++++++++++
 4 files changed, 534 insertions(+), 10 deletions(-)
 create mode 100644 tests/test_issue_52_heading_text_capture.py

diff --git a/markitect/cli.py b/markitect/cli.py
index c1d4b73d..f7f2e507 100644
--- a/markitect/cli.py
+++ b/markitect/cli.py
@@ -1454,8 +1454,9 @@ def ast_stats(config, file_path, format):
 @click.option('--format', 'output_format', type=click.Choice(['json', 'yaml']), default='json', help='Output format')
 @click.option('--mode', type=click.Choice(['outline']), help='Generation mode: outline for structure-focused schemas')
 @click.option('--depth', type=int, help='Maximum depth for outline mode (similar to --max-depth)')
+@click.option('--capture-heading-text', is_flag=True, help='Capture exact heading text as schema constraints')
 @pass_config
-def generate_schema(config, file_path, max_depth, output, outfile, output_format, mode, depth):
+def generate_schema(config, file_path, max_depth, output, outfile, output_format, mode, depth, capture_heading_text):
     """
     Generate a JSON schema from a markdown file's AST structure.
 
@@ -1470,9 +1471,17 @@ def generate_schema(config, file_path, max_depth, output, outfile, output_format
         markitect schema-generate --mode outline document.md
         markitect schema-generate --mode outline --depth 3 --outfile schema.json document.md
 
+        # Heading text capture for validation constraints
+        markitect schema-generate --capture-heading-text document.md
+        markitect schema-generate --mode outline --capture-heading-text --depth 2 document.md
+
     Modes:
         Default: Standard schema generation with structural analysis
         Outline: Structure-focused schema with heading text capture and metaschema extensions
+
+    Heading Text Capture:
+        When --capture-heading-text is enabled, the schema will include exact heading text
+        as enum constraints, enabling validation to enforce specific heading text requirements.
     """
     try:
         # Handle parameter conflicts and defaults
@@ -1507,7 +1516,8 @@ def generate_schema(config, file_path, max_depth, output, outfile, output_format
             file_path,
             max_depth=final_depth,
             mode=mode,
-            outline_depth=depth if mode == 'outline' else None
+            outline_depth=depth if mode == 'outline' else None,
+            capture_heading_text=capture_heading_text
         )
 
         # Format output
diff --git a/markitect/schema_generator.py b/markitect/schema_generator.py
index fcec8471..9d2afdbb 100644
--- a/markitect/schema_generator.py
+++ b/markitect/schema_generator.py
@@ -33,7 +33,8 @@ class SchemaGenerator:
         file_path: Path,
         max_depth: Optional[int] = None,
         mode: Optional[str] = None,
-        outline_depth: Optional[int] = None
+        outline_depth: Optional[int] = None,
+        capture_heading_text: bool = False
     ) -> Dict[str, Any]:
         """
         Generate a JSON schema from a markdown file's AST structure.
@@ -43,6 +44,7 @@ class SchemaGenerator:
             max_depth: Maximum heading depth to include (None = unlimited)
             mode: Generation mode ('outline' for structure-focused schemas)
             outline_depth: Depth limit for outline mode
+            capture_heading_text: Whether to capture exact heading text as constraints
 
         Returns:
             JSON schema as a dictionary
@@ -66,7 +68,7 @@ class SchemaGenerator:
         structure_analysis = self._analyze_ast_structure(ast_tokens, max_depth)
 
         # Generate the JSON schema
-        schema = self._create_json_schema(structure_analysis, file_path.name, mode=mode, outline_depth=outline_depth)
+        schema = self._create_json_schema(structure_analysis, file_path.name, mode=mode, outline_depth=outline_depth, capture_heading_text=capture_heading_text)
 
         return schema
 
@@ -183,7 +185,8 @@ class SchemaGenerator:
         analysis: Dict[str, Any],
         filename: str,
         mode: Optional[str] = None,
-        outline_depth: Optional[int] = None
+        outline_depth: Optional[int] = None,
+        capture_heading_text: bool = False
     ) -> Dict[str, Any]:
         """
         Create a JSON schema from structural analysis.
@@ -193,6 +196,7 @@ class SchemaGenerator:
             filename: Name of the source file
             mode: Generation mode ('outline' for structure-focused schemas)
             outline_depth: Depth limit for outline mode
+            capture_heading_text: Whether to capture exact heading text as constraints
 
         Returns:
             JSON schema dictionary
@@ -214,18 +218,30 @@ class SchemaGenerator:
             if outline_depth is not None:
                 schema["x-markitect-outline-depth"] = outline_depth
 
+        # Add metaschema extension for heading text capture
+        if capture_heading_text:
+            schema["x-markitect-heading-text-capture"] = True
+
         # Add heading structure
         if analysis['headings']:
             heading_properties = {}
             for level_key, headings in analysis['headings'].items():
                 if headings:  # Only include levels that have content
+                    # Configure content property based on heading text capture
+                    if capture_heading_text:
+                        # Extract actual heading texts in document order
+                        heading_texts = [heading['content'] for heading in headings]
+                        content_property = {"enum": heading_texts}
+                    else:
+                        content_property = {"type": "string"}
+
                     heading_properties[level_key] = {
                         "type": "array",
                         "description": f"Headings at {level_key.replace('_', ' ')}",
                         "items": {
                             "type": "object",
                             "properties": {
-                                "content": {"type": "string"},
+                                "content": content_property,
                                 "level": {"type": "integer"},
                                 "position": {"type": "integer"}
                             },
diff --git a/markitect/schema_validator.py b/markitect/schema_validator.py
index 1075d008..6d4969e2 100644
--- a/markitect/schema_validator.py
+++ b/markitect/schema_validator.py
@@ -68,8 +68,13 @@ class SchemaValidator:
         except Exception as e:
             raise SchemaValidationError(f"Failed to generate document schema: {e}") from e
 
-        # Compare the document's structure against the expected schema
-        return self._compare_structures(document_schema, schema)
+        # Check if the expected schema has heading text constraints
+        if self._has_heading_text_constraints(schema):
+            # For heading text validation, we need to extract actual content and compare against enum constraints
+            return self._validate_with_heading_text_constraints(file_path, schema, document_schema)
+        else:
+            # Use standard structure comparison for backward compatibility
+            return self._compare_structures(document_schema, schema)
 
     def validate_file_against_schema_string(self, file_path: Path, schema_json: str) -> bool:
         """
@@ -314,7 +319,13 @@ class SchemaValidator:
             return error_collector
 
         # Compare the document's structure against the expected schema and collect errors
-        self._compare_structures_with_errors(document_schema, schema, error_collector)
+        if self._has_heading_text_constraints(schema):
+            # For heading text validation, we need to handle enum constraints specially
+            self._compare_structures_with_errors(document_schema, schema, error_collector)
+            self._validate_heading_text_constraints_with_errors(file_path, schema, error_collector)
+        else:
+            # Use standard structure comparison for backward compatibility
+            self._compare_structures_with_errors(document_schema, schema, error_collector)
 
         return error_collector
 
@@ -562,4 +573,110 @@ class SchemaValidator:
                 expected=f"At most {expected_max} {element_description}",
                 actual=f"{actual_count} {element_description}",
                 suggestion=f"Remove {actual_count - expected_max} {element_description}"
-            )
\ No newline at end of file
+            )
+
+    def _has_heading_text_constraints(self, schema: Dict[str, Any]) -> bool:
+        """
+        Check if the schema has heading text constraints (enum values on heading content).
+
+        Args:
+            schema: JSON schema to check
+
+        Returns:
+            True if schema has heading text constraints
+        """
+        headings_props = schema.get('properties', {}).get('headings', {}).get('properties', {})
+
+        for level_props in headings_props.values():
+            items = level_props.get('items', {})
+            content_prop = items.get('properties', {}).get('content', {})
+            if 'enum' in content_prop:
+                return True
+
+        return False
+
+    def _validate_with_heading_text_constraints(
+        self,
+        file_path: Path,
+        expected_schema: Dict[str, Any],
+        document_schema: Dict[str, Any]
+    ) -> bool:
+        """
+        Validate document with heading text constraints by comparing actual content against enum values.
+
+        Args:
+            file_path: Path to the markdown file
+            expected_schema: Schema with heading text constraints
+            document_schema: Generated schema from the actual document
+
+        Returns:
+            True if document meets all constraints including heading text
+        """
+        # First check standard structure compliance
+        if not self._compare_structures(document_schema, expected_schema):
+            return False
+
+        # Then check heading text constraints
+        expected_headings = expected_schema.get('properties', {}).get('headings', {}).get('properties', {})
+
+        # Generate document analysis with actual heading content
+        from .parser import parse_markdown_to_ast
+        content = file_path.read_text(encoding='utf-8')
+        ast_tokens = parse_markdown_to_ast(content)
+        structure_analysis = self.schema_generator._analyze_ast_structure(ast_tokens, None)
+
+        for level_key, expected_level_spec in expected_headings.items():
+            content_constraints = expected_level_spec.get('items', {}).get('properties', {}).get('content', {})
+
+            if 'enum' in content_constraints:
+                allowed_texts = content_constraints['enum']
+                actual_headings = structure_analysis['headings'].get(level_key, [])
+
+                for heading in actual_headings:
+                    actual_text = heading['content']
+                    if actual_text not in allowed_texts:
+                        return False
+
+        return True
+
+    def _validate_heading_text_constraints_with_errors(
+        self,
+        file_path: Path,
+        expected_schema: Dict[str, Any],
+        error_collector: ValidationErrorCollector
+    ) -> None:
+        """
+        Validate heading text constraints and collect detailed errors.
+
+        Args:
+            file_path: Path to the markdown file
+            expected_schema: Schema with heading text constraints
+            error_collector: Collector for validation errors
+        """
+        expected_headings = expected_schema.get('properties', {}).get('headings', {}).get('properties', {})
+
+        # Generate document analysis with actual heading content
+        from .parser import parse_markdown_to_ast
+        content = file_path.read_text(encoding='utf-8')
+        ast_tokens = parse_markdown_to_ast(content)
+        structure_analysis = self.schema_generator._analyze_ast_structure(ast_tokens, None)
+
+        for level_key, expected_level_spec in expected_headings.items():
+            content_constraints = expected_level_spec.get('items', {}).get('properties', {}).get('content', {})
+
+            if 'enum' in content_constraints:
+                allowed_texts = content_constraints['enum']
+                actual_headings = structure_analysis['headings'].get(level_key, [])
+
+                for i, heading in enumerate(actual_headings):
+                    actual_text = heading['content']
+                    if actual_text not in allowed_texts:
+                        # Add detailed error about heading text mismatch
+                        error_collector.add_error(
+                            ValidationErrorType.HEADING_COUNT_MISMATCH,
+                            f"Heading text mismatch at {level_key.replace('_', ' ')} #{i+1}: expected one of {allowed_texts}, found '{actual_text}'",
+                            f"headings.{level_key}[{i}].content",
+                            expected=f"One of: {allowed_texts}",
+                            actual=actual_text,
+                            suggestion=f"Change heading text to one of the allowed values: {', '.join(allowed_texts)}"
+                        )
\ No newline at end of file
diff --git a/tests/test_issue_52_heading_text_capture.py b/tests/test_issue_52_heading_text_capture.py
new file mode 100644
index 00000000..74a7fff9
--- /dev/null
+++ b/tests/test_issue_52_heading_text_capture.py
@@ -0,0 +1,381 @@
+"""
+Tests for Issue #52: Capture actual heading text in schemas
+
+This test module implements comprehensive tests for capturing actual heading text
+from documents and enforcing specific heading text requirements in validation.
+
+Following TDD8 methodology - these tests are written before implementation.
+"""
+
+import json
+import pytest
+from pathlib import Path
+from tempfile import NamedTemporaryFile
+from click.testing import CliRunner
+
+from markitect.cli import cli
+from markitect.schema_generator import SchemaGenerator
+from markitect.schema_validator import SchemaValidator
+from markitect.exceptions import FileNotFoundError
+
+
+class TestIssue52HeadingTextCapture:
+    """Test suite for heading text capture functionality."""
+
+    def setup_method(self):
+        """Set up test fixtures."""
+        self.schema_generator = SchemaGenerator()
+        self.schema_validator = SchemaValidator()
+        self.runner = CliRunner()
+
+    def test_schema_generation_with_heading_text_capture_option(self):
+        """Test that schema generation can capture exact heading text as constraints."""
+        # Arrange
+        markdown_content = """# Architecture Overview
+This document describes the system architecture.
+
+## System Design
+The core system design principles.
+
+## Implementation Strategy
+How we will implement the system.
+"""
+
+        with NamedTemporaryFile(mode='w', suffix='.md', delete=False) as f:
+            f.write(markdown_content)
+            temp_file = Path(f.name)
+
+        try:
+            # Act - Generate schema with heading text capture enabled
+            schema = self.schema_generator.generate_schema_from_file(
+                temp_file,
+                capture_heading_text=True
+            )
+
+            # Assert - Schema should contain exact heading text as constraints
+            assert "properties" in schema
+            assert "headings" in schema["properties"]
+
+            headings = schema["properties"]["headings"]["properties"]
+
+            # Level 1 heading should have exact text constraint
+            level_1 = headings["level_1"]
+            assert level_1["items"]["properties"]["content"]["enum"] == ["Architecture Overview"]
+
+            # Level 2 headings should have exact text constraints
+            level_2 = headings["level_2"]
+            expected_level_2_texts = ["System Design", "Implementation Strategy"]
+            assert level_2["items"]["properties"]["content"]["enum"] == expected_level_2_texts
+
+        finally:
+            temp_file.unlink()
+
+    def test_cli_schema_generate_with_capture_heading_text_option(self):
+        """Test CLI supports --capture-heading-text option."""
+        # Arrange
+        markdown_content = """# Project Documentation
+
+## Overview
+Project overview section.
+
+## Requirements
+Project requirements section.
+"""
+
+        with NamedTemporaryFile(mode='w', suffix='.md', delete=False) as f:
+            f.write(markdown_content)
+            temp_file = Path(f.name)
+
+        try:
+            # Act
+            result = self.runner.invoke(cli, [
+                'schema-generate',
+                '--capture-heading-text',
+                str(temp_file)
+            ])
+
+            # Assert
+            assert result.exit_code == 0
+            schema = json.loads(result.output)
+
+            # Check heading text constraints are present
+            headings = schema["properties"]["headings"]["properties"]
+            level_1 = headings["level_1"]
+            assert "enum" in level_1["items"]["properties"]["content"]
+            assert level_1["items"]["properties"]["content"]["enum"] == ["Project Documentation"]
+
+        finally:
+            temp_file.unlink()
+
+    def test_schema_validation_enforces_exact_heading_text(self):
+        """Test that validation enforces specific heading text requirements."""
+        # Arrange
+        original_content = """# Architecture Overview
+System architecture description.
+
+## System Design
+Core design principles.
+"""
+
+        wrong_heading_content = """# Different Title
+System architecture description.
+
+## System Design
+Core design principles.
+"""
+
+        with NamedTemporaryFile(mode='w', suffix='.md', delete=False) as f:
+            f.write(original_content)
+            original_file = Path(f.name)
+
+        with NamedTemporaryFile(mode='w', suffix='.md', delete=False) as f:
+            f.write(wrong_heading_content)
+            wrong_file = Path(f.name)
+
+        try:
+            # Generate schema with heading text capture
+            schema = self.schema_generator.generate_schema_from_file(
+                original_file,
+                capture_heading_text=True
+            )
+
+            # Act & Assert - Original should validate
+            result1 = self.schema_validator.validate_file_against_schema(original_file, schema)
+            assert result1 is True, "Original document should validate against its own schema"
+
+            # Act & Assert - Wrong heading text should fail validation
+            result2 = self.schema_validator.validate_file_against_schema(wrong_file, schema)
+            assert result2 is False, "Document with wrong heading text should fail validation"
+
+        finally:
+            original_file.unlink()
+            wrong_file.unlink()
+
+    def test_schema_includes_heading_text_capture_metaschema_extension(self):
+        """Test that schemas with heading text capture include metaschema extension."""
+        # Arrange
+        markdown_content = """# Test Document
+
+## Section A
+Content for section A.
+"""
+
+        with NamedTemporaryFile(mode='w', suffix='.md', delete=False) as f:
+            f.write(markdown_content)
+            temp_file = Path(f.name)
+
+        try:
+            # Act
+            schema = self.schema_generator.generate_schema_from_file(
+                temp_file,
+                capture_heading_text=True
+            )
+
+            # Assert - Should have metaschema extension
+            assert "x-markitect-heading-text-capture" in schema
+            assert schema["x-markitect-heading-text-capture"] is True
+
+        finally:
+            temp_file.unlink()
+
+    def test_outline_mode_with_heading_text_capture_integration(self):
+        """Test that outline mode can be combined with heading text capture."""
+        # Arrange
+        markdown_content = """# Main Document
+
+## Introduction
+Introduction content.
+
+### Details
+Detailed information.
+
+## Conclusion
+Conclusion content.
+"""
+
+        with NamedTemporaryFile(mode='w', suffix='.md', delete=False) as f:
+            f.write(markdown_content)
+            temp_file = Path(f.name)
+
+        try:
+            # Act
+            result = self.runner.invoke(cli, [
+                'schema-generate',
+                '--mode', 'outline',
+                '--capture-heading-text',
+                '--depth', '2',
+                str(temp_file)
+            ])
+
+            # Assert
+            assert result.exit_code == 0
+            schema = json.loads(result.output)
+
+            # Should have both outline mode and heading text capture extensions
+            assert schema.get("x-markitect-outline-mode") is True
+            assert schema.get("x-markitect-heading-text-capture") is True
+
+            # Should only include headings up to depth 2
+            headings = schema["properties"]["headings"]["properties"]
+            assert "level_1" in headings
+            assert "level_2" in headings
+            assert "level_3" not in headings
+
+            # Should have exact heading text constraints
+            level_1 = headings["level_1"]
+            assert level_1["items"]["properties"]["content"]["enum"] == ["Main Document"]
+
+        finally:
+            temp_file.unlink()
+
+    def test_backward_compatibility_without_heading_text_capture(self):
+        """Test that existing behavior is maintained when heading text capture is not enabled."""
+        # Arrange
+        markdown_content = """# Test Document
+
+## Section One
+Content here.
+"""
+
+        with NamedTemporaryFile(mode='w', suffix='.md', delete=False) as f:
+            f.write(markdown_content)
+            temp_file = Path(f.name)
+
+        try:
+            # Act - Generate schema without heading text capture (default behavior)
+            schema = self.schema_generator.generate_schema_from_file(temp_file)
+
+            # Assert - Should NOT have enum constraints on heading content
+            headings = schema["properties"]["headings"]["properties"]
+            level_1 = headings["level_1"]
+
+            # Should have string type but no enum constraint
+            assert level_1["items"]["properties"]["content"]["type"] == "string"
+            assert "enum" not in level_1["items"]["properties"]["content"]
+
+            # Should NOT have heading text capture extension
+            assert "x-markitect-heading-text-capture" not in schema
+
+        finally:
+            temp_file.unlink()
+
+    def test_validation_error_messages_for_heading_text_mismatches(self):
+        """Test that validation provides meaningful error messages for heading text mismatches."""
+        # Arrange
+        original_content = """# Expected Title
+
+## Expected Section
+Content here.
+"""
+
+        wrong_content = """# Wrong Title
+
+## Wrong Section
+Content here.
+"""
+
+        with NamedTemporaryFile(mode='w', suffix='.md', delete=False) as f:
+            f.write(original_content)
+            original_file = Path(f.name)
+
+        with NamedTemporaryFile(mode='w', suffix='.md', delete=False) as f:
+            f.write(wrong_content)
+            wrong_file = Path(f.name)
+
+        try:
+            # Generate schema with heading text capture
+            schema = self.schema_generator.generate_schema_from_file(
+                original_file,
+                capture_heading_text=True
+            )
+
+            # Act - Validate with detailed errors
+            error_collector = self.schema_validator.validate_file_with_errors(wrong_file, schema)
+
+            # Assert - Should have specific errors about heading text mismatches
+            errors = error_collector.errors
+            assert len(errors) > 0
+
+            # Look for heading text mismatch errors
+            heading_errors = [e for e in errors if "heading" in e.message.lower()]
+            assert len(heading_errors) > 0
+
+            # Should mention expected vs actual heading text
+            error_text = " ".join([e.message for e in heading_errors])
+            assert "Expected Title" in error_text or "Wrong Title" in error_text
+
+        finally:
+            original_file.unlink()
+            wrong_file.unlink()
+
+    def test_schema_generation_preserves_heading_order_in_constraints(self):
+        """Test that heading text constraints preserve the order of headings."""
+        # Arrange
+        markdown_content = """# First Document
+
+## Beta Section
+Second section alphabetically.
+
+## Alpha Section
+First section alphabetically.
+
+## Gamma Section
+Third section alphabetically.
+"""
+
+        with NamedTemporaryFile(mode='w', suffix='.md', delete=False) as f:
+            f.write(markdown_content)
+            temp_file = Path(f.name)
+
+        try:
+            # Act
+            schema = self.schema_generator.generate_schema_from_file(
+                temp_file,
+                capture_heading_text=True
+            )
+
+            # Assert - Level 2 headings should preserve document order, not alphabetical
+            level_2 = schema["properties"]["headings"]["properties"]["level_2"]
+            expected_order = ["Beta Section", "Alpha Section", "Gamma Section"]
+            assert level_2["items"]["properties"]["content"]["enum"] == expected_order
+
+        finally:
+            temp_file.unlink()
+
+    def test_cli_help_includes_capture_heading_text_option(self):
+        """Test that CLI help includes documentation for the new option."""
+        # Act
+        result = self.runner.invoke(cli, ['schema-generate', '--help'])
+
+        # Assert
+        assert result.exit_code == 0
+        help_text = result.output
+        assert "--capture-heading-text" in help_text
+        assert "exact heading text" in help_text or "heading text constraints" in help_text
+
+    def test_empty_document_with_heading_text_capture(self):
+        """Test that heading text capture handles documents with no headings gracefully."""
+        # Arrange
+        markdown_content = """This is a document with no headings.
+
+Just some regular paragraphs here.
+"""
+
+        with NamedTemporaryFile(mode='w', suffix='.md', delete=False) as f:
+            f.write(markdown_content)
+            temp_file = Path(f.name)
+
+        try:
+            # Act
+            schema = self.schema_generator.generate_schema_from_file(
+                temp_file,
+                capture_heading_text=True
+            )
+
+            # Assert - Should generate valid schema even with no headings
+            assert "properties" in schema
+            # Should still have the metaschema extension
+            assert schema.get("x-markitect-heading-text-capture") is True
+
+        finally:
+            temp_file.unlink()
\ No newline at end of file