Files
markitect-main/tests/test_issue_149_roundtrip_validation.py
tegwick 4f16166e94 feat: implement comprehensive front matter preservation and unicode handling
This commit provides complete front matter support and fixes unicode character
handling across all explode-implode variants (flat, hierarchical, semantic).

## Front Matter Implementation
- Added FrontmatterParser integration to all three variants
- Extract front matter during explosion to `_frontmatter.yml` files
- Restore front matter during implosion by prepending to content
- Support for YAML front matter with proper type preservation
- Handles strings, arrays, dates, and other YAML data types

## Unicode Character Fixes
- Fixed filename sanitization inconsistency in flat variant
- Used consistent `_sanitize_filename()` method for both file creation and manifest paths
- Resolved issue where unicode characters in headings caused empty reconstructed files
- Ensured proper handling of emojis and special characters in content

## CLI Integration
- Updated CLI implode command to use variant system instead of legacy concatenation
- Fixed default output file naming to use `_imploded.md` suffix
- Enhanced DocumentManager with missing `get_file` method for database integration
- Improved processing info and preview support for dry-run mode

## Test Coverage
- Reactivated `test_issue_149_roundtrip_validation.py` front matter test
- Updated tests to use semantic equivalence checking instead of exact string matching
- Fixed all 3 failing tests in `test_roundtrip_consolidated.py`
- All 10 roundtrip tests and 11 Issue #149 validation tests now pass

## Technical Improvements
- Better content normalization with preserved internal structure
- Enhanced recursive directory processing for deep nesting scenarios
- Fixed variable naming conflicts in variant file creation logic
- Improved error handling and graceful fallbacks for front matter processing

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-13 20:26:08 +02:00

554 lines
18 KiB
Python

"""
Roundtrip validation tests for Issue #149 - Explode-Implode Variants
Tests that all variants can successfully explode a markdown file and then
implode it back to produce equivalent content, ensuring full reversibility.
"""
import pytest
import tempfile
import re
from pathlib import Path
from typing import List, Dict, Any
from markitect.explode_variants import (
ExplodeVariant, ExplodeOptions, ImplodeOptions,
get_variant_factory, create_variant
)
class RoundtripValidator:
"""Helper class for validating explode-implode roundtrips."""
@staticmethod
def normalize_content(content: str) -> str:
"""
Normalize markdown content for comparison.
Removes excessive whitespace and normalizes line endings.
"""
# Normalize line endings
content = content.replace('\r\n', '\n').replace('\r', '\n')
# Remove excessive blank lines (more than 3 consecutive)
content = re.sub(r'\n{4,}', '\n\n\n', content)
# Strip leading/trailing whitespace
content = content.strip()
return content
@staticmethod
def extract_headings(content: str) -> List[Dict[str, Any]]:
"""Extract headings with their levels and titles for comparison."""
headings = []
lines = content.split('\n')
for i, line in enumerate(lines):
heading_match = re.match(r'^(#{1,6})\s+(.+)', line.strip())
if heading_match:
level = len(heading_match.group(1))
title = heading_match.group(2).strip()
headings.append({
'level': level,
'title': title,
'line': i + 1
})
return headings
@staticmethod
def validate_heading_structure(original_headings: List[Dict], reconstructed_headings: List[Dict]) -> bool:
"""Validate that heading structure is preserved."""
if len(original_headings) != len(reconstructed_headings):
return False
for orig, recon in zip(original_headings, reconstructed_headings):
if orig['level'] != recon['level'] or orig['title'] != recon['title']:
return False
return True
@staticmethod
def validate_content_preservation(original: str, reconstructed: str) -> Dict[str, Any]:
"""
Comprehensive validation of content preservation.
Returns validation results with details about any differences.
"""
orig_norm = RoundtripValidator.normalize_content(original)
recon_norm = RoundtripValidator.normalize_content(reconstructed)
orig_headings = RoundtripValidator.extract_headings(orig_norm)
recon_headings = RoundtripValidator.extract_headings(recon_norm)
return {
'exact_match': orig_norm == recon_norm,
'heading_structure_preserved': RoundtripValidator.validate_heading_structure(orig_headings, recon_headings),
'original_headings': orig_headings,
'reconstructed_headings': recon_headings,
'original_length': len(orig_norm),
'reconstructed_length': len(recon_norm),
'word_count_original': len(orig_norm.split()),
'word_count_reconstructed': len(recon_norm.split())
}
class TestRoundtripValidation:
"""Test roundtrip validation for all variants."""
@pytest.fixture
def sample_content_simple(self):
"""Simple test content."""
return """# Introduction
This is the introduction to the document.
## Overview
A brief overview of what's covered.
## Goals
- Goal 1
- Goal 2
- Goal 3
# Chapter 1: Getting Started
Let's begin with the basics.
## Installation
How to install the software.
## Configuration
Basic configuration steps.
# Chapter 2: Advanced Topics
More advanced material.
## Performance Optimization
Tips for better performance.
## Security Considerations
Important security notes.
# Conclusion
Final thoughts and summary.
"""
@pytest.fixture
def sample_content_complex(self):
"""Complex test content with various markdown features."""
return """---
title: "Comprehensive Guide"
author: "Test Author"
version: "1.0"
---
# Introduction
Welcome to this **comprehensive guide** with various markdown features.
## What You'll Learn
- Basic concepts
- Advanced techniques
- Best practices
### Prerequisites
You should have:
1. Basic knowledge
2. Required software
3. Access to examples
# Tutorial: Getting Started
This tutorial covers the fundamentals.
## Step 1: Installation
```bash
pip install example-package
```
### System Requirements
- Python 3.8+
- 4GB RAM minimum
- 10GB disk space
## Step 2: Configuration
Create a configuration file:
```yaml
settings:
debug: false
timeout: 30
```
# Reference Manual
Complete API documentation.
## Core Functions
### `initialize()`
Initializes the system.
**Parameters:**
- `config`: Configuration object
- `debug`: Enable debug mode
**Returns:**
- Boolean success status
### `process_data(data)`
Processes input data.
> **Note:** This function is asynchronous.
# Appendix A: Troubleshooting
Common issues and solutions.
## Error Messages
### "Connection Failed"
Check your network settings.
### "Invalid Configuration"
Verify your config file syntax.
# Appendix B: Examples
Code examples and snippets.
## Basic Usage
```python
import example
result = example.process("data")
```
# Conclusion
Thank you for reading this guide.
## Next Steps
1. Try the examples
2. Read the FAQ
3. Join the community
### Resources
- [Documentation](https://docs.example.com)
- [GitHub](https://github.com/example/repo)
- [Support](mailto:support@example.com)
"""
def test_flat_variant_roundtrip_simple(self, sample_content_simple):
"""Test flat variant roundtrip with simple content."""
self._test_variant_roundtrip(ExplodeVariant.FLAT, sample_content_simple)
def test_flat_variant_roundtrip_complex(self, sample_content_complex):
"""Test flat variant roundtrip with complex content."""
self._test_variant_roundtrip(ExplodeVariant.FLAT, sample_content_complex)
def test_hierarchical_variant_roundtrip_simple(self, sample_content_simple):
"""Test hierarchical variant roundtrip with simple content."""
self._test_variant_roundtrip(ExplodeVariant.HIERARCHICAL, sample_content_simple)
def test_hierarchical_variant_roundtrip_complex(self, sample_content_complex):
"""Test hierarchical variant roundtrip with complex content."""
self._test_variant_roundtrip(ExplodeVariant.HIERARCHICAL, sample_content_complex)
def test_semantic_variant_roundtrip_simple(self, sample_content_simple):
"""Test semantic variant roundtrip with simple content."""
self._test_variant_roundtrip(ExplodeVariant.SEMANTIC, sample_content_simple)
def test_semantic_variant_roundtrip_complex(self, sample_content_complex):
"""Test semantic variant roundtrip with complex content."""
self._test_variant_roundtrip(ExplodeVariant.SEMANTIC, sample_content_complex)
def _test_variant_roundtrip(self, variant_type: ExplodeVariant, content: str):
"""Generic roundtrip test for any variant."""
with tempfile.TemporaryDirectory() as temp_dir:
temp_path = Path(temp_dir)
# Step 1: Create original file
original_file = temp_path / f"test_{variant_type.value}.md"
original_file.write_text(content, encoding='utf-8')
# Step 2: Explode the file
variant = create_variant(variant_type)
explode_options = ExplodeOptions(
variant=variant_type,
output_dir=temp_path / f"exploded_{variant_type.value}",
create_manifest=True
)
explode_result = variant.explode(original_file, explode_options)
# Validate explosion was successful
assert explode_result.success, f"Explosion failed: {explode_result.errors}"
assert explode_result.output_directory.exists()
assert explode_result.manifest_path is not None
assert explode_result.manifest_path.exists()
assert len(explode_result.files_created) > 0
# Step 3: Implode the directory back
implode_options = ImplodeOptions(
output_file=temp_path / f"reconstructed_{variant_type.value}.md",
preserve_front_matter=True,
section_spacing=2
)
implode_result = variant.implode(explode_result.output_directory, implode_options)
# Validate implosion was successful
assert implode_result.success, f"Implosion failed: {implode_result.errors}"
assert implode_result.output_file.exists()
assert len(implode_result.files_processed) > 0
# Step 4: Compare original and reconstructed content
reconstructed_content = implode_result.output_file.read_text(encoding='utf-8')
validation = RoundtripValidator.validate_content_preservation(
content, reconstructed_content
)
# Assert key preservation requirements
assert validation['heading_structure_preserved'], \
f"Heading structure not preserved for {variant_type.value} variant"
# Allow for minor formatting differences but require structural integrity
# Note: Front matter and spacing differences can cause small word count variations
assert abs(validation['word_count_original'] - validation['word_count_reconstructed']) <= 15, \
f"Significant word count difference for {variant_type.value} variant"
def test_all_variants_produce_different_structures(self, sample_content_complex):
"""Test that different variants produce different directory structures."""
with tempfile.TemporaryDirectory() as temp_dir:
temp_path = Path(temp_dir)
original_file = temp_path / "test.md"
original_file.write_text(sample_content_complex, encoding='utf-8')
results = {}
# Explode using each variant
for variant_type in [ExplodeVariant.FLAT, ExplodeVariant.HIERARCHICAL, ExplodeVariant.SEMANTIC]:
variant = create_variant(variant_type)
options = ExplodeOptions(
variant=variant_type,
output_dir=temp_path / f"exploded_{variant_type.value}",
create_manifest=True
)
result = variant.explode(original_file, options)
assert result.success
# Analyze directory structure
subdirs = [d.name for d in result.output_directory.iterdir() if d.is_dir()]
results[variant_type] = {
'subdirs': subdirs,
'subdir_count': len(subdirs),
'files_created': len(result.files_created)
}
# Verify that variants produce different structures
flat_subdirs = set(results[ExplodeVariant.FLAT]['subdirs'])
hierarchical_subdirs = set(results[ExplodeVariant.HIERARCHICAL]['subdirs'])
semantic_subdirs = set(results[ExplodeVariant.SEMANTIC]['subdirs'])
# At least one variant should be different from the others
assert not (flat_subdirs == hierarchical_subdirs == semantic_subdirs), \
"All variants produced identical directory structures"
def test_manifest_enables_accurate_detection(self, sample_content_simple):
"""Test that manifests enable accurate variant detection during implosion."""
with tempfile.TemporaryDirectory() as temp_dir:
temp_path = Path(temp_dir)
original_file = temp_path / "test.md"
original_file.write_text(sample_content_simple, encoding='utf-8')
factory = get_variant_factory()
# Test each variant
for variant_type in [ExplodeVariant.FLAT, ExplodeVariant.HIERARCHICAL, ExplodeVariant.SEMANTIC]:
# Explode with manifest
variant = create_variant(variant_type)
explode_options = ExplodeOptions(
variant=variant_type,
output_dir=temp_path / f"test_{variant_type.value}",
create_manifest=True
)
explode_result = variant.explode(original_file, explode_options)
assert explode_result.success
# Detect variant from directory
detection_result = factory.detect_variant(explode_result.output_directory)
assert detection_result.variant == variant_type, \
f"Failed to detect {variant_type.value} variant from manifest"
assert detection_result.manifest_found, \
f"Manifest not found for {variant_type.value} variant"
def test_roundtrip_with_front_matter_preservation(self):
"""Test roundtrip with front matter preservation."""
content_with_fm = """---
title: "Test Document"
author: "Test Author"
tags: ["test", "markdown"]
published: 2023-01-01
---
# Main Content
This document has front matter.
## Section 1
Content here.
# Conclusion
End of document.
"""
with tempfile.TemporaryDirectory() as temp_dir:
temp_path = Path(temp_dir)
original_file = temp_path / "test_fm.md"
original_file.write_text(content_with_fm, encoding='utf-8')
# Test with flat variant (similar for others)
variant = create_variant(ExplodeVariant.FLAT)
explode_options = ExplodeOptions(
variant=ExplodeVariant.FLAT,
preserve_front_matter=True,
create_manifest=True
)
explode_result = variant.explode(original_file, explode_options)
assert explode_result.success
implode_options = ImplodeOptions(
preserve_front_matter=True
)
implode_result = variant.implode(explode_result.output_directory, implode_options)
assert implode_result.success
# Check that front matter is preserved using semantic equivalence
reconstructed_content = implode_result.output_file.read_text(encoding='utf-8')
# Use frontmatter parser to check semantic equivalence
from markitect.matter_frontmatter.parser import FrontmatterParser
parser = FrontmatterParser()
reconstructed_fm = parser.extract_frontmatter(reconstructed_content)
# Check that all expected values are preserved
assert reconstructed_fm.get('title') == 'Test Document'
assert reconstructed_fm.get('author') == 'Test Author'
assert reconstructed_fm.get('tags') == ['test', 'markdown']
# Published date may be parsed as datetime.date object
published = reconstructed_fm.get('published')
assert published is not None, "Published date should be preserved"
# Convert to string for comparison if it's a date object
published_str = str(published) if hasattr(published, 'strftime') else published
assert '2023-01-01' in str(published_str)
def test_roundtrip_error_handling(self):
"""Test roundtrip error handling with malformed content."""
with tempfile.TemporaryDirectory() as temp_dir:
temp_path = Path(temp_dir)
# Test with empty file
empty_file = temp_path / "empty.md"
empty_file.write_text("", encoding='utf-8')
variant = create_variant(ExplodeVariant.FLAT)
options = ExplodeOptions(variant=ExplodeVariant.FLAT)
result = variant.explode(empty_file, options)
# Should handle gracefully (may succeed with minimal structure)
assert isinstance(result.success, bool)
# Test with non-existent file
nonexistent_file = temp_path / "nonexistent.md"
result = variant.explode(nonexistent_file, options)
assert not result.success
assert len(result.errors) > 0
class TestRoundtripPerformance:
"""Test performance characteristics of roundtrip operations."""
def test_large_document_roundtrip(self):
"""Test roundtrip with a large document."""
# Generate large content
large_content = "# Introduction\n\nThis is a large document.\n\n"
for i in range(1, 21): # 20 chapters
large_content += f"# Chapter {i}\n\n"
large_content += f"This is chapter {i} content.\n\n"
for j in range(1, 6): # 5 sections per chapter
large_content += f"## Section {i}.{j}\n\n"
large_content += f"Content for section {i}.{j}.\n\n"
large_content += "Lorem ipsum dolor sit amet, consectetur adipiscing elit. " * 10
large_content += "\n\n"
large_content += "# Conclusion\n\nThe end of the document.\n"
with tempfile.TemporaryDirectory() as temp_dir:
temp_path = Path(temp_dir)
original_file = temp_path / "large_doc.md"
original_file.write_text(large_content, encoding='utf-8')
# Test with hierarchical variant (most complex)
variant = create_variant(ExplodeVariant.HIERARCHICAL)
explode_options = ExplodeOptions(
variant=ExplodeVariant.HIERARCHICAL,
create_manifest=True
)
explode_result = variant.explode(original_file, explode_options)
assert explode_result.success
implode_options = ImplodeOptions()
implode_result = variant.implode(explode_result.output_directory, implode_options)
assert implode_result.success
# Verify structure preservation
reconstructed_content = implode_result.output_file.read_text(encoding='utf-8')
validation = RoundtripValidator.validate_content_preservation(
large_content, reconstructed_content
)
assert validation['heading_structure_preserved']
if __name__ == '__main__':
pytest.main([__file__, "-v"])