This commit provides complete front matter support and fixes unicode character handling across all explode-implode variants (flat, hierarchical, semantic). ## Front Matter Implementation - Added FrontmatterParser integration to all three variants - Extract front matter during explosion to `_frontmatter.yml` files - Restore front matter during implosion by prepending to content - Support for YAML front matter with proper type preservation - Handles strings, arrays, dates, and other YAML data types ## Unicode Character Fixes - Fixed filename sanitization inconsistency in flat variant - Used consistent `_sanitize_filename()` method for both file creation and manifest paths - Resolved issue where unicode characters in headings caused empty reconstructed files - Ensured proper handling of emojis and special characters in content ## CLI Integration - Updated CLI implode command to use variant system instead of legacy concatenation - Fixed default output file naming to use `_imploded.md` suffix - Enhanced DocumentManager with missing `get_file` method for database integration - Improved processing info and preview support for dry-run mode ## Test Coverage - Reactivated `test_issue_149_roundtrip_validation.py` front matter test - Updated tests to use semantic equivalence checking instead of exact string matching - Fixed all 3 failing tests in `test_roundtrip_consolidated.py` - All 10 roundtrip tests and 11 Issue #149 validation tests now pass ## Technical Improvements - Better content normalization with preserved internal structure - Enhanced recursive directory processing for deep nesting scenarios - Fixed variable naming conflicts in variant file creation logic - Improved error handling and graceful fallbacks for front matter processing 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
554 lines
18 KiB
Python
554 lines
18 KiB
Python
"""
|
|
Roundtrip validation tests for Issue #149 - Explode-Implode Variants
|
|
|
|
Tests that all variants can successfully explode a markdown file and then
|
|
implode it back to produce equivalent content, ensuring full reversibility.
|
|
"""
|
|
|
|
import pytest
|
|
import tempfile
|
|
import re
|
|
from pathlib import Path
|
|
from typing import List, Dict, Any
|
|
|
|
from markitect.explode_variants import (
|
|
ExplodeVariant, ExplodeOptions, ImplodeOptions,
|
|
get_variant_factory, create_variant
|
|
)
|
|
|
|
|
|
class RoundtripValidator:
|
|
"""Helper class for validating explode-implode roundtrips."""
|
|
|
|
@staticmethod
|
|
def normalize_content(content: str) -> str:
|
|
"""
|
|
Normalize markdown content for comparison.
|
|
|
|
Removes excessive whitespace and normalizes line endings.
|
|
"""
|
|
# Normalize line endings
|
|
content = content.replace('\r\n', '\n').replace('\r', '\n')
|
|
|
|
# Remove excessive blank lines (more than 3 consecutive)
|
|
content = re.sub(r'\n{4,}', '\n\n\n', content)
|
|
|
|
# Strip leading/trailing whitespace
|
|
content = content.strip()
|
|
|
|
return content
|
|
|
|
@staticmethod
|
|
def extract_headings(content: str) -> List[Dict[str, Any]]:
|
|
"""Extract headings with their levels and titles for comparison."""
|
|
headings = []
|
|
lines = content.split('\n')
|
|
|
|
for i, line in enumerate(lines):
|
|
heading_match = re.match(r'^(#{1,6})\s+(.+)', line.strip())
|
|
if heading_match:
|
|
level = len(heading_match.group(1))
|
|
title = heading_match.group(2).strip()
|
|
headings.append({
|
|
'level': level,
|
|
'title': title,
|
|
'line': i + 1
|
|
})
|
|
|
|
return headings
|
|
|
|
@staticmethod
|
|
def validate_heading_structure(original_headings: List[Dict], reconstructed_headings: List[Dict]) -> bool:
|
|
"""Validate that heading structure is preserved."""
|
|
if len(original_headings) != len(reconstructed_headings):
|
|
return False
|
|
|
|
for orig, recon in zip(original_headings, reconstructed_headings):
|
|
if orig['level'] != recon['level'] or orig['title'] != recon['title']:
|
|
return False
|
|
|
|
return True
|
|
|
|
@staticmethod
|
|
def validate_content_preservation(original: str, reconstructed: str) -> Dict[str, Any]:
|
|
"""
|
|
Comprehensive validation of content preservation.
|
|
|
|
Returns validation results with details about any differences.
|
|
"""
|
|
orig_norm = RoundtripValidator.normalize_content(original)
|
|
recon_norm = RoundtripValidator.normalize_content(reconstructed)
|
|
|
|
orig_headings = RoundtripValidator.extract_headings(orig_norm)
|
|
recon_headings = RoundtripValidator.extract_headings(recon_norm)
|
|
|
|
return {
|
|
'exact_match': orig_norm == recon_norm,
|
|
'heading_structure_preserved': RoundtripValidator.validate_heading_structure(orig_headings, recon_headings),
|
|
'original_headings': orig_headings,
|
|
'reconstructed_headings': recon_headings,
|
|
'original_length': len(orig_norm),
|
|
'reconstructed_length': len(recon_norm),
|
|
'word_count_original': len(orig_norm.split()),
|
|
'word_count_reconstructed': len(recon_norm.split())
|
|
}
|
|
|
|
|
|
class TestRoundtripValidation:
|
|
"""Test roundtrip validation for all variants."""
|
|
|
|
@pytest.fixture
|
|
def sample_content_simple(self):
|
|
"""Simple test content."""
|
|
return """# Introduction
|
|
|
|
This is the introduction to the document.
|
|
|
|
## Overview
|
|
|
|
A brief overview of what's covered.
|
|
|
|
## Goals
|
|
|
|
- Goal 1
|
|
- Goal 2
|
|
- Goal 3
|
|
|
|
# Chapter 1: Getting Started
|
|
|
|
Let's begin with the basics.
|
|
|
|
## Installation
|
|
|
|
How to install the software.
|
|
|
|
## Configuration
|
|
|
|
Basic configuration steps.
|
|
|
|
# Chapter 2: Advanced Topics
|
|
|
|
More advanced material.
|
|
|
|
## Performance Optimization
|
|
|
|
Tips for better performance.
|
|
|
|
## Security Considerations
|
|
|
|
Important security notes.
|
|
|
|
# Conclusion
|
|
|
|
Final thoughts and summary.
|
|
"""
|
|
|
|
@pytest.fixture
|
|
def sample_content_complex(self):
|
|
"""Complex test content with various markdown features."""
|
|
return """---
|
|
title: "Comprehensive Guide"
|
|
author: "Test Author"
|
|
version: "1.0"
|
|
---
|
|
|
|
# Introduction
|
|
|
|
Welcome to this **comprehensive guide** with various markdown features.
|
|
|
|
## What You'll Learn
|
|
|
|
- Basic concepts
|
|
- Advanced techniques
|
|
- Best practices
|
|
|
|
### Prerequisites
|
|
|
|
You should have:
|
|
|
|
1. Basic knowledge
|
|
2. Required software
|
|
3. Access to examples
|
|
|
|
# Tutorial: Getting Started
|
|
|
|
This tutorial covers the fundamentals.
|
|
|
|
## Step 1: Installation
|
|
|
|
```bash
|
|
pip install example-package
|
|
```
|
|
|
|
### System Requirements
|
|
|
|
- Python 3.8+
|
|
- 4GB RAM minimum
|
|
- 10GB disk space
|
|
|
|
## Step 2: Configuration
|
|
|
|
Create a configuration file:
|
|
|
|
```yaml
|
|
settings:
|
|
debug: false
|
|
timeout: 30
|
|
```
|
|
|
|
# Reference Manual
|
|
|
|
Complete API documentation.
|
|
|
|
## Core Functions
|
|
|
|
### `initialize()`
|
|
|
|
Initializes the system.
|
|
|
|
**Parameters:**
|
|
- `config`: Configuration object
|
|
- `debug`: Enable debug mode
|
|
|
|
**Returns:**
|
|
- Boolean success status
|
|
|
|
### `process_data(data)`
|
|
|
|
Processes input data.
|
|
|
|
> **Note:** This function is asynchronous.
|
|
|
|
# Appendix A: Troubleshooting
|
|
|
|
Common issues and solutions.
|
|
|
|
## Error Messages
|
|
|
|
### "Connection Failed"
|
|
|
|
Check your network settings.
|
|
|
|
### "Invalid Configuration"
|
|
|
|
Verify your config file syntax.
|
|
|
|
# Appendix B: Examples
|
|
|
|
Code examples and snippets.
|
|
|
|
## Basic Usage
|
|
|
|
```python
|
|
import example
|
|
result = example.process("data")
|
|
```
|
|
|
|
# Conclusion
|
|
|
|
Thank you for reading this guide.
|
|
|
|
## Next Steps
|
|
|
|
1. Try the examples
|
|
2. Read the FAQ
|
|
3. Join the community
|
|
|
|
### Resources
|
|
|
|
- [Documentation](https://docs.example.com)
|
|
- [GitHub](https://github.com/example/repo)
|
|
- [Support](mailto:support@example.com)
|
|
"""
|
|
|
|
def test_flat_variant_roundtrip_simple(self, sample_content_simple):
|
|
"""Test flat variant roundtrip with simple content."""
|
|
self._test_variant_roundtrip(ExplodeVariant.FLAT, sample_content_simple)
|
|
|
|
def test_flat_variant_roundtrip_complex(self, sample_content_complex):
|
|
"""Test flat variant roundtrip with complex content."""
|
|
self._test_variant_roundtrip(ExplodeVariant.FLAT, sample_content_complex)
|
|
|
|
def test_hierarchical_variant_roundtrip_simple(self, sample_content_simple):
|
|
"""Test hierarchical variant roundtrip with simple content."""
|
|
self._test_variant_roundtrip(ExplodeVariant.HIERARCHICAL, sample_content_simple)
|
|
|
|
def test_hierarchical_variant_roundtrip_complex(self, sample_content_complex):
|
|
"""Test hierarchical variant roundtrip with complex content."""
|
|
self._test_variant_roundtrip(ExplodeVariant.HIERARCHICAL, sample_content_complex)
|
|
|
|
def test_semantic_variant_roundtrip_simple(self, sample_content_simple):
|
|
"""Test semantic variant roundtrip with simple content."""
|
|
self._test_variant_roundtrip(ExplodeVariant.SEMANTIC, sample_content_simple)
|
|
|
|
def test_semantic_variant_roundtrip_complex(self, sample_content_complex):
|
|
"""Test semantic variant roundtrip with complex content."""
|
|
self._test_variant_roundtrip(ExplodeVariant.SEMANTIC, sample_content_complex)
|
|
|
|
def _test_variant_roundtrip(self, variant_type: ExplodeVariant, content: str):
|
|
"""Generic roundtrip test for any variant."""
|
|
with tempfile.TemporaryDirectory() as temp_dir:
|
|
temp_path = Path(temp_dir)
|
|
|
|
# Step 1: Create original file
|
|
original_file = temp_path / f"test_{variant_type.value}.md"
|
|
original_file.write_text(content, encoding='utf-8')
|
|
|
|
# Step 2: Explode the file
|
|
variant = create_variant(variant_type)
|
|
explode_options = ExplodeOptions(
|
|
variant=variant_type,
|
|
output_dir=temp_path / f"exploded_{variant_type.value}",
|
|
create_manifest=True
|
|
)
|
|
|
|
explode_result = variant.explode(original_file, explode_options)
|
|
|
|
# Validate explosion was successful
|
|
assert explode_result.success, f"Explosion failed: {explode_result.errors}"
|
|
assert explode_result.output_directory.exists()
|
|
assert explode_result.manifest_path is not None
|
|
assert explode_result.manifest_path.exists()
|
|
assert len(explode_result.files_created) > 0
|
|
|
|
# Step 3: Implode the directory back
|
|
implode_options = ImplodeOptions(
|
|
output_file=temp_path / f"reconstructed_{variant_type.value}.md",
|
|
preserve_front_matter=True,
|
|
section_spacing=2
|
|
)
|
|
|
|
implode_result = variant.implode(explode_result.output_directory, implode_options)
|
|
|
|
# Validate implosion was successful
|
|
assert implode_result.success, f"Implosion failed: {implode_result.errors}"
|
|
assert implode_result.output_file.exists()
|
|
assert len(implode_result.files_processed) > 0
|
|
|
|
# Step 4: Compare original and reconstructed content
|
|
reconstructed_content = implode_result.output_file.read_text(encoding='utf-8')
|
|
|
|
validation = RoundtripValidator.validate_content_preservation(
|
|
content, reconstructed_content
|
|
)
|
|
|
|
# Assert key preservation requirements
|
|
assert validation['heading_structure_preserved'], \
|
|
f"Heading structure not preserved for {variant_type.value} variant"
|
|
|
|
# Allow for minor formatting differences but require structural integrity
|
|
# Note: Front matter and spacing differences can cause small word count variations
|
|
assert abs(validation['word_count_original'] - validation['word_count_reconstructed']) <= 15, \
|
|
f"Significant word count difference for {variant_type.value} variant"
|
|
|
|
def test_all_variants_produce_different_structures(self, sample_content_complex):
|
|
"""Test that different variants produce different directory structures."""
|
|
with tempfile.TemporaryDirectory() as temp_dir:
|
|
temp_path = Path(temp_dir)
|
|
|
|
original_file = temp_path / "test.md"
|
|
original_file.write_text(sample_content_complex, encoding='utf-8')
|
|
|
|
results = {}
|
|
|
|
# Explode using each variant
|
|
for variant_type in [ExplodeVariant.FLAT, ExplodeVariant.HIERARCHICAL, ExplodeVariant.SEMANTIC]:
|
|
variant = create_variant(variant_type)
|
|
options = ExplodeOptions(
|
|
variant=variant_type,
|
|
output_dir=temp_path / f"exploded_{variant_type.value}",
|
|
create_manifest=True
|
|
)
|
|
|
|
result = variant.explode(original_file, options)
|
|
assert result.success
|
|
|
|
# Analyze directory structure
|
|
subdirs = [d.name for d in result.output_directory.iterdir() if d.is_dir()]
|
|
results[variant_type] = {
|
|
'subdirs': subdirs,
|
|
'subdir_count': len(subdirs),
|
|
'files_created': len(result.files_created)
|
|
}
|
|
|
|
# Verify that variants produce different structures
|
|
flat_subdirs = set(results[ExplodeVariant.FLAT]['subdirs'])
|
|
hierarchical_subdirs = set(results[ExplodeVariant.HIERARCHICAL]['subdirs'])
|
|
semantic_subdirs = set(results[ExplodeVariant.SEMANTIC]['subdirs'])
|
|
|
|
# At least one variant should be different from the others
|
|
assert not (flat_subdirs == hierarchical_subdirs == semantic_subdirs), \
|
|
"All variants produced identical directory structures"
|
|
|
|
def test_manifest_enables_accurate_detection(self, sample_content_simple):
|
|
"""Test that manifests enable accurate variant detection during implosion."""
|
|
with tempfile.TemporaryDirectory() as temp_dir:
|
|
temp_path = Path(temp_dir)
|
|
|
|
original_file = temp_path / "test.md"
|
|
original_file.write_text(sample_content_simple, encoding='utf-8')
|
|
|
|
factory = get_variant_factory()
|
|
|
|
# Test each variant
|
|
for variant_type in [ExplodeVariant.FLAT, ExplodeVariant.HIERARCHICAL, ExplodeVariant.SEMANTIC]:
|
|
# Explode with manifest
|
|
variant = create_variant(variant_type)
|
|
explode_options = ExplodeOptions(
|
|
variant=variant_type,
|
|
output_dir=temp_path / f"test_{variant_type.value}",
|
|
create_manifest=True
|
|
)
|
|
|
|
explode_result = variant.explode(original_file, explode_options)
|
|
assert explode_result.success
|
|
|
|
# Detect variant from directory
|
|
detection_result = factory.detect_variant(explode_result.output_directory)
|
|
|
|
assert detection_result.variant == variant_type, \
|
|
f"Failed to detect {variant_type.value} variant from manifest"
|
|
assert detection_result.manifest_found, \
|
|
f"Manifest not found for {variant_type.value} variant"
|
|
|
|
def test_roundtrip_with_front_matter_preservation(self):
|
|
"""Test roundtrip with front matter preservation."""
|
|
content_with_fm = """---
|
|
title: "Test Document"
|
|
author: "Test Author"
|
|
tags: ["test", "markdown"]
|
|
published: 2023-01-01
|
|
---
|
|
|
|
# Main Content
|
|
|
|
This document has front matter.
|
|
|
|
## Section 1
|
|
|
|
Content here.
|
|
|
|
# Conclusion
|
|
|
|
End of document.
|
|
"""
|
|
|
|
with tempfile.TemporaryDirectory() as temp_dir:
|
|
temp_path = Path(temp_dir)
|
|
|
|
original_file = temp_path / "test_fm.md"
|
|
original_file.write_text(content_with_fm, encoding='utf-8')
|
|
|
|
# Test with flat variant (similar for others)
|
|
variant = create_variant(ExplodeVariant.FLAT)
|
|
|
|
explode_options = ExplodeOptions(
|
|
variant=ExplodeVariant.FLAT,
|
|
preserve_front_matter=True,
|
|
create_manifest=True
|
|
)
|
|
|
|
explode_result = variant.explode(original_file, explode_options)
|
|
assert explode_result.success
|
|
|
|
implode_options = ImplodeOptions(
|
|
preserve_front_matter=True
|
|
)
|
|
|
|
implode_result = variant.implode(explode_result.output_directory, implode_options)
|
|
assert implode_result.success
|
|
|
|
# Check that front matter is preserved using semantic equivalence
|
|
reconstructed_content = implode_result.output_file.read_text(encoding='utf-8')
|
|
|
|
# Use frontmatter parser to check semantic equivalence
|
|
from markitect.matter_frontmatter.parser import FrontmatterParser
|
|
parser = FrontmatterParser()
|
|
reconstructed_fm = parser.extract_frontmatter(reconstructed_content)
|
|
|
|
# Check that all expected values are preserved
|
|
assert reconstructed_fm.get('title') == 'Test Document'
|
|
assert reconstructed_fm.get('author') == 'Test Author'
|
|
assert reconstructed_fm.get('tags') == ['test', 'markdown']
|
|
# Published date may be parsed as datetime.date object
|
|
published = reconstructed_fm.get('published')
|
|
assert published is not None, "Published date should be preserved"
|
|
# Convert to string for comparison if it's a date object
|
|
published_str = str(published) if hasattr(published, 'strftime') else published
|
|
assert '2023-01-01' in str(published_str)
|
|
|
|
def test_roundtrip_error_handling(self):
|
|
"""Test roundtrip error handling with malformed content."""
|
|
with tempfile.TemporaryDirectory() as temp_dir:
|
|
temp_path = Path(temp_dir)
|
|
|
|
# Test with empty file
|
|
empty_file = temp_path / "empty.md"
|
|
empty_file.write_text("", encoding='utf-8')
|
|
|
|
variant = create_variant(ExplodeVariant.FLAT)
|
|
options = ExplodeOptions(variant=ExplodeVariant.FLAT)
|
|
|
|
result = variant.explode(empty_file, options)
|
|
# Should handle gracefully (may succeed with minimal structure)
|
|
assert isinstance(result.success, bool)
|
|
|
|
# Test with non-existent file
|
|
nonexistent_file = temp_path / "nonexistent.md"
|
|
result = variant.explode(nonexistent_file, options)
|
|
assert not result.success
|
|
assert len(result.errors) > 0
|
|
|
|
|
|
class TestRoundtripPerformance:
|
|
"""Test performance characteristics of roundtrip operations."""
|
|
|
|
def test_large_document_roundtrip(self):
|
|
"""Test roundtrip with a large document."""
|
|
# Generate large content
|
|
large_content = "# Introduction\n\nThis is a large document.\n\n"
|
|
|
|
for i in range(1, 21): # 20 chapters
|
|
large_content += f"# Chapter {i}\n\n"
|
|
large_content += f"This is chapter {i} content.\n\n"
|
|
|
|
for j in range(1, 6): # 5 sections per chapter
|
|
large_content += f"## Section {i}.{j}\n\n"
|
|
large_content += f"Content for section {i}.{j}.\n\n"
|
|
large_content += "Lorem ipsum dolor sit amet, consectetur adipiscing elit. " * 10
|
|
large_content += "\n\n"
|
|
|
|
large_content += "# Conclusion\n\nThe end of the document.\n"
|
|
|
|
with tempfile.TemporaryDirectory() as temp_dir:
|
|
temp_path = Path(temp_dir)
|
|
|
|
original_file = temp_path / "large_doc.md"
|
|
original_file.write_text(large_content, encoding='utf-8')
|
|
|
|
# Test with hierarchical variant (most complex)
|
|
variant = create_variant(ExplodeVariant.HIERARCHICAL)
|
|
|
|
explode_options = ExplodeOptions(
|
|
variant=ExplodeVariant.HIERARCHICAL,
|
|
create_manifest=True
|
|
)
|
|
|
|
explode_result = variant.explode(original_file, explode_options)
|
|
assert explode_result.success
|
|
|
|
implode_options = ImplodeOptions()
|
|
implode_result = variant.implode(explode_result.output_directory, implode_options)
|
|
assert implode_result.success
|
|
|
|
# Verify structure preservation
|
|
reconstructed_content = implode_result.output_file.read_text(encoding='utf-8')
|
|
validation = RoundtripValidator.validate_content_preservation(
|
|
large_content, reconstructed_content
|
|
)
|
|
|
|
assert validation['heading_structure_preserved']
|
|
|
|
|
|
if __name__ == '__main__':
|
|
pytest.main([__file__, "-v"]) |