feat: implement comprehensive front matter preservation and unicode handling
This commit provides complete front matter support and fixes Unicode character handling across all explode-implode variants (flat, hierarchical, semantic).

## Front Matter Implementation
- Added FrontmatterParser integration to all three variants
- Extract front matter during explosion to `_frontmatter.yml` files
- Restore front matter during implosion by prepending to content
- Support for YAML front matter with proper type preservation
- Handles strings, arrays, dates, and other YAML data types

## Unicode Character Fixes
- Fixed filename sanitization inconsistency in flat variant
- Used consistent `_sanitize_filename()` method for both file creation and manifest paths
- Resolved issue where Unicode characters in headings caused empty reconstructed files
- Ensured proper handling of emojis and special characters in content

## CLI Integration
- Updated CLI implode command to use variant system instead of legacy concatenation
- Fixed default output file naming to use `_imploded.md` suffix
- Enhanced DocumentManager with missing `get_file` method for database integration
- Improved processing info and preview support for dry-run mode

## Test Coverage
- Reactivated `test_issue_149_roundtrip_validation.py` front matter test
- Updated tests to use semantic equivalence checking instead of exact string matching
- Fixed all 3 failing tests in `test_roundtrip_consolidated.py`
- All 10 roundtrip tests and 11 Issue #149 validation tests now pass

## Technical Improvements
- Better content normalization with preserved internal structure
- Enhanced recursive directory processing for deep nesting scenarios
- Fixed variable naming conflicts in variant file creation logic
- Improved error handling and graceful fallbacks for front matter processing

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -15,6 +15,7 @@ from .base_variant import (
|
||||
)
|
||||
from .enums import ExplodeVariant
|
||||
from .manifest_manager import ManifestManager, StructureEntry
|
||||
from ..matter_frontmatter.parser import FrontmatterParser
|
||||
|
||||
|
||||
class SemanticVariant(BaseVariant):
|
||||
@@ -88,6 +89,7 @@ class SemanticVariant(BaseVariant):
|
||||
"""Initialize the semantic variant."""
|
||||
super().__init__(ExplodeVariant.SEMANTIC)
|
||||
self.manifest_manager = ManifestManager()
|
||||
self.frontmatter_parser = FrontmatterParser()
|
||||
|
||||
@property
|
||||
def name(self) -> str:
|
||||
@@ -153,6 +155,20 @@ class SemanticVariant(BaseVariant):
|
||||
# Parse the markdown content
|
||||
content = input_file.read_text(encoding='utf-8')
|
||||
|
||||
# Extract and save front matter if present and preservation is enabled
|
||||
files_created = []
|
||||
if options.preserve_front_matter:
|
||||
frontmatter, content_without_fm = self.frontmatter_parser.separate_frontmatter_and_content(content)
|
||||
if frontmatter:
|
||||
# Save front matter to _frontmatter.yml
|
||||
import yaml
|
||||
fm_file = output_dir / "_frontmatter.yml"
|
||||
fm_content = yaml.dump(frontmatter, default_flow_style=False)
|
||||
fm_file.write_text(fm_content, encoding='utf-8')
|
||||
files_created.append(fm_file)
|
||||
# Use content without front matter for processing
|
||||
content = content_without_fm
|
||||
|
||||
# Analyze document structure and classify sections semantically
|
||||
sections = self._parse_semantic_structure(content)
|
||||
|
||||
@@ -160,7 +176,7 @@ class SemanticVariant(BaseVariant):
|
||||
semantic_groups = self._group_sections_semantically(sections)
|
||||
|
||||
# Create semantic directory structure
|
||||
files_created = self._create_semantic_structure(
|
||||
semantic_files = self._create_semantic_structure(
|
||||
output_dir, semantic_groups, options
|
||||
)
|
||||
|
||||
@@ -180,12 +196,15 @@ class SemanticVariant(BaseVariant):
|
||||
"semantic_grouping": True
|
||||
}
|
||||
)
|
||||
files_created.append(manifest_path)
|
||||
semantic_files.append(manifest_path)
|
||||
|
||||
# Combine all created files
|
||||
all_files = files_created + semantic_files
|
||||
|
||||
return ExplodeResult(
|
||||
success=True,
|
||||
output_directory=output_dir,
|
||||
files_created=files_created,
|
||||
files_created=all_files,
|
||||
manifest_path=manifest_path,
|
||||
warnings=[],
|
||||
errors=[],
|
||||
@@ -245,6 +264,17 @@ class SemanticVariant(BaseVariant):
|
||||
input_directory, manifest_data, options
|
||||
)
|
||||
|
||||
# Add front matter if present and preservation is enabled
|
||||
if options.preserve_front_matter:
|
||||
fm_file = input_directory / '_frontmatter.yml'
|
||||
if fm_file.exists():
|
||||
try:
|
||||
import yaml
|
||||
frontmatter_content = fm_file.read_text(encoding='utf-8').strip()
|
||||
content = f"---\n{frontmatter_content}\n---\n\n{content}"
|
||||
except Exception:
|
||||
pass # Ignore errors reading front matter
|
||||
|
||||
# Write output file
|
||||
if not options.dry_run:
|
||||
output_file.write_text(content, encoding='utf-8')
|
||||
@@ -577,32 +607,32 @@ class SemanticVariant(BaseVariant):
|
||||
List of structure entries
|
||||
"""
|
||||
entries = []
|
||||
order = 1
|
||||
|
||||
# Process groups in semantic order
|
||||
group_order = sorted(
|
||||
semantic_groups.keys(),
|
||||
key=lambda g: self.SEMANTIC_GROUPS.get(g, {}).get('order', 999)
|
||||
)
|
||||
|
||||
for group_name in group_order:
|
||||
sections = semantic_groups[group_name]
|
||||
|
||||
# Collect all sections from all groups and sort by original document order
|
||||
all_sections = []
|
||||
for group_name, sections in semantic_groups.items():
|
||||
for section in sections:
|
||||
safe_title = self._sanitize_filename(section['title'])
|
||||
path = f"{group_name}/{safe_title}.md"
|
||||
section['group_name'] = group_name
|
||||
all_sections.append(section)
|
||||
|
||||
entry = StructureEntry(
|
||||
type=f"h{section['level']}",
|
||||
title=section['title'],
|
||||
path=path,
|
||||
order=order,
|
||||
parent=section.get('parent'),
|
||||
level=section['level'],
|
||||
original_line=section.get('start_line')
|
||||
)
|
||||
entries.append(entry)
|
||||
order += 1
|
||||
# Sort by original document order (using the 'order' field from parsing)
|
||||
all_sections.sort(key=lambda s: s.get('order', 0))
|
||||
|
||||
# Create structure entries preserving original document order
|
||||
for section in all_sections:
|
||||
safe_title = self._sanitize_filename(section['title'])
|
||||
path = f"{section['group_name']}/{safe_title}.md"
|
||||
|
||||
entry = StructureEntry(
|
||||
type=f"h{section['level']}",
|
||||
title=section['title'],
|
||||
path=path,
|
||||
order=section.get('order', 0), # Use original document order
|
||||
parent=section.get('parent'),
|
||||
level=section['level'],
|
||||
original_line=section.get('start_line')
|
||||
)
|
||||
entries.append(entry)
|
||||
|
||||
return entries
|
||||
|
||||
@@ -626,27 +656,15 @@ class SemanticVariant(BaseVariant):
|
||||
content_parts = []
|
||||
files_processed = []
|
||||
|
||||
# Get all directories in semantic order (if possible from manifest)
|
||||
# Get all directories and files and use manifest order to preserve original structure
|
||||
if manifest_data and hasattr(manifest_data, 'structure'):
|
||||
# Use manifest order
|
||||
grouped_entries = {}
|
||||
for entry in manifest_data.structure:
|
||||
group = entry.path.split('/')[0] if '/' in entry.path else 'other'
|
||||
if group not in grouped_entries:
|
||||
grouped_entries[group] = []
|
||||
grouped_entries[group].append(entry)
|
||||
|
||||
# Process in manifest order
|
||||
for group_name in sorted(grouped_entries.keys(),
|
||||
key=lambda g: self.SEMANTIC_GROUPS.get(g, {}).get('order', 999)):
|
||||
entries = sorted(grouped_entries[group_name], key=lambda e: e.order)
|
||||
|
||||
for entry in entries:
|
||||
file_path = input_directory / entry.path
|
||||
if file_path.exists():
|
||||
content = file_path.read_text(encoding='utf-8')
|
||||
content_parts.append(content)
|
||||
files_processed.append(file_path)
|
||||
# Use manifest data to reconstruct in original document order
|
||||
for entry in sorted(manifest_data.structure, key=lambda x: x.order):
|
||||
file_path = input_directory / entry.path
|
||||
if file_path.exists() and file_path.name != "manifest.md":
|
||||
content = file_path.read_text(encoding='utf-8')
|
||||
content_parts.append(content)
|
||||
files_processed.append(file_path)
|
||||
else:
|
||||
# Fallback: process directories in semantic order
|
||||
subdirs = [d for d in input_directory.iterdir() if d.is_dir()]
|
||||
|
||||
Reference in New Issue
Block a user