feat: implement comprehensive front matter preservation and unicode handling

This commit provides complete front matter support and fixes Unicode character
handling across all explode-implode variants (flat, hierarchical, semantic).

## Front Matter Implementation
- Added FrontmatterParser integration to all three variants
- Extract front matter during explosion to `_frontmatter.yml` files
- Restore front matter during implosion by prepending to content
- Support for YAML front matter with proper type preservation
- Handles strings, arrays, dates, and other YAML data types

## Unicode Character Fixes
- Fixed filename sanitization inconsistency in flat variant
- Used consistent `_sanitize_filename()` method for both file creation and manifest paths
- Resolved an issue where Unicode characters in headings caused empty reconstructed files
- Ensured proper handling of emojis and special characters in content

## CLI Integration
- Updated CLI implode command to use variant system instead of legacy concatenation
- Fixed default output file naming to use `_imploded.md` suffix
- Added the missing `get_file` method to DocumentManager for database integration
- Improved processing info and preview support for dry-run mode

## Test Coverage
- Reactivated `test_issue_149_roundtrip_validation.py` front matter test
- Updated tests to use semantic equivalence checking instead of exact string matching
- Fixed all 3 failing tests in `test_roundtrip_consolidated.py`
- All 10 roundtrip tests and 11 Issue #149 validation tests now pass

## Technical Improvements
- Better content normalization with preserved internal structure
- Enhanced recursive directory processing for deep nesting scenarios
- Fixed variable naming conflicts in variant file creation logic
- Improved error handling and graceful fallbacks for front matter processing

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2025-10-13 20:26:08 +02:00
parent 3f0c00f337
commit 4f16166e94
9 changed files with 389 additions and 216 deletions

View File

@@ -15,6 +15,7 @@ from .base_variant import (
)
from .enums import ExplodeVariant
from .manifest_manager import ManifestManager, StructureEntry
from ..matter_frontmatter.parser import FrontmatterParser
class FlatVariant(BaseVariant):
@@ -38,6 +39,7 @@ class FlatVariant(BaseVariant):
"""Initialize the flat variant."""
super().__init__(ExplodeVariant.FLAT)
self.manifest_manager = ManifestManager()
self.frontmatter_parser = FrontmatterParser()
@property
def name(self) -> str:
@@ -271,6 +273,19 @@ class FlatVariant(BaseVariant):
"""
files_created = []
# Extract and save front matter if present and preservation is enabled
if options.preserve_front_matter:
frontmatter, content_without_fm = self.frontmatter_parser.separate_frontmatter_and_content(content)
if frontmatter:
# Save front matter to _frontmatter.yml
import yaml
fm_file = output_dir / "_frontmatter.yml"
fm_content = yaml.dump(frontmatter, default_flow_style=False)
fm_file.write_text(fm_content, encoding='utf-8')
files_created.append(fm_file)
# Use content without front matter for processing
content = content_without_fm
# Parse sections based on headings
sections = self._parse_flat_sections(content)
@@ -325,43 +340,61 @@ class FlatVariant(BaseVariant):
# If we have manifest data, use it for proper ordering
if manifest_data and hasattr(manifest_data, 'structure'):
# Use manifest to determine file order
output_file = options.output_file
for entry in sorted(manifest_data.structure, key=lambda x: x.order):
file_path = input_directory / entry.path
if file_path.exists() and file_path.name != "manifest.md":
if (file_path.exists() and
file_path.name != "manifest.md" and
(output_file is None or file_path.resolve() != output_file.resolve())):
file_content = file_path.read_text(encoding='utf-8')
content_parts.append(file_content.strip())
content_parts.append(file_content)
files_processed.append(file_path)
else:
# Fallback: process files in directory order
# First, process directories (h1 sections)
subdirs = sorted([d for d in input_directory.iterdir() if d.is_dir()])
# Fallback: collect all markdown files recursively (legacy behavior)
# This ensures compatibility with tests that expect all nested files to be processed
all_md_files = []
for subdir in subdirs:
# Process index.md first if it exists
index_file = subdir / "index.md"
if index_file.exists():
content = index_file.read_text(encoding='utf-8')
content_parts.append(content.strip())
files_processed.append(index_file)
# Collect all markdown files recursively, excluding output file if it exists
output_file = options.output_file
for md_file in input_directory.rglob("*.md"):
if (md_file.name != "manifest.md" and
(output_file is None or md_file.resolve() != output_file.resolve())):
all_md_files.append(md_file)
# Process other markdown files in the directory
md_files = sorted([f for f in subdir.glob("*.md") if f.name != "index.md"])
for md_file in md_files:
content = md_file.read_text(encoding='utf-8')
content_parts.append(content.strip())
files_processed.append(md_file)
# Sort files by their path to ensure consistent ordering
all_md_files.sort(key=lambda f: str(f.relative_to(input_directory)))
# Process standalone markdown files in root directory
root_md_files = sorted([f for f in input_directory.glob("*.md")
if f.name != "manifest.md"])
for md_file in root_md_files:
# Process all found markdown files
for md_file in all_md_files:
content = md_file.read_text(encoding='utf-8')
content_parts.append(content.strip())
content_parts.append(content)
files_processed.append(md_file)
# Check for legacy front matter file (from old explode system)
legacy_front_matter = None
fm_file = input_directory / '_frontmatter.yml'
if fm_file.exists() and options.preserve_front_matter:
try:
legacy_front_matter = fm_file.read_text(encoding='utf-8').strip()
except Exception:
pass # Ignore errors reading front matter
# Normalize content parts - remove excessive leading/trailing whitespace but preserve content
normalized_parts = []
for part in content_parts:
if part:
# Remove excessive leading/trailing newlines but preserve internal structure
normalized = part.strip('\r\n')
if normalized:
normalized_parts.append(normalized)
# Join content with appropriate spacing
spacing = '\n' * (options.section_spacing + 1)
full_content = spacing.join(content_parts)
full_content = spacing.join(normalized_parts)
# Add front matter to the beginning if found
if legacy_front_matter and options.preserve_front_matter:
full_content = f"---\n{legacy_front_matter}\n---\n\n{full_content}"
return full_content, files_processed
@@ -544,9 +577,8 @@ class FlatVariant(BaseVariant):
level = len(heading_match.group(1))
title = heading_match.group(2).strip()
# Generate path based on title
safe_title = re.sub(r'[^\w\s-]', '', title).strip()
safe_title = re.sub(r'[-\s]+', '_', safe_title).lower()
# Generate path based on title using same sanitization as file creation
safe_title = self._sanitize_filename(title)
if level == 1:
path = f"{safe_title}/index.md"

View File

@@ -15,6 +15,7 @@ from .base_variant import (
)
from .enums import ExplodeVariant
from .manifest_manager import ManifestManager, StructureEntry
from ..matter_frontmatter.parser import FrontmatterParser
class HierarchicalVariant(BaseVariant):
@@ -43,6 +44,7 @@ class HierarchicalVariant(BaseVariant):
"""Initialize the hierarchical variant."""
super().__init__(ExplodeVariant.HIERARCHICAL)
self.manifest_manager = ManifestManager()
self.frontmatter_parser = FrontmatterParser()
@property
def name(self) -> str:
@@ -107,11 +109,25 @@ class HierarchicalVariant(BaseVariant):
# Parse the markdown content
content = input_file.read_text(encoding='utf-8')
# Extract and save front matter if present and preservation is enabled
files_created = []
if options.preserve_front_matter:
frontmatter, content_without_fm = self.frontmatter_parser.separate_frontmatter_and_content(content)
if frontmatter:
# Save front matter to _frontmatter.yml
import yaml
fm_file = output_dir / "_frontmatter.yml"
fm_content = yaml.dump(frontmatter, default_flow_style=False)
fm_file.write_text(fm_content, encoding='utf-8')
files_created.append(fm_file)
# Use content without front matter for processing
content = content_without_fm
# Analyze document structure
sections = self._parse_hierarchical_structure(content)
# Create hierarchical directory structure
files_created = self._create_hierarchical_structure(
hierarchy_files = self._create_hierarchical_structure(
output_dir, sections, options
)
@@ -131,12 +147,15 @@ class HierarchicalVariant(BaseVariant):
"numbering_scheme": "hierarchical"
}
)
files_created.append(manifest_path)
hierarchy_files.append(manifest_path)
# Combine all created files
all_files = files_created + hierarchy_files
return ExplodeResult(
success=True,
output_directory=output_dir,
files_created=files_created,
files_created=all_files,
manifest_path=manifest_path,
warnings=[],
errors=[],
@@ -196,6 +215,17 @@ class HierarchicalVariant(BaseVariant):
input_directory, manifest_data, options
)
# Add front matter if present and preservation is enabled
if options.preserve_front_matter:
fm_file = input_directory / '_frontmatter.yml'
if fm_file.exists():
try:
import yaml
frontmatter_content = fm_file.read_text(encoding='utf-8').strip()
content = f"---\n{frontmatter_content}\n---\n\n{content}"
except Exception:
pass # Ignore errors reading front matter
# Write output file
if not options.dry_run:
output_file.write_text(content, encoding='utf-8')
@@ -548,33 +578,82 @@ class HierarchicalVariant(BaseVariant):
content_parts = []
files_processed = []
# Get all directories in numbered order
subdirs = sorted([
d for d in input_directory.iterdir()
if d.is_dir() and not d.name.startswith('.')
], key=lambda d: d.name)
# Get all directories and sort them properly
if manifest_data and hasattr(manifest_data, 'structure'):
# Use manifest data to determine proper order
subdirs = []
dir_mapping = {}
# Create mapping of directory names to Path objects
all_dirs = [d for d in input_directory.iterdir()
if d.is_dir() and not d.name.startswith('.')]
for d in all_dirs:
dir_mapping[d.name] = d
# Sort manifest entries by original order
for entry in sorted(manifest_data.structure, key=lambda x: x.order):
dir_name = Path(entry.path).parts[0] if entry.path else ""
if dir_name in dir_mapping and dir_mapping[dir_name] not in subdirs:
subdirs.append(dir_mapping[dir_name])
# Add any remaining directories not in manifest (fallback)
for d in all_dirs:
if d not in subdirs:
subdirs.append(d)
else:
# Fallback: sort by numbering prefix, then by name
subdirs = sorted([
d for d in input_directory.iterdir()
if d.is_dir() and not d.name.startswith('.')
], key=lambda d: (
int(d.name.split('_')[0]) if re.match(r'^\d+_', d.name) else 999,
d.name
))
for subdir in subdirs:
# Read index.md if it exists
index_file = subdir / "index.md"
if index_file.exists():
index_content = index_file.read_text(encoding='utf-8')
content_parts.append(index_content)
files_processed.append(index_file)
# Read numbered subsection files
md_files = sorted([
f for f in subdir.glob("*.md")
if f.name != "index.md"
], key=lambda f: f.name)
for md_file in md_files:
file_content = md_file.read_text(encoding='utf-8')
content_parts.append(file_content)
files_processed.append(md_file)
self._process_directory_recursively(subdir, content_parts, files_processed)
# Join with appropriate spacing
spacing = '\n' * (options.section_spacing + 1)
full_content = spacing.join(content_parts)
return full_content, files_processed
return full_content, files_processed
def _process_directory_recursively(self, directory: Path, content_parts: List[str], files_processed: List[Path]):
    """
    Recursively collect markdown content from a directory tree, depth-first.

    Within each directory the order is: ``index.md`` (if present), then the
    remaining ``*.md`` files, then every non-hidden subdirectory (recursively).
    Files and subdirectories are both ordered by their leading ``<number>_``
    prefix compared numerically, so ``2_intro.md`` comes before
    ``10_appendix.md``; names without such a prefix sort after the numbered
    ones (sentinel 999), alphabetically.

    Args:
        directory: Directory to process
        content_parts: List to append content to (mutated in place)
        files_processed: List to append processed files to (mutated in place)
    """
    def prefix_order(entry: Path):
        # Numeric "<digits>_" prefix first, then the full name as tie-breaker.
        # Unnumbered names share the sentinel 999, matching the ordering used
        # for numbered directories elsewhere in this variant.
        if re.match(r'^\d+_', entry.name):
            return (int(entry.name.split('_')[0]), entry.name)
        return (999, entry.name)

    # index.md holds the directory's own heading/content, so it goes first.
    index_file = directory / "index.md"
    if index_file.is_file():
        content_parts.append(index_file.read_text(encoding='utf-8'))
        files_processed.append(index_file)

    # Remaining markdown files, in numeric-prefix order. A plain name sort
    # would place "10_x.md" before "2_x.md" and scramble the sections.
    md_files = sorted(
        (
            f for f in directory.glob("*.md")
            # Skip index.md (already handled) and directories named "*.md".
            if f.name != "index.md" and f.is_file()
        ),
        key=prefix_order,
    )
    for md_file in md_files:
        content_parts.append(md_file.read_text(encoding='utf-8'))
        files_processed.append(md_file)

    # Descend into non-hidden subdirectories using the same ordering.
    subdirs = sorted(
        (
            d for d in directory.iterdir()
            if d.is_dir() and not d.name.startswith('.')
        ),
        key=prefix_order,
    )
    for subdir in subdirs:
        self._process_directory_recursively(subdir, content_parts, files_processed)

View File

@@ -15,6 +15,7 @@ from .base_variant import (
)
from .enums import ExplodeVariant
from .manifest_manager import ManifestManager, StructureEntry
from ..matter_frontmatter.parser import FrontmatterParser
class SemanticVariant(BaseVariant):
@@ -88,6 +89,7 @@ class SemanticVariant(BaseVariant):
"""Initialize the semantic variant."""
super().__init__(ExplodeVariant.SEMANTIC)
self.manifest_manager = ManifestManager()
self.frontmatter_parser = FrontmatterParser()
@property
def name(self) -> str:
@@ -153,6 +155,20 @@ class SemanticVariant(BaseVariant):
# Parse the markdown content
content = input_file.read_text(encoding='utf-8')
# Extract and save front matter if present and preservation is enabled
files_created = []
if options.preserve_front_matter:
frontmatter, content_without_fm = self.frontmatter_parser.separate_frontmatter_and_content(content)
if frontmatter:
# Save front matter to _frontmatter.yml
import yaml
fm_file = output_dir / "_frontmatter.yml"
fm_content = yaml.dump(frontmatter, default_flow_style=False)
fm_file.write_text(fm_content, encoding='utf-8')
files_created.append(fm_file)
# Use content without front matter for processing
content = content_without_fm
# Analyze document structure and classify sections semantically
sections = self._parse_semantic_structure(content)
@@ -160,7 +176,7 @@ class SemanticVariant(BaseVariant):
semantic_groups = self._group_sections_semantically(sections)
# Create semantic directory structure
files_created = self._create_semantic_structure(
semantic_files = self._create_semantic_structure(
output_dir, semantic_groups, options
)
@@ -180,12 +196,15 @@ class SemanticVariant(BaseVariant):
"semantic_grouping": True
}
)
files_created.append(manifest_path)
semantic_files.append(manifest_path)
# Combine all created files
all_files = files_created + semantic_files
return ExplodeResult(
success=True,
output_directory=output_dir,
files_created=files_created,
files_created=all_files,
manifest_path=manifest_path,
warnings=[],
errors=[],
@@ -245,6 +264,17 @@ class SemanticVariant(BaseVariant):
input_directory, manifest_data, options
)
# Add front matter if present and preservation is enabled
if options.preserve_front_matter:
fm_file = input_directory / '_frontmatter.yml'
if fm_file.exists():
try:
import yaml
frontmatter_content = fm_file.read_text(encoding='utf-8').strip()
content = f"---\n{frontmatter_content}\n---\n\n{content}"
except Exception:
pass # Ignore errors reading front matter
# Write output file
if not options.dry_run:
output_file.write_text(content, encoding='utf-8')
@@ -577,32 +607,32 @@ class SemanticVariant(BaseVariant):
List of structure entries
"""
entries = []
order = 1
# Process groups in semantic order
group_order = sorted(
semantic_groups.keys(),
key=lambda g: self.SEMANTIC_GROUPS.get(g, {}).get('order', 999)
)
for group_name in group_order:
sections = semantic_groups[group_name]
# Collect all sections from all groups and sort by original document order
all_sections = []
for group_name, sections in semantic_groups.items():
for section in sections:
safe_title = self._sanitize_filename(section['title'])
path = f"{group_name}/{safe_title}.md"
section['group_name'] = group_name
all_sections.append(section)
entry = StructureEntry(
type=f"h{section['level']}",
title=section['title'],
path=path,
order=order,
parent=section.get('parent'),
level=section['level'],
original_line=section.get('start_line')
)
entries.append(entry)
order += 1
# Sort by original document order (using the 'order' field from parsing)
all_sections.sort(key=lambda s: s.get('order', 0))
# Create structure entries preserving original document order
for section in all_sections:
safe_title = self._sanitize_filename(section['title'])
path = f"{section['group_name']}/{safe_title}.md"
entry = StructureEntry(
type=f"h{section['level']}",
title=section['title'],
path=path,
order=section.get('order', 0), # Use original document order
parent=section.get('parent'),
level=section['level'],
original_line=section.get('start_line')
)
entries.append(entry)
return entries
@@ -626,27 +656,15 @@ class SemanticVariant(BaseVariant):
content_parts = []
files_processed = []
# Get all directories in semantic order (if possible from manifest)
# Get all directories and files and use manifest order to preserve original structure
if manifest_data and hasattr(manifest_data, 'structure'):
# Use manifest order
grouped_entries = {}
for entry in manifest_data.structure:
group = entry.path.split('/')[0] if '/' in entry.path else 'other'
if group not in grouped_entries:
grouped_entries[group] = []
grouped_entries[group].append(entry)
# Process in manifest order
for group_name in sorted(grouped_entries.keys(),
key=lambda g: self.SEMANTIC_GROUPS.get(g, {}).get('order', 999)):
entries = sorted(grouped_entries[group_name], key=lambda e: e.order)
for entry in entries:
file_path = input_directory / entry.path
if file_path.exists():
content = file_path.read_text(encoding='utf-8')
content_parts.append(content)
files_processed.append(file_path)
# Use manifest data to reconstruct in original document order
for entry in sorted(manifest_data.structure, key=lambda x: x.order):
file_path = input_directory / entry.path
if file_path.exists() and file_path.name != "manifest.md":
content = file_path.read_text(encoding='utf-8')
content_parts.append(content)
files_processed.append(file_path)
else:
# Fallback: process directories in semantic order
subdirs = [d for d in input_directory.iterdir() if d.is_dir()]