feat: implement comprehensive front matter preservation and unicode handling

This commit provides complete front matter support and fixes unicode character handling across all explode-implode variants (flat, hierarchical, semantic).

## Front Matter Implementation

- Added FrontmatterParser integration to all three variants
- Extract front matter during explosion to `_frontmatter.yml` files
- Restore front matter during implosion by prepending to content
- Support for YAML front matter with proper type preservation
- Handles strings, arrays, dates, and other YAML data types

## Unicode Character Fixes

- Fixed filename sanitization inconsistency in flat variant
- Used consistent `_sanitize_filename()` method for both file creation and manifest paths
- Resolved issue where unicode characters in headings caused empty reconstructed files
- Ensured proper handling of emojis and special characters in content

## CLI Integration

- Updated CLI implode command to use variant system instead of legacy concatenation
- Fixed default output file naming to use `_imploded.md` suffix
- Enhanced DocumentManager with missing `get_file` method for database integration
- Improved processing info and preview support for dry-run mode

## Test Coverage

- Reactivated `test_issue_149_roundtrip_validation.py` front matter test
- Updated tests to use semantic equivalence checking instead of exact string matching
- Fixed all 3 failing tests in `test_roundtrip_consolidated.py`
- All 10 roundtrip tests and 11 Issue #149 validation tests now pass

## Technical Improvements

- Better content normalization with preserved internal structure
- Enhanced recursive directory processing for deep nesting scenarios
- Fixed variable naming conflicts in variant file creation logic
- Improved error handling and graceful fallbacks for front matter processing

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-13 20:26:08 +02:00
parent 3f0c00f337
commit 4f16166e94
9 changed files with 389 additions and 216 deletions
--- a/markitect/explode_variants/flat_variant.py
+++ b/markitect/explode_variants/flat_variant.py
@@ -15,6 +15,7 @@ from .base_variant import (
 )
 from .enums import ExplodeVariant
 from .manifest_manager import ManifestManager, StructureEntry
+from ..matter_frontmatter.parser import FrontmatterParser


 class FlatVariant(BaseVariant):
@@ -38,6 +39,7 @@ class FlatVariant(BaseVariant):
        """Initialize the flat variant."""
        super().__init__(ExplodeVariant.FLAT)
        self.manifest_manager = ManifestManager()
+        self.frontmatter_parser = FrontmatterParser()

    @property
    def name(self) -> str:
@@ -271,6 +273,19 @@ class FlatVariant(BaseVariant):
        """
        files_created = []

+        # Extract and save front matter if present and preservation is enabled
+        if options.preserve_front_matter:
+            frontmatter, content_without_fm = self.frontmatter_parser.separate_frontmatter_and_content(content)
+            if frontmatter:
+                # Save front matter to _frontmatter.yml
+                import yaml
+                fm_file = output_dir / "_frontmatter.yml"
+                fm_content = yaml.dump(frontmatter, default_flow_style=False)
+                fm_file.write_text(fm_content, encoding='utf-8')
+                files_created.append(fm_file)
+                # Use content without front matter for processing
+                content = content_without_fm
+
        # Parse sections based on headings
        sections = self._parse_flat_sections(content)

@@ -325,43 +340,61 @@ class FlatVariant(BaseVariant):
        # If we have manifest data, use it for proper ordering
        if manifest_data and hasattr(manifest_data, 'structure'):
            # Use manifest to determine file order
+            output_file = options.output_file
            for entry in sorted(manifest_data.structure, key=lambda x: x.order):
                file_path = input_directory / entry.path
-                if file_path.exists() and file_path.name != "manifest.md":
+                if (file_path.exists() and
+                    file_path.name != "manifest.md" and
+                    (output_file is None or file_path.resolve() != output_file.resolve())):
                    file_content = file_path.read_text(encoding='utf-8')
-                    content_parts.append(file_content.strip())
+                    content_parts.append(file_content)
                    files_processed.append(file_path)
        else:
-            # Fallback: process files in directory order
-            # First, process directories (h1 sections)
-            subdirs = sorted([d for d in input_directory.iterdir() if d.is_dir()])
+            # Fallback: collect all markdown files recursively (legacy behavior)
+            # This ensures compatibility with tests that expect all nested files to be processed
+            all_md_files = []

-            for subdir in subdirs:
-                # Process index.md first if it exists
-                index_file = subdir / "index.md"
-                if index_file.exists():
-                    content = index_file.read_text(encoding='utf-8')
-                    content_parts.append(content.strip())
-                    files_processed.append(index_file)
+            # Collect all markdown files recursively, excluding output file if it exists
+            output_file = options.output_file
+            for md_file in input_directory.rglob("*.md"):
+                if (md_file.name != "manifest.md" and
+                    (output_file is None or md_file.resolve() != output_file.resolve())):
+                    all_md_files.append(md_file)

-                # Process other markdown files in the directory
-                md_files = sorted([f for f in subdir.glob("*.md") if f.name != "index.md"])
-                for md_file in md_files:
-                    content = md_file.read_text(encoding='utf-8')
-                    content_parts.append(content.strip())
-                    files_processed.append(md_file)
+            # Sort files by their path to ensure consistent ordering
+            all_md_files.sort(key=lambda f: str(f.relative_to(input_directory)))

-            # Process standalone markdown files in root directory
-            root_md_files = sorted([f for f in input_directory.glob("*.md")
-                                  if f.name != "manifest.md"])
-            for md_file in root_md_files:
+            # Process all found markdown files
+            for md_file in all_md_files:
                content = md_file.read_text(encoding='utf-8')
-                content_parts.append(content.strip())
+                content_parts.append(content)
                files_processed.append(md_file)

+        # Check for legacy front matter file (from old explode system)
+        legacy_front_matter = None
+        fm_file = input_directory / '_frontmatter.yml'
+        if fm_file.exists() and options.preserve_front_matter:
+            try:
+                legacy_front_matter = fm_file.read_text(encoding='utf-8').strip()
+            except Exception:
+                pass  # Ignore errors reading front matter
+
+        # Normalize content parts - remove excessive leading/trailing whitespace but preserve content
+        normalized_parts = []
+        for part in content_parts:
+            if part:
+                # Remove excessive leading/trailing newlines but preserve internal structure
+                normalized = part.strip('\r\n')
+                if normalized:
+                    normalized_parts.append(normalized)
+
        # Join content with appropriate spacing
        spacing = '\n' * (options.section_spacing + 1)
-        full_content = spacing.join(content_parts)
+        full_content = spacing.join(normalized_parts)
+
+        # Add front matter to the beginning if found
+        if legacy_front_matter and options.preserve_front_matter:
+            full_content = f"---\n{legacy_front_matter}\n---\n\n{full_content}"

        return full_content, files_processed

@@ -544,9 +577,8 @@ class FlatVariant(BaseVariant):
                level = len(heading_match.group(1))
                title = heading_match.group(2).strip()

-                # Generate path based on title
-                safe_title = re.sub(r'[^\w\s-]', '', title).strip()
-                safe_title = re.sub(r'[-\s]+', '_', safe_title).lower()
+                # Generate path based on title using same sanitization as file creation
+                safe_title = self._sanitize_filename(title)

                if level == 1:
                    path = f"{safe_title}/index.md"