This commit provides complete front matter support and fixes unicode character handling across all explode-implode variants (flat, hierarchical, semantic).

## Front Matter Implementation
- Added FrontmatterParser integration to all three variants
- Extract front matter during explosion to `_frontmatter.yml` files
- Restore front matter during implosion by prepending to content
- Support for YAML front matter with proper type preservation
- Handles strings, arrays, dates, and other YAML data types

## Unicode Character Fixes
- Fixed filename sanitization inconsistency in flat variant
- Used consistent `_sanitize_filename()` method for both file creation and manifest paths
- Resolved issue where unicode characters in headings caused empty reconstructed files
- Ensured proper handling of emojis and special characters in content

## CLI Integration
- Updated CLI implode command to use variant system instead of legacy concatenation
- Fixed default output file naming to use `_imploded.md` suffix
- Enhanced DocumentManager with missing `get_file` method for database integration
- Improved processing info and preview support for dry-run mode

## Test Coverage
- Reactivated `test_issue_149_roundtrip_validation.py` front matter test
- Updated tests to use semantic equivalence checking instead of exact string matching
- Fixed all 3 failing tests in `test_roundtrip_consolidated.py`
- All 10 roundtrip tests and 11 Issue #149 validation tests now pass

## Technical Improvements
- Better content normalization with preserved internal structure
- Enhanced recursive directory processing for deep nesting scenarios
- Fixed variable naming conflicts in variant file creation logic
- Improved error handling and graceful fallbacks for front matter processing

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
598 lines
22 KiB
Python
598 lines
22 KiB
Python
"""
Flat variant implementation for explode-implode operations.

This variant represents the current default behavior where h1 headings
become top-level directories with content organized beneath them.
"""

import logging
import re
from pathlib import Path
from typing import Dict, List, Any, Optional

from .base_variant import (
    BaseVariant, ExplodeOptions, ImplodeOptions,
    ExplodeResult, ImplodeResult
)
from .enums import ExplodeVariant
from .manifest_manager import ManifestManager, StructureEntry
from ..matter_frontmatter.parser import FrontmatterParser


class FlatVariant(BaseVariant):
    """
    Flat variant implementation.

    Creates directories based on h1 headings with content organized beneath
    the output directory. This is the current default behavior for backward
    compatibility.

    The document is split at every heading (headings inside fenced code
    blocks are ignored). Each h1 section is written to ``<title>/index.md``
    and every other section to a top-level ``<title>.md`` file. Titles that
    sanitize to an already-used name receive a numeric suffix (``_2``,
    ``_3``, ...) so no section overwrites another.

    Structure example:
        book.mdd/
        ├── manifest.md
        ├── _frontmatter.yml
        ├── book_title/
        │   └── index.md
        ├── chapter_1.md
        ├── chapter_2.md
        └── conclusion.md

    Known limitation: text that appears before the first heading is not
    written to any file (see ``_parse_flat_sections``).
    """

    # ATX heading: 1-6 '#' characters, whitespace, then the title text.
    _HEADING_RE = re.compile(r'^(#{1,6})\s+(.+)')
    # Fenced code block delimiter: up to 3 spaces, then >=3 backticks or tildes.
    _FENCE_RE = re.compile(r'^ {0,3}(`{3,}|~{3,})')
    _MANIFEST_NAME = "manifest.md"
    _FRONTMATTER_NAME = "_frontmatter.yml"
    # A directory counts as "flat" when fewer than this share of its
    # sub-directories look numbered or semantic.
    _MAX_PATTERN_RATIO = 0.3

    def __init__(self):
        """Initialize the flat variant with its manifest and front matter helpers."""
        super().__init__(ExplodeVariant.FLAT)
        self.manifest_manager = ManifestManager()
        self.frontmatter_parser = FrontmatterParser()

    @property
    def name(self) -> str:
        """Human-readable name of the variant."""
        return "Flat Structure"

    @property
    def description(self) -> str:
        """Description of the variant's behavior."""
        return ("Creates directories based on h1 headings with content organized beneath them. "
                "This is the default structure for backward compatibility.")

    # ------------------------------------------------------------------
    # Public operations
    # ------------------------------------------------------------------

    def explode(
        self,
        input_file: Path,
        options: ExplodeOptions
    ) -> ExplodeResult:
        """
        Explode a markdown file using the flat structure variant.

        Args:
            input_file: Path to the markdown file to explode
            options: Options controlling the explode operation

        Returns:
            Result of the explode operation. Failures (validation errors,
            directory creation errors, or any exception raised while writing)
            are reported through ``success=False`` and ``errors`` rather than
            raised.
        """
        validation_errors = self.validate_input_file(input_file)
        if validation_errors:
            return self._explode_failure(options.output_dir or Path(), validation_errors)

        # Determine output directory
        if options.output_dir:
            output_dir = options.output_dir
        else:
            suffix = ".mdd" if options.create_manifest else "_exploded"
            output_dir = input_file.parent / f"{input_file.stem}{suffix}"

        creation_errors = self.create_output_directory(output_dir, overwrite=True)
        if creation_errors:
            return self._explode_failure(output_dir, creation_errors)

        try:
            content = input_file.read_text(encoding='utf-8')

            files_created = self._explode_flat_structure(
                input_file, output_dir, content, options
            )

            manifest_path = None
            if options.create_manifest:
                # Analyse exactly the text that was split into files. Running
                # the analysis on the raw content would turn '# ...' lines in
                # the front matter into phantom manifest entries and shift
                # the collision suffixes away from the files on disk.
                _, body = self._split_front_matter(content, options)
                structure = self._analyze_structure(
                    body, output_dir,
                    line_offset=self._line_offset(content, body)
                )
                manifest_path = self.manifest_manager.create_manifest(
                    output_dir=output_dir,
                    original_file=input_file,
                    variant=self.variant_type,
                    structure=structure,
                    preservation_options={
                        "front_matter": options.preserve_front_matter,
                        "section_order": True,
                        "heading_levels": True
                    }
                )
                files_created.append(manifest_path)

            return ExplodeResult(
                success=True,
                output_directory=output_dir,
                files_created=files_created,
                manifest_path=manifest_path,
                warnings=[],
                errors=[],
                variant_used=self.variant_type
            )

        except Exception as e:
            # Top-level boundary: convert any failure into a result object.
            return self._explode_failure(output_dir, [f"Error during explosion: {e}"])

    def implode(
        self,
        input_directory: Path,
        options: ImplodeOptions
    ) -> ImplodeResult:
        """
        Implode a directory structure back into a markdown file.

        Args:
            input_directory: Path to the directory to implode
            options: Options controlling the implode operation

        Returns:
            Result of the implode operation. Nothing is written when
            ``options.dry_run`` is set. Failures are reported through
            ``success=False`` and ``errors`` rather than raised.
        """
        validation_errors = self.validate_input_directory(input_directory)
        if validation_errors:
            return self._implode_failure(options.output_file or Path(), validation_errors)

        # Determine output file
        if options.output_file:
            output_file = options.output_file
        else:
            output_file = input_directory.parent / f"{input_directory.name}_imploded.md"

        try:
            manifest_data = self.manifest_manager.read_manifest(input_directory)

            content, files_processed = self._implode_flat_structure(
                input_directory, manifest_data, options
            )

            if not options.dry_run:
                output_file.write_text(content, encoding='utf-8')

            return ImplodeResult(
                success=True,
                output_file=output_file,
                files_processed=files_processed,
                variant_detected=self.variant_type,
                warnings=[],
                errors=[]
            )

        except Exception as e:
            # Top-level boundary: convert any failure into a result object.
            return self._implode_failure(output_file, [f"Error during implosion: {e}"])

    def can_handle_directory(self, directory: Path) -> bool:
        """
        Check if this variant can handle the given directory structure.

        A manifest whose ``explosion_type`` is ``"flat"`` is accepted
        outright. Otherwise the directory is accepted when fewer than 30% of
        its immediate sub-directories have a numeric prefix (``12_name``) and
        fewer than 30% have a semantic grouping name (parts, chapters,
        sections, appendices). A directory with no sub-directories is
        accepted.

        Args:
            directory: Path to the directory to check

        Returns:
            True if this variant can handle the directory
        """
        if not directory.exists() or not directory.is_dir():
            return False

        manifest_data = self.manifest_manager.read_manifest(directory)
        if manifest_data and manifest_data.explosion_type == "flat":
            return True

        subdirs = [d for d in directory.iterdir() if d.is_dir()]
        if not subdirs:
            return True

        semantic_names = ('parts', 'chapters', 'sections', 'appendices')
        numbered_dirs = sum(1 for d in subdirs if re.match(r'^\d+_', d.name))
        semantic_dirs = sum(
            1 for d in subdirs
            if any(name in d.name.lower() for name in semantic_names)
        )

        total = len(subdirs)
        return (numbered_dirs / total < self._MAX_PATTERN_RATIO
                and semantic_dirs / total < self._MAX_PATTERN_RATIO)

    def get_detection_patterns(self) -> Dict[str, Any]:
        """
        Get patterns used for auto-detecting this variant.

        Returns:
            Dictionary of detection patterns and weights
        """
        return {
            "manifest_type": "flat",
            "numbered_directory_ratio": {"max": 0.3, "weight": 0.6},
            "semantic_directory_ratio": {"max": 0.3, "weight": 0.5},
            "index_file_count": {"min": 0, "weight": 0.3},
            "fallback_score": 0.6  # Default choice
        }

    # ------------------------------------------------------------------
    # Result helpers
    # ------------------------------------------------------------------

    def _explode_failure(self, output_dir: Path, errors: List[str]) -> ExplodeResult:
        """Build the ExplodeResult returned for any failed explode."""
        return ExplodeResult(
            success=False,
            output_directory=output_dir,
            files_created=[],
            manifest_path=None,
            warnings=[],
            errors=errors,
            variant_used=self.variant_type
        )

    def _implode_failure(self, output_file: Path, errors: List[str]) -> ImplodeResult:
        """Build the ImplodeResult returned for any failed implode."""
        return ImplodeResult(
            success=False,
            output_file=output_file,
            files_processed=[],
            variant_detected=self.variant_type,
            warnings=[],
            errors=errors
        )

    # ------------------------------------------------------------------
    # Shared parsing / naming helpers
    # ------------------------------------------------------------------

    @classmethod
    def _scan_lines(cls, lines: List[str]) -> List[tuple]:
        """
        Classify each line as a heading or not, ignoring fenced code blocks.

        Returns:
            One ``(index, line, heading)`` tuple per input line, where
            ``heading`` is ``(level, title)`` for a heading line and ``None``
            otherwise. Lines between a fence opener and its matching closer
            (same character, at least as long, nothing but whitespace after
            it) are never headings; an unclosed fence runs to the end.
        """
        scanned = []
        open_fence = None  # (fence character, fence length) while inside a block

        for index, line in enumerate(lines):
            fence_match = cls._FENCE_RE.match(line)

            if open_fence is not None:
                if fence_match:
                    marker = fence_match.group(1)
                    trailing = line[fence_match.end():]
                    if (marker[0] == open_fence[0]
                            and len(marker) >= open_fence[1]
                            and not trailing.strip()):
                        open_fence = None
                scanned.append((index, line, None))
                continue

            if fence_match:
                marker = fence_match.group(1)
                trailing = line[fence_match.end():]
                # A backtick fence whose info string contains a backtick is
                # inline code, not a fence opener.
                if not (marker[0] == '`' and '`' in trailing):
                    open_fence = (marker[0], len(marker))
                    scanned.append((index, line, None))
                    continue

            heading_match = cls._HEADING_RE.match(line)
            if heading_match:
                heading = (len(heading_match.group(1)), heading_match.group(2).strip())
                scanned.append((index, line, heading))
            else:
                scanned.append((index, line, None))

        return scanned

    def _split_front_matter(self, content: str, options: ExplodeOptions) -> tuple:
        """
        Separate front matter from the body according to the explode options.

        Returns:
            ``(frontmatter, body)``. ``frontmatter`` is ``None`` and ``body``
            is the unmodified ``content`` when front matter preservation is
            disabled or the parser finds no front matter.
        """
        if not options.preserve_front_matter:
            return None, content

        frontmatter, remainder = self.frontmatter_parser.separate_frontmatter_and_content(content)
        if not frontmatter:
            return None, content
        return frontmatter, remainder

    @staticmethod
    def _line_offset(content: str, body: str) -> int:
        """
        Number of lines of ``content`` that precede ``body``.

        Used to keep manifest line numbers relative to the original file
        after front matter was stripped. Returns 0 when ``body`` cannot be
        located inside ``content``.
        """
        if body == content:
            return 0
        start = content.rfind(body)
        if start < 0:
            return 0
        return content.count('\n', 0, start)

    @staticmethod
    def _unique_name(base: str, used: set) -> str:
        """
        Return ``base``, or ``base_2``, ``base_3``, ... if already in ``used``.

        The returned name is recorded in ``used``.
        """
        candidate = base
        suffix = 2
        while candidate in used:
            candidate = f"{base}_{suffix}"
            suffix += 1
        used.add(candidate)
        return candidate

    @staticmethod
    def _new_name_registries() -> tuple:
        """
        Create the ``(used_dirs, used_files)`` registries for one document.

        ``manifest`` is reserved so that a section titled "Manifest" is not
        overwritten by (or mistaken for) ``manifest.md``.
        """
        return set(), {"manifest"}

    def _section_relative_path(
        self,
        level: int,
        title: str,
        used_dirs: set,
        used_files: set
    ) -> str:
        """
        Compute the collision-free path of a section relative to the output dir.

        This is the single source of truth for section paths: both file
        creation and manifest generation call it, in document order, so the
        manifest always points at the files that were actually written.
        """
        safe_title = self._sanitize_filename(title)
        if level == 1:
            return f"{self._unique_name(safe_title, used_dirs)}/index.md"
        return f"{self._unique_name(safe_title, used_files)}.md"

    def _is_section_file(self, path: Path, output_file: Optional[Path]) -> bool:
        """True unless ``path`` is the manifest or the implode output file."""
        if path.name == self._MANIFEST_NAME:
            return False
        return output_file is None or path.resolve() != output_file.resolve()

    # ------------------------------------------------------------------
    # Explode / implode implementations
    # ------------------------------------------------------------------

    def _explode_flat_structure(
        self,
        input_file: Path,
        output_dir: Path,
        content: str,
        options: ExplodeOptions
    ) -> List[Path]:
        """
        Implement flat structure explosion directly.

        Writes ``_frontmatter.yml`` when front matter is present and
        preservation is enabled, then one file per section: h1 sections go to
        ``<title>/index.md`` and all other sections to ``<title>.md`` in
        ``output_dir``.

        Returns:
            Paths of all files written, in creation order.
        """
        files_created: List[Path] = []

        frontmatter, content = self._split_front_matter(content, options)
        if frontmatter:
            import yaml  # local import, as in the original implementation

            fm_file = output_dir / self._FRONTMATTER_NAME
            # allow_unicode keeps non-ASCII values readable instead of
            # emitting \uXXXX escapes.
            fm_content = yaml.dump(frontmatter, default_flow_style=False, allow_unicode=True)
            fm_file.write_text(fm_content, encoding='utf-8')
            files_created.append(fm_file)

        used_dirs, used_files = self._new_name_registries()

        for section in self._parse_flat_sections(content):
            target = output_dir / self._section_relative_path(
                section['level'], section['title'], used_dirs, used_files
            )

            if section['level'] != 1:
                # Standalone section (not an h1)
                target.write_text(section['content'], encoding='utf-8')
                files_created.append(target)
                continue

            section_dir = target.parent
            section_dir.mkdir(exist_ok=True)

            main_content, subsections = self._extract_content_and_subsections(
                section['content'], section['level']
            )
            target.write_text(main_content, encoding='utf-8')
            files_created.append(target)

            # 'index' is reserved for the section's own index.md.
            used_in_dir = {"index"}
            for subsection in subsections:
                sub_name = self._unique_name(
                    self._sanitize_filename(subsection['title']), used_in_dir
                )
                sub_file = section_dir / f"{sub_name}.md"
                sub_file.write_text(subsection['content'], encoding='utf-8')
                files_created.append(sub_file)

        return files_created

    def _implode_flat_structure(
        self,
        input_directory: Path,
        manifest_data: Any,
        options: ImplodeOptions
    ) -> tuple[str, List[Path]]:
        """
        Implement flat structure implosion directly.

        With a manifest, files are read in manifest ``order``; each path is
        read at most once. Without one, every ``*.md`` file under the
        directory is read in path order. The manifest file and the output
        file itself are always skipped. Parts are stripped of leading and
        trailing newlines and joined with ``section_spacing + 1`` newlines;
        the contents of ``_frontmatter.yml`` are prepended between ``---``
        markers when front matter preservation is enabled.

        Returns:
            ``(content, files_processed)``
        """
        content_parts: List[str] = []
        files_processed: List[Path] = []
        output_file = options.output_file

        if manifest_data and hasattr(manifest_data, 'structure'):
            # Use manifest to determine file order
            seen: set = set()
            for entry in sorted(manifest_data.structure, key=lambda x: x.order):
                file_path = input_directory / entry.path
                if file_path in seen:
                    # Manifests written before collision-safe naming may
                    # list one path several times; emit the content once.
                    continue
                if not file_path.exists() or not self._is_section_file(file_path, output_file):
                    continue
                seen.add(file_path)
                content_parts.append(file_path.read_text(encoding='utf-8'))
                files_processed.append(file_path)
        else:
            # Fallback: collect all markdown files recursively (legacy behavior)
            all_md_files = [
                md_file for md_file in input_directory.rglob("*.md")
                if self._is_section_file(md_file, output_file)
            ]
            # Sort files by their relative path to ensure consistent ordering
            all_md_files.sort(key=lambda f: str(f.relative_to(input_directory)))

            for md_file in all_md_files:
                content_parts.append(md_file.read_text(encoding='utf-8'))
                files_processed.append(md_file)

        # Front matter saved by explode (or by the old explode system)
        front_matter_text = None
        fm_file = input_directory / self._FRONTMATTER_NAME
        if options.preserve_front_matter and fm_file.exists():
            try:
                front_matter_text = fm_file.read_text(encoding='utf-8').strip()
            except (OSError, UnicodeDecodeError) as exc:
                # Best effort: implode continues without front matter, but
                # the failure is no longer silent.
                logging.getLogger(__name__).warning(
                    "Could not read front matter file %s: %s", fm_file, exc
                )

        # Drop leading/trailing newlines of each part; keep internal structure
        normalized_parts = [part.strip('\r\n') for part in content_parts if part]
        normalized_parts = [part for part in normalized_parts if part]

        spacing = '\n' * (options.section_spacing + 1)
        full_content = spacing.join(normalized_parts)

        if front_matter_text:
            full_content = f"---\n{front_matter_text}\n---\n\n{full_content}"

        return full_content, files_processed

    def _parse_flat_sections(self, content: str) -> List[Dict[str, Any]]:
        """
        Parse content into sections for flat structure.

        A new section starts at every heading line outside a fenced code
        block. Each section dict has ``level``, ``title``, ``order``
        (1-based), ``start_line`` (1-based) and ``content`` (the heading line
        plus everything up to the next heading). Lines before the first
        heading do not belong to any section and are dropped.
        """
        sections: List[Dict[str, Any]] = []
        current_section = None
        current_lines: List[str] = []
        section_order = 1

        for index, line, heading in self._scan_lines(content.split('\n')):
            if heading is None:
                if current_section is not None:
                    current_lines.append(line)
                continue

            # Save previous section
            if current_section is not None:
                current_section['content'] = '\n'.join(current_lines)
                sections.append(current_section)

            level, title = heading
            current_section = {
                'level': level,
                'title': title,
                'order': section_order,
                'start_line': index + 1
            }
            current_lines = [line]
            section_order += 1

        # Handle last section
        if current_section is not None:
            current_section['content'] = '\n'.join(current_lines)
            sections.append(current_section)

        return sections

    def _extract_content_and_subsections(self, content: str, parent_level: int) -> tuple[str, List[Dict[str, Any]]]:
        """
        Extract main content and subsections from a section.

        Headings deeper than ``parent_level`` (outside fenced code blocks)
        start a subsection; headings at ``parent_level`` or shallower are
        kept in the main content.

        Returns:
            ``(main_content, subsections)`` where each subsection dict has
            ``level``, ``title`` and ``content``.
        """
        main_content_lines: List[str] = []
        subsections: List[Dict[str, Any]] = []
        current_subsection = None
        current_subsection_lines: List[str] = []

        for _, line, heading in self._scan_lines(content.split('\n')):
            if heading is not None and heading[0] > parent_level:
                if current_subsection is not None:
                    # Save previous subsection
                    current_subsection['content'] = '\n'.join(current_subsection_lines)
                    subsections.append(current_subsection)
                current_subsection = {'level': heading[0], 'title': heading[1]}
                current_subsection_lines = [line]
            elif heading is not None:
                # The main section heading itself, or a shallower heading
                main_content_lines.append(line)
            elif current_subsection is not None:
                current_subsection_lines.append(line)
            else:
                main_content_lines.append(line)

        # Handle last subsection
        if current_subsection is not None:
            current_subsection['content'] = '\n'.join(current_subsection_lines)
            subsections.append(current_subsection)

        return '\n'.join(main_content_lines), subsections

    def _sanitize_filename(self, title: str) -> str:
        """
        Sanitize a title for use as a filename.

        Strips leading ``#`` markers, removes every character that is not an
        ASCII letter, digit, whitespace, hyphen or underscore, collapses
        whitespace/hyphen runs to ``_``, lowercases, trims underscores and
        truncates to 50 characters. Returns ``'untitled'`` when nothing is
        left.
        """
        title = re.sub(r'^#+\s*', '', title)
        safe_title = re.sub(r'[^a-zA-Z0-9\s\-_]', '', title)
        safe_title = re.sub(r'[\s\-]+', '_', safe_title)
        safe_title = safe_title.lower().strip('_')
        if len(safe_title) > 50:
            safe_title = safe_title[:50].rstrip('_')
        return safe_title or 'untitled'

    def _basic_explode_implementation(
        self,
        input_file: Path,
        output_dir: Path,
        content: str
    ) -> List[Path]:
        """
        Basic explode implementation for testing purposes.

        Splits the content on h1 headings only and writes each chunk to
        ``<title>/index.md`` under ``output_dir``.
        """
        files_created: List[Path] = []

        # Simple h1-based splitting
        for position, section in enumerate(re.split(r'\n# ', content)):
            if not section.strip():
                continue

            # The split consumed the '# ' of every chunk but the first; the
            # first chunk only needs it when it does not already start with '#'.
            if position > 0 or not section.startswith('#'):
                section = '# ' + section

            title_line = section.split('\n')[0]
            title = re.sub(r'^#\s*', '', title_line).strip()

            safe_title = re.sub(r'[^\w\s-]', '', title).strip()
            safe_title = re.sub(r'[-\s]+', '_', safe_title).lower()

            section_dir = output_dir / safe_title
            section_dir.mkdir(exist_ok=True)

            file_path = section_dir / "index.md"
            file_path.write_text(section, encoding='utf-8')
            files_created.append(file_path)

        return files_created

    def _basic_implode_implementation(self, input_directory: Path) -> tuple[str, List[Path]]:
        """
        Basic implode implementation for testing purposes.

        Concatenates every ``*.md`` file under the directory (except
        ``manifest.md``) in sorted path order, separated by four newlines.
        """
        content_parts: List[str] = []
        files_processed: List[Path] = []

        for file_path in sorted(input_directory.glob("**/*.md")):
            if file_path.name == "manifest.md":
                continue
            content_parts.append(file_path.read_text(encoding='utf-8'))
            files_processed.append(file_path)

        return '\n\n\n\n'.join(content_parts), files_processed

    def _analyze_structure(
        self,
        content: str,
        output_dir: Path,
        line_offset: int = 0
    ) -> List[StructureEntry]:
        """
        Analyze the content structure for manifest generation.

        Produces one entry per heading (outside fenced code blocks), using
        the same path derivation as file creation so that every entry points
        at the file written for that heading.

        Args:
            content: The text that was split into section files
            output_dir: Output directory (not used by the analysis itself)
            line_offset: Number of lines that precede ``content`` in the
                original file; added to each entry's ``original_line``

        Returns:
            Structure entries in document order, with 1-based ``order``.
        """
        structure: List[StructureEntry] = []
        used_dirs, used_files = self._new_name_registries()

        for index, _line, heading in self._scan_lines(content.split('\n')):
            if heading is None:
                continue

            level, title = heading
            structure.append(StructureEntry(
                type=f"h{level}",
                title=title,
                path=self._section_relative_path(level, title, used_dirs, used_files),
                order=len(structure) + 1,
                level=level,
                original_line=index + 1 + line_offset
            ))

        return structure