Files
markitect-main/markitect/explode_variants/hierarchical_variant.py
tegwick 4f16166e94 feat: implement comprehensive front matter preservation and unicode handling
This commit provides complete front matter support and fixes unicode character
handling across all explode-implode variants (flat, hierarchical, semantic).

## Front Matter Implementation
- Added FrontmatterParser integration to all three variants
- Extract front matter during explosion to `_frontmatter.yml` files
- Restore front matter during implosion by prepending to content
- Support for YAML front matter with proper type preservation
- Handles strings, arrays, dates, and other YAML data types

## Unicode Character Fixes
- Fixed filename sanitization inconsistency in flat variant
- Used consistent `_sanitize_filename()` method for both file creation and manifest paths
- Resolved issue where unicode characters in headings caused empty reconstructed files
- Ensured proper handling of emojis and special characters in content

## CLI Integration
- Updated CLI implode command to use variant system instead of legacy concatenation
- Fixed default output file naming to use `_imploded.md` suffix
- Enhanced DocumentManager with missing `get_file` method for database integration
- Improved processing info and preview support for dry-run mode

## Test Coverage
- Reactivated `test_issue_149_roundtrip_validation.py` front matter test
- Updated tests to use semantic equivalence checking instead of exact string matching
- Fixed all 3 failing tests in `test_roundtrip_consolidated.py`
- All 10 roundtrip tests and 11 Issue #149 validation tests now pass

## Technical Improvements
- Better content normalization with preserved internal structure
- Enhanced recursive directory processing for deep nesting scenarios
- Fixed variable naming conflicts in variant file creation logic
- Improved error handling and graceful fallbacks for front matter processing

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-13 20:26:08 +02:00

659 lines
23 KiB
Python

"""
Hierarchical variant implementation for explode-implode operations.
This variant creates numbered directory structures with semantic hierarchy,
making it easier to understand document organization at a glance.
"""
import re
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple
from .base_variant import (
BaseVariant, ExplodeOptions, ImplodeOptions,
ExplodeResult, ImplodeResult
)
from .enums import ExplodeVariant
from .manifest_manager import ManifestManager, StructureEntry
from ..matter_frontmatter.parser import FrontmatterParser
class HierarchicalVariant(BaseVariant):
"""
Hierarchical variant implementation.
Creates numbered directory structures with nested organization.
This provides clear document hierarchy and natural ordering.
Structure example:
book.mdd/
├── manifest.md
├── 01_introduction/
│ ├── index.md
│ ├── 01_overview.md
│ └── 02_scope.md
├── 02_main_content/
│ ├── index.md
│ ├── 01_chapter_one.md
│ └── 02_chapter_two.md
└── 03_conclusion/
└── index.md
"""
def __init__(self):
    """Register as the hierarchical variant and create helper objects."""
    variant_kind = ExplodeVariant.HIERARCHICAL
    super().__init__(variant_kind)
    # Helper for reading/writing manifests (used by explode, implode and
    # directory detection).
    self.manifest_manager = ManifestManager()
    # Helper used by explode() to separate front matter from the body.
    self.frontmatter_parser = FrontmatterParser()
@property
def name(self) -> str:
    """Return the human-readable display name of this variant."""
    display_name = "Hierarchical Structure"
    return display_name
@property
def description(self) -> str:
    """Return a short description of what this variant produces."""
    sentences = (
        "Creates numbered directory structures with semantic hierarchy. ",
        "Provides clear document organization and natural ordering.",
    )
    return "".join(sentences)
def explode(
    self,
    input_file: Path,
    options: ExplodeOptions
) -> ExplodeResult:
    """
    Explode a markdown file using the hierarchical structure variant.

    The input file is validated, the output directory is created, optional
    front matter is split off into ``_frontmatter.yml``, and each parsed
    section is written into its own numbered directory. When
    ``options.create_manifest`` is set a manifest is written last.

    Args:
        input_file: Path to the markdown file to explode
        options: Options controlling the explode operation

    Returns:
        Result of the explode operation. Problems are reported through
        ``success=False`` and ``errors`` instead of being raised.
    """
    def failure(directory, errors):
        # Every failure path reports the same "nothing created" shape.
        return ExplodeResult(
            success=False,
            output_directory=directory,
            files_created=[],
            manifest_path=None,
            warnings=[],
            errors=errors,
            variant_used=self.variant_type
        )

    # Validate input
    validation_errors = self.validate_input_file(input_file)
    if validation_errors:
        return failure(options.output_dir or Path(), validation_errors)

    # Determine output directory: explicit option wins, otherwise derive it
    # from the input file name.
    if options.output_dir:
        output_dir = options.output_dir
    else:
        suffix = ".mdd" if options.create_manifest else "_exploded"
        output_dir = input_file.parent / f"{input_file.stem}{suffix}"

    # Create output directory
    creation_errors = self.create_output_directory(output_dir, overwrite=True)
    if creation_errors:
        return failure(output_dir, creation_errors)

    try:
        content = input_file.read_text(encoding='utf-8')

        # Extract and save front matter if present and preservation is enabled
        files_created = []
        if options.preserve_front_matter:
            frontmatter, content_without_fm = (
                self.frontmatter_parser.separate_frontmatter_and_content(content)
            )
            if frontmatter:
                import yaml  # only needed when there is front matter to save
                fm_file = output_dir / "_frontmatter.yml"
                # allow_unicode keeps non-ASCII text readable instead of
                # escaping it; sort_keys=False keeps the author's key order.
                # Both matter because implode() pastes this file back
                # verbatim as the document's front matter.
                fm_content = yaml.dump(
                    frontmatter,
                    default_flow_style=False,
                    allow_unicode=True,
                    sort_keys=False,
                )
                fm_file.write_text(fm_content, encoding='utf-8')
                files_created.append(fm_file)
            # Use content without front matter for processing
            content = content_without_fm

        # Analyze document structure and write it out
        sections = self._parse_hierarchical_structure(content)
        hierarchy_files = self._create_hierarchical_structure(
            output_dir, sections, options
        )

        # Create manifest if requested
        manifest_path = None
        if options.create_manifest:
            structure = self._build_structure_entries(sections)
            manifest_path = self.manifest_manager.create_manifest(
                output_dir=output_dir,
                original_file=input_file,
                variant=self.variant_type,
                structure=structure,
                preservation_options={
                    "front_matter": options.preserve_front_matter,
                    "section_order": True,
                    "heading_levels": True,
                    "numbering_scheme": "hierarchical"
                }
            )
            hierarchy_files.append(manifest_path)

        return ExplodeResult(
            success=True,
            output_directory=output_dir,
            files_created=files_created + hierarchy_files,
            manifest_path=manifest_path,
            warnings=[],
            errors=[],
            variant_used=self.variant_type
        )
    except Exception as e:
        # Top-level boundary: convert any failure into an error result.
        return failure(output_dir, [f"Error during hierarchical explosion: {e}"])
def implode(
    self,
    input_directory: Path,
    options: ImplodeOptions
) -> ImplodeResult:
    """
    Implode a hierarchical directory structure back into a markdown file.

    The section files are concatenated in manifest (or numbering) order.
    When ``options.preserve_front_matter`` is set and ``_frontmatter.yml``
    exists, its text is prepended between ``---`` delimiters. The result is
    written to disk unless ``options.dry_run`` is set.

    Args:
        input_directory: Path to the directory to implode
        options: Options controlling the implode operation

    Returns:
        Result of the implode operation. Problems are reported through
        ``success=False`` and ``errors``; a front matter file that cannot be
        read is reported in ``warnings`` and does not fail the operation.
    """
    # Validate input
    validation_errors = self.validate_input_directory(input_directory)
    if validation_errors:
        return ImplodeResult(
            success=False,
            output_file=options.output_file or Path(),
            files_processed=[],
            variant_detected=self.variant_type,
            warnings=[],
            errors=validation_errors
        )

    # Determine output file
    if options.output_file:
        output_file = options.output_file
    else:
        output_file = input_directory.parent / f"{input_directory.name}_imploded.md"

    warnings = []
    try:
        # Read manifest if available
        manifest_data = self.manifest_manager.read_manifest(input_directory)

        # Reconstruct content from hierarchical structure
        content, files_processed = self._reconstruct_from_hierarchy(
            input_directory, manifest_data, options
        )

        # Add front matter if present and preservation is enabled
        if options.preserve_front_matter:
            fm_file = input_directory / '_frontmatter.yml'
            if fm_file.exists():
                try:
                    frontmatter_content = fm_file.read_text(encoding='utf-8').strip()
                except (OSError, UnicodeError) as fm_error:
                    # Best effort: keep going without front matter, but tell
                    # the caller instead of dropping it silently.
                    warnings.append(
                        f"Could not restore front matter from {fm_file.name}: {fm_error}"
                    )
                else:
                    content = f"---\n{frontmatter_content}\n---\n\n{content}"

        # Write output file
        if not options.dry_run:
            output_file.write_text(content, encoding='utf-8')

        return ImplodeResult(
            success=True,
            output_file=output_file,
            files_processed=files_processed,
            variant_detected=self.variant_type,
            warnings=warnings,
            errors=[]
        )
    except Exception as e:
        # Top-level boundary: convert any failure into an error result.
        return ImplodeResult(
            success=False,
            output_file=output_file,
            files_processed=[],
            variant_detected=self.variant_type,
            warnings=warnings,
            errors=[f"Error during hierarchical implosion: {e}"]
        )
def can_handle_directory(self, directory: Path) -> bool:
    """
    Decide whether a directory looks like hierarchical explode output.

    Args:
        directory: Path to the directory to check

    Returns:
        True when the manifest declares the hierarchical type, or when more
        than 60% of the immediate subdirectories carry a ``<digits>_``
        prefix; False otherwise (including for missing paths).
    """
    if not (directory.exists() and directory.is_dir()):
        return False

    # A manifest that names this variant settles the question.
    manifest = self.manifest_manager.read_manifest(directory)
    if manifest and manifest.explosion_type == "hierarchical":
        return True

    # Otherwise fall back to the share of numbered subdirectories.
    child_dirs = [entry for entry in directory.iterdir() if entry.is_dir()]
    if not child_dirs:
        return False
    numbered = [entry for entry in child_dirs if re.match(r'^\d+_', entry.name)]
    return len(numbered) / len(child_dirs) > 0.6
def get_detection_patterns(self) -> Dict[str, Any]:
    """
    Describe the signals used to auto-detect this variant.

    Returns:
        Dictionary of detection patterns; most entries carry a ``weight``
        and, where applicable, a ``min`` threshold.
    """
    patterns: Dict[str, Any] = {"manifest_type": "hierarchical"}
    patterns["numbered_directory_ratio"] = {"min": 0.6, "weight": 0.8}
    patterns["index_file_count"] = {"min": 2, "weight": 0.5}
    patterns["max_depth"] = {"min": 2, "weight": 0.4}
    patterns["nested_numbered_dirs"] = {"weight": 0.7}
    return patterns
def _parse_hierarchical_structure(self, content: str) -> List[Dict[str, Any]]:
"""
Parse markdown content into hierarchical sections.
Args:
content: Markdown content to parse
Returns:
List of section dictionaries with hierarchy information
"""
sections = []
lines = content.split('\n')
current_section = None
current_content = []
section_counter = 1
for i, line in enumerate(lines):
# Check for headings
heading_match = re.match(r'^(#{1,6})\s+(.+)', line)
if heading_match:
# Save previous section
if current_section:
current_section['content'] = '\n'.join(current_content)
current_section['end_line'] = i
sections.append(current_section)
# Start new section
level = len(heading_match.group(1))
title = heading_match.group(2).strip()
current_section = {
'level': level,
'title': title,
'start_line': i + 1,
'order': section_counter,
'parent': self._find_parent_section(sections, level),
'numbering': self._generate_numbering(sections, level, section_counter)
}
current_content = [line]
section_counter += 1
else:
if current_content:
current_content.append(line)
# Handle last section
if current_section:
current_section['content'] = '\n'.join(current_content)
current_section['end_line'] = len(lines)
sections.append(current_section)
return sections
def _find_parent_section(self, sections: List[Dict[str, Any]], level: int) -> Optional[str]:
"""
Find the parent section for the current heading level.
Args:
sections: Previously parsed sections
level: Current heading level
Returns:
Parent section title or None
"""
# Look for the most recent section with a lower level
for section in reversed(sections):
if section['level'] < level:
return section['title']
return None
def _generate_numbering(self, sections: List[Dict[str, Any]], level: int, order: int) -> str:
"""
Generate hierarchical numbering for a section.
Args:
sections: Previously parsed sections
level: Current heading level
order: Overall section order
Returns:
Hierarchical numbering string (e.g., "01", "02_01", etc.)
"""
if level == 1:
# Count h1 sections
h1_count = sum(1 for s in sections if s['level'] == 1) + 1
return f"{h1_count:02d}"
# Find parent numbering and append subsection number
parent_title = self._find_parent_section(sections, level)
if parent_title:
parent_section = next((s for s in sections if s['title'] == parent_title), None)
if parent_section:
# Count subsections at this level under the same parent
subsection_count = sum(
1 for s in sections
if s['level'] == level and s.get('parent') == parent_title
) + 1
return f"{parent_section['numbering']}_{subsection_count:02d}"
# Fallback numbering
return f"{order:02d}"
def _create_hierarchical_structure(
self,
output_dir: Path,
sections: List[Dict[str, Any]],
options: ExplodeOptions
) -> List[Path]:
"""
Create the hierarchical directory structure from parsed sections.
Args:
output_dir: Output directory for the structure
sections: Parsed sections with hierarchy information
options: Explode options
Returns:
List of created file paths
"""
files_created = []
for section in sections:
# Generate directory name
safe_title = self._sanitize_filename(section['title'])
dir_name = f"{section['numbering']}_{safe_title}"
# Create section directory
section_dir = output_dir / dir_name
section_dir.mkdir(exist_ok=True)
# Create index.md for this section
index_path = section_dir / "index.md"
# Process content - extract subsections if any
main_content, subsections = self._extract_subsections(
section['content'], section['level']
)
# Write main content to index.md
index_path.write_text(main_content, encoding='utf-8')
files_created.append(index_path)
# Create files for subsections
for i, subsection in enumerate(subsections, 1):
subsection_title = subsection.get('title', f'subsection_{i}')
safe_sub_title = self._sanitize_filename(subsection_title)
sub_file_name = f"{i:02d}_{safe_sub_title}.md"
sub_file_path = section_dir / sub_file_name
sub_file_path.write_text(subsection['content'], encoding='utf-8')
files_created.append(sub_file_path)
return files_created
def _extract_subsections(self, content: str, parent_level: int) -> Tuple[str, List[Dict[str, Any]]]:
"""
Extract subsections from section content.
Args:
content: Section content
parent_level: Level of the parent section
Returns:
Tuple of (main_content, subsections_list)
"""
lines = content.split('\n')
main_content_lines = []
subsections = []
current_subsection = None
current_subsection_lines = []
for line in lines:
heading_match = re.match(r'^(#{1,6})\s+(.+)', line)
if heading_match:
level = len(heading_match.group(1))
title = heading_match.group(2).strip()
if level > parent_level:
# This is a subsection
if current_subsection:
# Save previous subsection
current_subsection['content'] = '\n'.join(current_subsection_lines)
subsections.append(current_subsection)
# Start new subsection
current_subsection = {
'level': level,
'title': title
}
current_subsection_lines = [line]
elif level <= parent_level:
# This is the main section heading or a peer section
if level == parent_level:
main_content_lines.append(line)
else:
# Higher-level heading that shouldn't be here in normal parsing
main_content_lines.append(line)
else:
# Regular content line
if current_subsection:
current_subsection_lines.append(line)
else:
main_content_lines.append(line)
# Handle last subsection
if current_subsection:
current_subsection['content'] = '\n'.join(current_subsection_lines)
subsections.append(current_subsection)
main_content = '\n'.join(main_content_lines)
return main_content, subsections
def _sanitize_filename(self, title: str) -> str:
"""
Sanitize a title for use as a filename/directory name.
Args:
title: Original title
Returns:
Sanitized filename
"""
# Remove special characters
safe_title = re.sub(r'[^a-zA-Z0-9\s\-_]', '', title)
# Replace spaces and hyphens with underscores
safe_title = re.sub(r'[\s\-]+', '_', safe_title)
# Convert to lowercase
safe_title = safe_title.lower()
# Remove leading/trailing underscores
safe_title = safe_title.strip('_')
# Limit length
if len(safe_title) > 50:
safe_title = safe_title[:50].rstrip('_')
return safe_title or 'untitled'
def _build_structure_entries(self, sections: List[Dict[str, Any]]) -> List[StructureEntry]:
"""
Build structure entries for manifest from parsed sections.
Args:
sections: Parsed sections
Returns:
List of structure entries
"""
entries = []
for section in sections:
safe_title = self._sanitize_filename(section['title'])
dir_name = f"{section['numbering']}_{safe_title}"
path = f"{dir_name}/index.md"
entry = StructureEntry(
type=f"h{section['level']}",
title=section['title'],
path=path,
order=section['order'],
parent=section.get('parent'),
level=section['level'],
original_line=section.get('start_line')
)
entries.append(entry)
return entries
def _reconstruct_from_hierarchy(
self,
input_directory: Path,
manifest_data: Any,
options: ImplodeOptions
) -> Tuple[str, List[Path]]:
"""
Reconstruct markdown content from hierarchical directory structure.
Args:
input_directory: Directory containing hierarchical structure
manifest_data: Manifest data if available
options: Implode options
Returns:
Tuple of (reconstructed_content, files_processed)
"""
content_parts = []
files_processed = []
# Get all directories and sort them properly
if manifest_data and hasattr(manifest_data, 'structure'):
# Use manifest data to determine proper order
subdirs = []
dir_mapping = {}
# Create mapping of directory names to Path objects
all_dirs = [d for d in input_directory.iterdir()
if d.is_dir() and not d.name.startswith('.')]
for d in all_dirs:
dir_mapping[d.name] = d
# Sort manifest entries by original order
for entry in sorted(manifest_data.structure, key=lambda x: x.order):
dir_name = Path(entry.path).parts[0] if entry.path else ""
if dir_name in dir_mapping and dir_mapping[dir_name] not in subdirs:
subdirs.append(dir_mapping[dir_name])
# Add any remaining directories not in manifest (fallback)
for d in all_dirs:
if d not in subdirs:
subdirs.append(d)
else:
# Fallback: sort by numbering prefix, then by name
subdirs = sorted([
d for d in input_directory.iterdir()
if d.is_dir() and not d.name.startswith('.')
], key=lambda d: (
int(d.name.split('_')[0]) if re.match(r'^\d+_', d.name) else 999,
d.name
))
for subdir in subdirs:
self._process_directory_recursively(subdir, content_parts, files_processed)
# Join with appropriate spacing
spacing = '\n' * (options.section_spacing + 1)
full_content = spacing.join(content_parts)
return full_content, files_processed
def _process_directory_recursively(self, directory: Path, content_parts: List[str], files_processed: List[Path]):
"""
Recursively process a directory and its subdirectories for hierarchical content.
Args:
directory: Directory to process
content_parts: List to append content to
files_processed: List to append processed files to
"""
# Read index.md if it exists
index_file = directory / "index.md"
if index_file.exists():
index_content = index_file.read_text(encoding='utf-8')
content_parts.append(index_content)
files_processed.append(index_file)
# Read other markdown files in this directory
md_files = sorted([
f for f in directory.glob("*.md")
if f.name != "index.md"
], key=lambda f: f.name)
for md_file in md_files:
file_content = md_file.read_text(encoding='utf-8')
content_parts.append(file_content)
files_processed.append(md_file)
# Recursively process subdirectories
subdirs = sorted([
d for d in directory.iterdir()
if d.is_dir() and not d.name.startswith('.')
], key=lambda d: (
int(d.name.split('_')[0]) if re.match(r'^\d+_', d.name) else 999,
d.name
))
for subdir in subdirs:
self._process_directory_recursively(subdir, content_parts, files_processed)