Files
markitect-main/markitect/explode_variants/semantic_variant.py
tegwick 4f16166e94 feat: implement comprehensive front matter preservation and unicode handling
This commit provides complete front matter support and fixes unicode character
handling across all explode-implode variants (flat, hierarchical, semantic).

## Front Matter Implementation
- Added FrontmatterParser integration to all three variants
- Extract front matter during explosion to `_frontmatter.yml` files
- Restore front matter during implosion by prepending to content
- Support for YAML front matter with proper type preservation
- Handles strings, arrays, dates, and other YAML data types

## Unicode Character Fixes
- Fixed filename sanitization inconsistency in flat variant
- Used consistent `_sanitize_filename()` method for both file creation and manifest paths
- Resolved issue where unicode characters in headings caused empty reconstructed files
- Ensured proper handling of emojis and special characters in content

## CLI Integration
- Updated CLI implode command to use variant system instead of legacy concatenation
- Fixed default output file naming to use `_imploded.md` suffix
- Enhanced DocumentManager with missing `get_file` method for database integration
- Improved processing info and preview support for dry-run mode

## Test Coverage
- Reactivated `test_issue_149_roundtrip_validation.py` front matter test
- Updated tests to use semantic equivalence checking instead of exact string matching
- Fixed all 3 failing tests in `test_roundtrip_consolidated.py`
- All 10 roundtrip tests and 11 Issue #149 validation tests now pass

## Technical Improvements
- Better content normalization with preserved internal structure
- Enhanced recursive directory processing for deep nesting scenarios
- Fixed variable naming conflicts in variant file creation logic
- Improved error handling and graceful fallbacks for front matter processing

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-13 20:26:08 +02:00

688 lines
24 KiB
Python

"""
Semantic variant implementation for explode-implode operations.
This variant creates content-based directory groupings that reflect the
semantic structure of the document, organizing by meaning rather than order.
"""
import re
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple, Set
from .base_variant import (
BaseVariant, ExplodeOptions, ImplodeOptions,
ExplodeResult, ImplodeResult
)
from .enums import ExplodeVariant
from .manifest_manager import ManifestManager, StructureEntry
from ..matter_frontmatter.parser import FrontmatterParser
class SemanticVariant(BaseVariant):
    """
    Semantic variant implementation.

    Creates content-based directory groupings that organize content by
    semantic meaning rather than document order. Groups related content
    together based on keywords and content analysis.

    Structure example:

        book.mdd/
        ├── manifest.md
        ├── introduction/
        │   ├── overview.md
        │   ├── scope.md
        │   └── objectives.md
        ├── chapters/
        │   ├── fundamentals.md
        │   ├── advanced_topics.md
        │   └── case_studies.md
        ├── appendices/
        │   ├── references.md
        │   ├── glossary.md
        │   └── index.md
        └── conclusion/
            └── summary.md
    """
# Semantic group definitions.
#
# Each key is a group name and doubles as the directory name created for that
# group during explosion. Per group:
#   'keywords': plain strings tested with the ``in`` operator (substring match)
#               against the lower-cased section title and content, and against
#               directory names in ``can_handle_directory``.
#   'patterns': regular expressions searched in the combined title + content.
#   'order':    sort rank used when groups/directories are processed; names
#               that are not listed here fall back to rank 999.
# Note that 'summary' is listed under both 'introduction' and 'conclusion'.
SEMANTIC_GROUPS = {
    'introduction': {
        'keywords': ['introduction', 'overview', 'preface', 'foreword', 'abstract',
                     'summary', 'about', 'welcome', 'getting started'],
        'patterns': [r'intro', r'begin', r'start', r'overview'],
        'order': 1
    },
    'chapters': {
        'keywords': ['chapter', 'section', 'part', 'topic', 'lesson', 'content',
                     'main', 'core', 'body', 'details'],
        'patterns': [r'chapter\s*\d+', r'part\s*\d+', r'section\s*\d+'],
        'order': 2
    },
    'tutorials': {
        'keywords': ['tutorial', 'guide', 'howto', 'how-to', 'walkthrough',
                     'example', 'demo', 'practice', 'exercise'],
        'patterns': [r'tutorial', r'guide', r'how\s*to', r'step\s*by\s*step'],
        'order': 3
    },
    'reference': {
        'keywords': ['reference', 'api', 'documentation', 'spec', 'specification',
                     'manual', 'docs', 'command', 'function'],
        'patterns': [r'api', r'reference', r'spec', r'manual'],
        'order': 4
    },
    'appendices': {
        'keywords': ['appendix', 'appendices', 'glossary', 'index', 'bibliography',
                     'references', 'credits', 'acknowledgments', 'notes'],
        'patterns': [r'appendix', r'glossary', r'bibliography'],
        'order': 5
    },
    'conclusion': {
        'keywords': ['conclusion', 'summary', 'final', 'end', 'closing',
                     'wrap-up', 'takeaway', 'results', 'outcome'],
        'patterns': [r'conclusion', r'summary', r'final', r'end'],
        'order': 6
    }
}
def __init__(self):
    """Initialize the semantic variant.

    Registers this variant with the base class as ``ExplodeVariant.SEMANTIC``
    and creates the two collaborators the other methods rely on.
    """
    super().__init__(ExplodeVariant.SEMANTIC)
    # Used to create the manifest on explode and read it back on implode.
    self.manifest_manager = ManifestManager()
    # Used by explode() to split front matter from the markdown body.
    self.frontmatter_parser = FrontmatterParser()
@property
def name(self) -> str:
    """Human-readable name of the variant (the constant "Semantic Structure")."""
    return "Semantic Structure"
@property
def description(self) -> str:
    """Description of the variant's behavior, as a fixed human-readable sentence pair."""
    # Adjacent string literals are concatenated into one string.
    return ("Creates content-based directory groupings that organize content by "
            "semantic meaning. Groups related content together based on keywords "
            "and content analysis.")
def explode(
    self,
    input_file: Path,
    options: ExplodeOptions
) -> ExplodeResult:
    """
    Explode a markdown file using the semantic structure variant.

    When ``options.preserve_front_matter`` is set and the file has front
    matter, it is written to ``_frontmatter.yml`` in the output directory.
    The remaining content is split into sections, the sections are grouped
    semantically, and one file per section is written below one directory
    per group. A manifest is added when ``options.create_manifest`` is set.

    Args:
        input_file: Path to the markdown file to explode
        options: Options controlling the explode operation

    Returns:
        Result of the explode operation. Validation problems, directory
        creation problems and any exception raised while processing are
        reported through ``success=False`` and ``errors``.
    """
    # Validate input
    validation_errors = self.validate_input_file(input_file)
    if validation_errors:
        return ExplodeResult(
            success=False,
            output_directory=options.output_dir or Path(),
            files_created=[],
            manifest_path=None,
            warnings=[],
            errors=validation_errors,
            variant_used=self.variant_type
        )

    # Determine output directory: explicit option wins, otherwise derive it
    # from the input file name (".mdd" when a manifest will be written).
    if options.output_dir:
        output_dir = options.output_dir
    else:
        suffix = ".mdd" if options.create_manifest else "_exploded"
        output_dir = input_file.parent / f"{input_file.stem}{suffix}"

    # Create output directory
    creation_errors = self.create_output_directory(output_dir, overwrite=True)
    if creation_errors:
        return ExplodeResult(
            success=False,
            output_directory=output_dir,
            files_created=[],
            manifest_path=None,
            warnings=[],
            errors=creation_errors,
            variant_used=self.variant_type
        )

    try:
        # Parse the markdown content
        content = input_file.read_text(encoding='utf-8')

        # Extract and save front matter if present and preservation is enabled
        files_created = []
        if options.preserve_front_matter:
            frontmatter, content_without_fm = (
                self.frontmatter_parser.separate_frontmatter_and_content(content)
            )
            if frontmatter:
                # Save front matter to _frontmatter.yml
                import yaml
                fm_file = output_dir / "_frontmatter.yml"
                # allow_unicode keeps non-ASCII text readable instead of
                # emitting escape sequences; sort_keys=False keeps the
                # author's key order. implode() prepends this file's text
                # verbatim, so both matter for a faithful round trip.
                fm_content = yaml.dump(
                    frontmatter,
                    default_flow_style=False,
                    allow_unicode=True,
                    sort_keys=False,
                )
                fm_file.write_text(fm_content, encoding='utf-8')
                files_created.append(fm_file)
            # Use content without front matter for processing
            content = content_without_fm

        # Analyze document structure and classify sections semantically
        sections = self._parse_semantic_structure(content)

        # Group sections by semantic meaning
        semantic_groups = self._group_sections_semantically(sections)

        # Create semantic directory structure
        semantic_files = self._create_semantic_structure(
            output_dir, semantic_groups, options
        )

        # Create manifest if requested
        manifest_path = None
        if options.create_manifest:
            structure = self._build_structure_entries(semantic_groups)
            manifest_path = self.manifest_manager.create_manifest(
                output_dir=output_dir,
                original_file=input_file,
                variant=self.variant_type,
                structure=structure,
                preservation_options={
                    "front_matter": options.preserve_front_matter,
                    "section_order": True,
                    "heading_levels": True,
                    "semantic_grouping": True
                }
            )
            semantic_files.append(manifest_path)

        # Combine all created files (front matter first, then sections/manifest)
        all_files = files_created + semantic_files
        return ExplodeResult(
            success=True,
            output_directory=output_dir,
            files_created=all_files,
            manifest_path=manifest_path,
            warnings=[],
            errors=[],
            variant_used=self.variant_type
        )
    except Exception as e:
        # Boundary handler: callers receive a failed result, not an exception.
        return ExplodeResult(
            success=False,
            output_directory=output_dir,
            files_created=[],
            manifest_path=None,
            warnings=[],
            errors=[f"Error during semantic explosion: {e}"],
            variant_used=self.variant_type
        )
def implode(
    self,
    input_directory: Path,
    options: ImplodeOptions
) -> ImplodeResult:
    """
    Implode a semantic directory structure back into a markdown file.

    Section files are concatenated by ``_reconstruct_from_semantics``. When
    ``options.preserve_front_matter`` is set and ``_frontmatter.yml`` exists
    in the directory, its text is prepended between ``---`` delimiters. The
    result is written to the output file unless ``options.dry_run`` is set.

    Args:
        input_directory: Path to the directory to implode
        options: Options controlling the implode operation

    Returns:
        Result of the implode operation. A front matter file that cannot be
        read does not fail the operation; it is reported in ``warnings`` and
        the document is produced without front matter.
    """
    # Validate input
    validation_errors = self.validate_input_directory(input_directory)
    if validation_errors:
        return ImplodeResult(
            success=False,
            output_file=options.output_file or Path(),
            files_processed=[],
            variant_detected=self.variant_type,
            warnings=[],
            errors=validation_errors
        )

    # Determine output file
    if options.output_file:
        output_file = options.output_file
    else:
        output_file = input_directory.parent / f"{input_directory.name}_imploded.md"

    warnings: List[str] = []
    try:
        # Read manifest if available
        manifest_data = self.manifest_manager.read_manifest(input_directory)

        # Reconstruct content from semantic structure
        content, files_processed = self._reconstruct_from_semantics(
            input_directory, manifest_data, options
        )

        # Add front matter if present and preservation is enabled
        if options.preserve_front_matter:
            fm_file = input_directory / '_frontmatter.yml'
            if fm_file.exists():
                try:
                    frontmatter_content = fm_file.read_text(encoding='utf-8').strip()
                except (OSError, UnicodeError) as fm_error:
                    # Best effort: keep going without front matter, but tell
                    # the caller instead of dropping it silently.
                    warnings.append(
                        f"Could not read front matter from {fm_file}: {fm_error}"
                    )
                else:
                    content = f"---\n{frontmatter_content}\n---\n\n{content}"

        # Write output file
        if not options.dry_run:
            output_file.write_text(content, encoding='utf-8')

        return ImplodeResult(
            success=True,
            output_file=output_file,
            files_processed=files_processed,
            variant_detected=self.variant_type,
            warnings=warnings,
            errors=[]
        )
    except Exception as e:
        # Boundary handler: callers receive a failed result, not an exception.
        return ImplodeResult(
            success=False,
            output_file=output_file,
            files_processed=[],
            variant_detected=self.variant_type,
            warnings=warnings,
            errors=[f"Error during semantic implosion: {e}"]
        )
def can_handle_directory(self, directory: Path) -> bool:
    """
    Decide whether a directory looks like output of the semantic variant.

    A manifest whose ``explosion_type`` is ``"semantic"`` is accepted
    outright. Otherwise the names of the immediate sub-directories are
    compared (case-insensitively, by substring) against every keyword in
    ``SEMANTIC_GROUPS``.

    Args:
        directory: Path to the directory to check

    Returns:
        True when the manifest says "semantic", or when more than 40% of the
        sub-directories contain a semantic keyword; False otherwise,
        including for missing paths, non-directories and directories without
        sub-directories.
    """
    if not (directory.exists() and directory.is_dir()):
        return False

    # A manifest is the authoritative signal.
    manifest = self.manifest_manager.read_manifest(directory)
    if manifest and manifest.explosion_type == "semantic":
        return True

    # Heuristic fallback based on sub-directory names.
    child_dirs = [entry for entry in directory.iterdir() if entry.is_dir()]
    if not child_dirs:
        return False

    vocabulary = {
        keyword
        for spec in self.SEMANTIC_GROUPS.values()
        for keyword in spec['keywords']
    }
    hits = sum(
        1
        for child in child_dirs
        if any(keyword in child.name.lower() for keyword in vocabulary)
    )
    return hits / len(child_dirs) > 0.4
def get_detection_patterns(self) -> Dict[str, Any]:
    """
    Describe the signals used for auto-detecting this variant.

    Returns:
        Dictionary with the expected manifest type plus one entry per
        detection signal, each carrying a weight and, for the ratio
        signals, a ``min`` or ``max`` bound.
    """
    weighted_signals = {
        "semantic_directory_ratio": {"min": 0.4, "weight": 0.7},
        "keyword_matches": {"weight": 0.6},
        "numbered_directory_ratio": {"max": 0.2, "weight": 0.4},
        "semantic_patterns": {"weight": 0.8},
    }
    return {"manifest_type": "semantic", **weighted_signals}
def _parse_semantic_structure(self, content: str) -> List[Dict[str, Any]]:
"""
Parse markdown content into sections with semantic analysis.
Args:
content: Markdown content to parse
Returns:
List of section dictionaries with semantic information
"""
sections = []
lines = content.split('\n')
current_section = None
current_content = []
section_counter = 1
for i, line in enumerate(lines):
# Check for headings
heading_match = re.match(r'^(#{1,6})\s+(.+)', line)
if heading_match:
# Save previous section
if current_section:
current_section['content'] = '\n'.join(current_content)
current_section['end_line'] = i
# Analyze semantic meaning
current_section['semantic_info'] = self._analyze_semantic_meaning(
current_section['title'],
current_section['content']
)
sections.append(current_section)
# Start new section
level = len(heading_match.group(1))
title = heading_match.group(2).strip()
current_section = {
'level': level,
'title': title,
'start_line': i + 1,
'order': section_counter,
'parent': self._find_parent_section(sections, level)
}
current_content = [line]
section_counter += 1
else:
if current_content:
current_content.append(line)
# Handle last section
if current_section:
current_section['content'] = '\n'.join(current_content)
current_section['end_line'] = len(lines)
current_section['semantic_info'] = self._analyze_semantic_meaning(
current_section['title'],
current_section['content']
)
sections.append(current_section)
return sections
def _analyze_semantic_meaning(self, title: str, content: str) -> Dict[str, Any]:
"""
Analyze the semantic meaning of a section.
Args:
title: Section title
content: Section content
Returns:
Dictionary with semantic analysis results
"""
title_lower = title.lower()
content_lower = content.lower()
text_combined = f"{title_lower} {content_lower}"
# Score against each semantic group
group_scores = {}
for group_name, group_data in self.SEMANTIC_GROUPS.items():
score = 0.0
# Check keyword matches
for keyword in group_data['keywords']:
if keyword in title_lower:
score += 2.0 # Title matches are weighted higher
if keyword in content_lower:
score += 1.0
# Check pattern matches
for pattern in group_data['patterns']:
if re.search(pattern, text_combined, re.IGNORECASE):
score += 1.5
group_scores[group_name] = score
# Find best matching group
best_group = max(group_scores.keys(), key=lambda k: group_scores[k])
best_score = group_scores[best_group]
# Additional semantic features
features = {
'word_count': len(content.split()),
'has_code_blocks': '```' in content,
'has_lists': bool(re.search(r'^\s*[-*+]\s', content, re.MULTILINE)),
'has_numbered_lists': bool(re.search(r'^\s*\d+\.\s', content, re.MULTILINE)),
'heading_level_1_count': len(re.findall(r'^#\s', content, re.MULTILINE)),
'heading_level_2_count': len(re.findall(r'^##\s', content, re.MULTILINE))
}
return {
'best_group': best_group if best_score > 0 else 'chapters', # Default fallback
'confidence': min(best_score / 3.0, 1.0), # Normalize to 0-1
'group_scores': group_scores,
'features': features
}
def _find_parent_section(self, sections: List[Dict[str, Any]], level: int) -> Optional[str]:
"""
Find the parent section for the current heading level.
Args:
sections: Previously parsed sections
level: Current heading level
Returns:
Parent section title or None
"""
# Look for the most recent section with a lower level
for section in reversed(sections):
if section['level'] < level:
return section['title']
return None
def _group_sections_semantically(self, sections: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
"""
Group sections by their semantic meaning.
Args:
sections: Parsed sections with semantic analysis
Returns:
Dictionary of semantic groups containing sections
"""
groups = {group_name: [] for group_name in self.SEMANTIC_GROUPS.keys()}
# Add an 'other' group for unclassified content
groups['other'] = []
for section in sections:
semantic_info = section.get('semantic_info', {})
best_group = semantic_info.get('best_group', 'other')
confidence = semantic_info.get('confidence', 0.0)
# Only place in semantic group if confidence is reasonable
if confidence > 0.2 and best_group in groups:
groups[best_group].append(section)
else:
groups['other'].append(section)
# Remove empty groups
return {k: v for k, v in groups.items() if v}
def _create_semantic_structure(
self,
output_dir: Path,
semantic_groups: Dict[str, List[Dict[str, Any]]],
options: ExplodeOptions
) -> List[Path]:
"""
Create the semantic directory structure from grouped sections.
Args:
output_dir: Output directory for the structure
semantic_groups: Sections grouped by semantic meaning
options: Explode options
Returns:
List of created file paths
"""
files_created = []
# Process groups in semantic order
group_order = sorted(
semantic_groups.keys(),
key=lambda g: self.SEMANTIC_GROUPS.get(g, {}).get('order', 999)
)
for group_name in group_order:
sections = semantic_groups[group_name]
if not sections:
continue
# Create group directory
group_dir = output_dir / group_name
group_dir.mkdir(exist_ok=True)
# Process sections in this group
for section in sections:
# Generate filename from title
safe_title = self._sanitize_filename(section['title'])
filename = f"{safe_title}.md"
# Avoid conflicts
file_path = group_dir / filename
counter = 1
while file_path.exists():
base_name = safe_title
filename = f"{base_name}_{counter}.md"
file_path = group_dir / filename
counter += 1
# Write section content
file_path.write_text(section['content'], encoding='utf-8')
files_created.append(file_path)
return files_created
def _sanitize_filename(self, title: str) -> str:
"""
Sanitize a title for use as a filename.
Args:
title: Original title
Returns:
Sanitized filename
"""
# Remove markdown heading markers
title = re.sub(r'^#+\s*', '', title)
# Remove special characters
safe_title = re.sub(r'[^a-zA-Z0-9\s\-_]', '', title)
# Replace spaces and hyphens with underscores
safe_title = re.sub(r'[\s\-]+', '_', safe_title)
# Convert to lowercase
safe_title = safe_title.lower()
# Remove leading/trailing underscores
safe_title = safe_title.strip('_')
# Limit length
if len(safe_title) > 50:
safe_title = safe_title[:50].rstrip('_')
return safe_title or 'untitled'
def _build_structure_entries(self, semantic_groups: Dict[str, List[Dict[str, Any]]]) -> List[StructureEntry]:
    """
    Build structure entries for manifest from semantic groups.

    The path recorded for each section repeats the naming scheme of
    ``_create_semantic_structure``: the sanitized title, plus ``_1``,
    ``_2``, ... when an earlier section of the same group already took that
    file name. Without the suffix, sections with equal sanitized titles
    (for example titles made only of non-ASCII characters, which all become
    'untitled') would share one manifest path, so implode would repeat one
    file and lose the others.

    Each section dictionary also gets a ``group_name`` key as a side effect.

    Args:
        semantic_groups: Sections grouped by semantic meaning

    Returns:
        List of structure entries sorted by original document order
    """
    # Pair every section with the relative path of the file written for it.
    located_sections = []
    for group_name, sections in semantic_groups.items():
        taken: Set[str] = set()  # file names already used inside this group
        for section in sections:
            safe_title = self._sanitize_filename(section['title'])
            filename = f"{safe_title}.md"
            counter = 1
            while filename in taken:
                filename = f"{safe_title}_{counter}.md"
                counter += 1
            taken.add(filename)

            section['group_name'] = group_name
            located_sections.append((section, f"{group_name}/{filename}"))

    # Sort by original document order (using the 'order' field from parsing)
    located_sections.sort(key=lambda item: item[0].get('order', 0))

    # Create structure entries preserving original document order
    return [
        StructureEntry(
            type=f"h{section['level']}",
            title=section['title'],
            path=path,
            order=section.get('order', 0),  # Use original document order
            parent=section.get('parent'),
            level=section['level'],
            original_line=section.get('start_line')
        )
        for section, path in located_sections
    ]
def _reconstruct_from_semantics(
self,
input_directory: Path,
manifest_data: Any,
options: ImplodeOptions
) -> Tuple[str, List[Path]]:
"""
Reconstruct markdown content from semantic directory structure.
Args:
input_directory: Directory containing semantic structure
manifest_data: Manifest data if available
options: Implode options
Returns:
Tuple of (reconstructed_content, files_processed)
"""
content_parts = []
files_processed = []
# Get all directories and files and use manifest order to preserve original structure
if manifest_data and hasattr(manifest_data, 'structure'):
# Use manifest data to reconstruct in original document order
for entry in sorted(manifest_data.structure, key=lambda x: x.order):
file_path = input_directory / entry.path
if file_path.exists() and file_path.name != "manifest.md":
content = file_path.read_text(encoding='utf-8')
content_parts.append(content)
files_processed.append(file_path)
else:
# Fallback: process directories in semantic order
subdirs = [d for d in input_directory.iterdir() if d.is_dir()]
subdirs = sorted(subdirs,
key=lambda d: self.SEMANTIC_GROUPS.get(d.name, {}).get('order', 999))
for subdir in subdirs:
# Process markdown files in alphabetical order
md_files = sorted(subdir.glob("*.md"))
for md_file in md_files:
if md_file.name != "manifest.md":
content = md_file.read_text(encoding='utf-8')
content_parts.append(content)
files_processed.append(md_file)
# Join with appropriate spacing
spacing = '\n' * (options.section_spacing + 1)
full_content = spacing.join(content_parts)
return full_content, files_processed