Files
markitect-main/markitect/explode_variants/semantic_variant.py
tegwick c17efc112d feat: complete Issue #149 - Phase 2: Implement Explode-Implode Variants
Implement all three explode-implode variants with full CLI integration:

🔧 Variant Implementations:
- FlatVariant: Encapsulates existing flat structure behavior
- HierarchicalVariant: Numbered directory structures (01_, 02_, 03_)
- SemanticVariant: Content-based organization (intro, chapters, appendices)

🏭 Factory System:
- VariantFactory: Centralized variant creation and management
- Auto-detection algorithms with confidence scoring
- Content analysis for variant recommendation

🖥️ CLI Integration:
- Enhanced md-explode command with --variant parameter
- Enhanced md-implode command with auto-detection
- Improved error handling and user feedback

🧪 Comprehensive Testing:
- 22 unit tests covering all variant functionality
- Roundtrip validation ensuring perfect reversibility
- Performance testing with large documents
- Error handling and edge case coverage

📊 Key Features:
- Three distinct organization strategies
- Automatic variant detection from directory structures
- Full backward compatibility with existing behavior
- Extensible architecture for future variants
- Manifest-based reversibility

Files Added:
- markitect/explode_variants/flat_variant.py
- markitect/explode_variants/hierarchical_variant.py
- markitect/explode_variants/semantic_variant.py
- markitect/explode_variants/variant_factory.py
- tests/test_issue_149_explode_implode_variants.py
- tests/test_issue_149_roundtrip_validation.py
- cost_notes/issue_149_cost_2025-10-12.md

Files Modified:
- markitect/explode_variants/__init__.py (updated exports)
- markitect/plugins/builtin/markdown_commands.py (CLI integration)

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-12 22:30:06 +02:00

670 lines
23 KiB
Python

"""
Semantic variant implementation for explode-implode operations.
This variant creates content-based directory groupings that reflect the
semantic structure of the document, organizing by meaning rather than order.
"""
import re
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple, Set
from .base_variant import (
BaseVariant, ExplodeOptions, ImplodeOptions,
ExplodeResult, ImplodeResult
)
from .enums import ExplodeVariant
from .manifest_manager import ManifestManager, StructureEntry
class SemanticVariant(BaseVariant):
    """
    Explode/implode variant that files sections under meaning-based folders.

    Rather than mirroring document order, each section is scored against the
    keyword and regex lists in ``SEMANTIC_GROUPS`` and stored in the directory
    of the group it matches best. Sections that match nothing convincingly
    are collected in an ``other`` directory.

    Structure example:
        book.mdd/
        ├── manifest.md
        ├── introduction/
        │   ├── overview.md
        │   ├── scope.md
        │   └── objectives.md
        ├── chapters/
        │   ├── fundamentals.md
        │   ├── advanced_topics.md
        │   └── case_studies.md
        ├── appendices/
        │   ├── references.md
        │   ├── glossary.md
        │   └── index.md
        └── conclusion/
            └── summary.md
    """

    # Semantic group definitions, keyed by the directory name each group gets.
    #   keywords: plain phrases looked up in a section's title and body
    #   patterns: regular expressions searched in title and body combined
    #   order:    sort rank used when groups are written and reassembled
    # NOTE: 'summary' is listed under both 'introduction' and 'conclusion'.
    SEMANTIC_GROUPS = {
        'introduction': {
            'keywords': [
                'introduction', 'overview', 'preface', 'foreword', 'abstract',
                'summary', 'about', 'welcome', 'getting started',
            ],
            'patterns': [r'intro', r'begin', r'start', r'overview'],
            'order': 1,
        },
        'chapters': {
            'keywords': [
                'chapter', 'section', 'part', 'topic', 'lesson', 'content',
                'main', 'core', 'body', 'details',
            ],
            'patterns': [r'chapter\s*\d+', r'part\s*\d+', r'section\s*\d+'],
            'order': 2,
        },
        'tutorials': {
            'keywords': [
                'tutorial', 'guide', 'howto', 'how-to', 'walkthrough',
                'example', 'demo', 'practice', 'exercise',
            ],
            'patterns': [r'tutorial', r'guide', r'how\s*to', r'step\s*by\s*step'],
            'order': 3,
        },
        'reference': {
            'keywords': [
                'reference', 'api', 'documentation', 'spec', 'specification',
                'manual', 'docs', 'command', 'function',
            ],
            'patterns': [r'api', r'reference', r'spec', r'manual'],
            'order': 4,
        },
        'appendices': {
            'keywords': [
                'appendix', 'appendices', 'glossary', 'index', 'bibliography',
                'references', 'credits', 'acknowledgments', 'notes',
            ],
            'patterns': [r'appendix', r'glossary', r'bibliography'],
            'order': 5,
        },
        'conclusion': {
            'keywords': [
                'conclusion', 'summary', 'final', 'end', 'closing',
                'wrap-up', 'takeaway', 'results', 'outcome',
            ],
            'patterns': [r'conclusion', r'summary', r'final', r'end'],
            'order': 6,
        },
    }
def __init__(self):
    """Register this instance as the SEMANTIC variant and attach a manifest manager."""
    variant_kind = ExplodeVariant.SEMANTIC
    super().__init__(variant_kind)
    # Shared by explode(), implode() and can_handle_directory() for manifest I/O.
    self.manifest_manager = ManifestManager()
@property
def name(self) -> str:
    """Display name used when this variant is presented to a person."""
    display_name = "Semantic Structure"
    return display_name
@property
def description(self) -> str:
    """One-paragraph explanation of how this variant organizes output."""
    summary = (
        "Creates content-based directory groupings that organize content by "
        "semantic meaning. "
        "Groups related content together based on keywords "
        "and content analysis."
    )
    return summary
def explode(
    self,
    input_file: Path,
    options: ExplodeOptions
) -> ExplodeResult:
    """
    Split a markdown file into per-section files grouped by semantic category.

    The output directory is ``options.output_dir`` when given; otherwise it is
    created next to the input as ``<stem>.mdd`` (manifest requested) or
    ``<stem>_exploded``. The directory is created with ``overwrite=True``.

    Args:
        input_file: Path to the markdown file to explode
        options: Options controlling the explode operation

    Returns:
        ExplodeResult describing the outcome. Validation problems, directory
        creation problems and any exception raised while processing are
        reported through ``errors`` with ``success=False`` rather than raised.
    """
    def failed(target_dir: Path, messages: List[str]) -> ExplodeResult:
        # Every failure path reports the same shape: nothing created, no manifest.
        return ExplodeResult(
            success=False,
            output_directory=target_dir,
            files_created=[],
            manifest_path=None,
            warnings=[],
            errors=messages,
            variant_used=self.variant_type
        )

    # Reject unusable input before touching the filesystem.
    input_problems = self.validate_input_file(input_file)
    if input_problems:
        return failed(options.output_dir or Path(), input_problems)

    # Resolve where the exploded tree goes.
    if options.output_dir:
        output_dir = options.output_dir
    else:
        suffix = ".mdd" if options.create_manifest else "_exploded"
        output_dir = input_file.parent / f"{input_file.stem}{suffix}"

    directory_problems = self.create_output_directory(output_dir, overwrite=True)
    if directory_problems:
        return failed(output_dir, directory_problems)

    try:
        # Parse -> classify -> group -> write.
        markdown_text = input_file.read_text(encoding='utf-8')
        parsed_sections = self._parse_semantic_structure(markdown_text)
        grouped_sections = self._group_sections_semantically(parsed_sections)
        written_files = self._create_semantic_structure(
            output_dir, grouped_sections, options
        )

        # The manifest is optional; when written it is also counted as a created file.
        manifest_path = None
        if options.create_manifest:
            structure = self._build_structure_entries(grouped_sections)
            manifest_path = self.manifest_manager.create_manifest(
                output_dir=output_dir,
                original_file=input_file,
                variant=self.variant_type,
                structure=structure,
                preservation_options={
                    "front_matter": options.preserve_front_matter,
                    "section_order": True,
                    "heading_levels": True,
                    "semantic_grouping": True
                }
            )
            written_files.append(manifest_path)

        return ExplodeResult(
            success=True,
            output_directory=output_dir,
            files_created=written_files,
            manifest_path=manifest_path,
            warnings=[],
            errors=[],
            variant_used=self.variant_type
        )
    except Exception as e:
        # Boundary handler: callers receive a result object, never an exception.
        return failed(output_dir, [f"Error during semantic explosion: {e}"])
def implode(
    self,
    input_directory: Path,
    options: ImplodeOptions
) -> ImplodeResult:
    """
    Reassemble a semantic directory tree into a single markdown file.

    The target is ``options.output_file`` when given; otherwise
    ``<directory name>_imploded.md`` beside the input directory. Nothing is
    written when ``options.dry_run`` is set.

    Args:
        input_directory: Path to the directory to implode
        options: Options controlling the implode operation

    Returns:
        ImplodeResult describing the outcome. Validation problems and any
        exception raised while processing are reported through ``errors``
        with ``success=False`` rather than raised.
    """
    def build_result(ok, target, processed, problems) -> ImplodeResult:
        # All outcomes share the detected variant and an empty warnings list.
        return ImplodeResult(
            success=ok,
            output_file=target,
            files_processed=processed,
            variant_detected=self.variant_type,
            warnings=[],
            errors=problems
        )

    directory_problems = self.validate_input_directory(input_directory)
    if directory_problems:
        return build_result(
            False, options.output_file or Path(), [], directory_problems
        )

    if options.output_file:
        output_file = options.output_file
    else:
        output_file = input_directory.parent / f"{input_directory.name}_imploded.md"

    try:
        # The manifest (if any) drives file selection and ordering.
        manifest_data = self.manifest_manager.read_manifest(input_directory)
        content, files_processed = self._reconstruct_from_semantics(
            input_directory, manifest_data, options
        )

        if not options.dry_run:
            output_file.write_text(content, encoding='utf-8')

        return build_result(True, output_file, files_processed, [])
    except Exception as e:
        # Boundary handler: callers receive a result object, never an exception.
        return build_result(
            False, output_file, [], [f"Error during semantic implosion: {e}"]
        )
def can_handle_directory(self, directory: Path) -> bool:
    """
    Decide whether a directory looks like output of the semantic variant.

    A manifest whose ``explosion_type`` is ``"semantic"`` is accepted
    outright. Without one, the directory qualifies when more than 40% of its
    immediate subdirectories have a name containing any group keyword.

    Args:
        directory: Path to the directory to check

    Returns:
        True if this variant can handle the directory
    """
    if not (directory.exists() and directory.is_dir()):
        return False

    # An explicit manifest is authoritative.
    manifest = self.manifest_manager.read_manifest(directory)
    if manifest and manifest.explosion_type == "semantic":
        return True

    child_dirs = [entry for entry in directory.iterdir() if entry.is_dir()]

    # Pool the keywords of every group into one vocabulary.
    vocabulary = {
        keyword
        for group_spec in self.SEMANTIC_GROUPS.values()
        for keyword in group_spec['keywords']
    }

    # Count subdirectories whose lowercased name contains any vocabulary entry.
    hits = sum(
        1
        for child in child_dirs
        if any(keyword in child.name.lower() for keyword in vocabulary)
    )

    # No subdirectories means a ratio of 0, which can never pass the threshold.
    ratio = hits / len(child_dirs) if child_dirs else 0
    return ratio > 0.4
def get_detection_patterns(self) -> Dict[str, Any]:
    """
    Describe the signals used to auto-detect this variant.

    Returns:
        Mapping of signal name to its thresholds and weight, plus the
        manifest type string that identifies this variant.
    """
    patterns: Dict[str, Any] = {"manifest_type": "semantic"}
    patterns["semantic_directory_ratio"] = {"min": 0.4, "weight": 0.7}
    patterns["keyword_matches"] = {"weight": 0.6}
    patterns["numbered_directory_ratio"] = {"max": 0.2, "weight": 0.4}
    patterns["semantic_patterns"] = {"weight": 0.8}
    return patterns
def _parse_semantic_structure(self, content: str) -> List[Dict[str, Any]]:
"""
Parse markdown content into sections with semantic analysis.
Args:
content: Markdown content to parse
Returns:
List of section dictionaries with semantic information
"""
sections = []
lines = content.split('\n')
current_section = None
current_content = []
section_counter = 1
for i, line in enumerate(lines):
# Check for headings
heading_match = re.match(r'^(#{1,6})\s+(.+)', line)
if heading_match:
# Save previous section
if current_section:
current_section['content'] = '\n'.join(current_content)
current_section['end_line'] = i
# Analyze semantic meaning
current_section['semantic_info'] = self._analyze_semantic_meaning(
current_section['title'],
current_section['content']
)
sections.append(current_section)
# Start new section
level = len(heading_match.group(1))
title = heading_match.group(2).strip()
current_section = {
'level': level,
'title': title,
'start_line': i + 1,
'order': section_counter,
'parent': self._find_parent_section(sections, level)
}
current_content = [line]
section_counter += 1
else:
if current_content:
current_content.append(line)
# Handle last section
if current_section:
current_section['content'] = '\n'.join(current_content)
current_section['end_line'] = len(lines)
current_section['semantic_info'] = self._analyze_semantic_meaning(
current_section['title'],
current_section['content']
)
sections.append(current_section)
return sections
def _analyze_semantic_meaning(self, title: str, content: str) -> Dict[str, Any]:
"""
Analyze the semantic meaning of a section.
Args:
title: Section title
content: Section content
Returns:
Dictionary with semantic analysis results
"""
title_lower = title.lower()
content_lower = content.lower()
text_combined = f"{title_lower} {content_lower}"
# Score against each semantic group
group_scores = {}
for group_name, group_data in self.SEMANTIC_GROUPS.items():
score = 0.0
# Check keyword matches
for keyword in group_data['keywords']:
if keyword in title_lower:
score += 2.0 # Title matches are weighted higher
if keyword in content_lower:
score += 1.0
# Check pattern matches
for pattern in group_data['patterns']:
if re.search(pattern, text_combined, re.IGNORECASE):
score += 1.5
group_scores[group_name] = score
# Find best matching group
best_group = max(group_scores.keys(), key=lambda k: group_scores[k])
best_score = group_scores[best_group]
# Additional semantic features
features = {
'word_count': len(content.split()),
'has_code_blocks': '```' in content,
'has_lists': bool(re.search(r'^\s*[-*+]\s', content, re.MULTILINE)),
'has_numbered_lists': bool(re.search(r'^\s*\d+\.\s', content, re.MULTILINE)),
'heading_level_1_count': len(re.findall(r'^#\s', content, re.MULTILINE)),
'heading_level_2_count': len(re.findall(r'^##\s', content, re.MULTILINE))
}
return {
'best_group': best_group if best_score > 0 else 'chapters', # Default fallback
'confidence': min(best_score / 3.0, 1.0), # Normalize to 0-1
'group_scores': group_scores,
'features': features
}
def _find_parent_section(self, sections: List[Dict[str, Any]], level: int) -> Optional[str]:
"""
Find the parent section for the current heading level.
Args:
sections: Previously parsed sections
level: Current heading level
Returns:
Parent section title or None
"""
# Look for the most recent section with a lower level
for section in reversed(sections):
if section['level'] < level:
return section['title']
return None
def _group_sections_semantically(self, sections: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
"""
Group sections by their semantic meaning.
Args:
sections: Parsed sections with semantic analysis
Returns:
Dictionary of semantic groups containing sections
"""
groups = {group_name: [] for group_name in self.SEMANTIC_GROUPS.keys()}
# Add an 'other' group for unclassified content
groups['other'] = []
for section in sections:
semantic_info = section.get('semantic_info', {})
best_group = semantic_info.get('best_group', 'other')
confidence = semantic_info.get('confidence', 0.0)
# Only place in semantic group if confidence is reasonable
if confidence > 0.2 and best_group in groups:
groups[best_group].append(section)
else:
groups['other'].append(section)
# Remove empty groups
return {k: v for k, v in groups.items() if v}
def _create_semantic_structure(
self,
output_dir: Path,
semantic_groups: Dict[str, List[Dict[str, Any]]],
options: ExplodeOptions
) -> List[Path]:
"""
Create the semantic directory structure from grouped sections.
Args:
output_dir: Output directory for the structure
semantic_groups: Sections grouped by semantic meaning
options: Explode options
Returns:
List of created file paths
"""
files_created = []
# Process groups in semantic order
group_order = sorted(
semantic_groups.keys(),
key=lambda g: self.SEMANTIC_GROUPS.get(g, {}).get('order', 999)
)
for group_name in group_order:
sections = semantic_groups[group_name]
if not sections:
continue
# Create group directory
group_dir = output_dir / group_name
group_dir.mkdir(exist_ok=True)
# Process sections in this group
for section in sections:
# Generate filename from title
safe_title = self._sanitize_filename(section['title'])
filename = f"{safe_title}.md"
# Avoid conflicts
file_path = group_dir / filename
counter = 1
while file_path.exists():
base_name = safe_title
filename = f"{base_name}_{counter}.md"
file_path = group_dir / filename
counter += 1
# Write section content
file_path.write_text(section['content'], encoding='utf-8')
files_created.append(file_path)
return files_created
def _sanitize_filename(self, title: str) -> str:
"""
Sanitize a title for use as a filename.
Args:
title: Original title
Returns:
Sanitized filename
"""
# Remove markdown heading markers
title = re.sub(r'^#+\s*', '', title)
# Remove special characters
safe_title = re.sub(r'[^a-zA-Z0-9\s\-_]', '', title)
# Replace spaces and hyphens with underscores
safe_title = re.sub(r'[\s\-]+', '_', safe_title)
# Convert to lowercase
safe_title = safe_title.lower()
# Remove leading/trailing underscores
safe_title = safe_title.strip('_')
# Limit length
if len(safe_title) > 50:
safe_title = safe_title[:50].rstrip('_')
return safe_title or 'untitled'
def _build_structure_entries(self, semantic_groups: Dict[str, List[Dict[str, Any]]]) -> List[StructureEntry]:
    """
    Build structure entries for manifest from semantic groups.

    Groups are visited in ascending ``order`` from ``SEMANTIC_GROUPS``
    (unknown groups last) and sections in their stored order, which is the
    same traversal ``_create_semantic_structure`` uses. Sections whose
    titles sanitize to the same file name receive the same ``_1``, ``_2``,
    ... suffixes that are applied when the files are written, so every
    entry refers to its own file.

    Args:
        semantic_groups: Sections grouped by semantic meaning

    Returns:
        List of structure entries with ``order`` numbered from 1 in
        traversal order
    """
    entries: List[StructureEntry] = []
    used_paths: Set[str] = set()

    group_order = sorted(
        semantic_groups.keys(),
        key=lambda g: self.SEMANTIC_GROUPS.get(g, {}).get('order', 999)
    )

    for group_name in group_order:
        for section in semantic_groups[group_name]:
            safe_title = self._sanitize_filename(section['title'])

            # Mirror the collision handling used when files are written.
            path = f"{group_name}/{safe_title}.md"
            counter = 1
            while path in used_paths:
                path = f"{group_name}/{safe_title}_{counter}.md"
                counter += 1
            used_paths.add(path)

            entries.append(StructureEntry(
                type=f"h{section['level']}",
                title=section['title'],
                path=path,
                order=len(entries) + 1,
                parent=section.get('parent'),
                level=section['level'],
                original_line=section.get('start_line')
            ))

    return entries
def _reconstruct_from_semantics(
self,
input_directory: Path,
manifest_data: Any,
options: ImplodeOptions
) -> Tuple[str, List[Path]]:
"""
Reconstruct markdown content from semantic directory structure.
Args:
input_directory: Directory containing semantic structure
manifest_data: Manifest data if available
options: Implode options
Returns:
Tuple of (reconstructed_content, files_processed)
"""
content_parts = []
files_processed = []
# Get all directories in semantic order (if possible from manifest)
if manifest_data and hasattr(manifest_data, 'structure'):
# Use manifest order
grouped_entries = {}
for entry in manifest_data.structure:
group = entry.path.split('/')[0] if '/' in entry.path else 'other'
if group not in grouped_entries:
grouped_entries[group] = []
grouped_entries[group].append(entry)
# Process in manifest order
for group_name in sorted(grouped_entries.keys(),
key=lambda g: self.SEMANTIC_GROUPS.get(g, {}).get('order', 999)):
entries = sorted(grouped_entries[group_name], key=lambda e: e.order)
for entry in entries:
file_path = input_directory / entry.path
if file_path.exists():
content = file_path.read_text(encoding='utf-8')
content_parts.append(content)
files_processed.append(file_path)
else:
# Fallback: process directories in semantic order
subdirs = [d for d in input_directory.iterdir() if d.is_dir()]
subdirs = sorted(subdirs,
key=lambda d: self.SEMANTIC_GROUPS.get(d.name, {}).get('order', 999))
for subdir in subdirs:
# Process markdown files in alphabetical order
md_files = sorted(subdir.glob("*.md"))
for md_file in md_files:
if md_file.name != "manifest.md":
content = md_file.read_text(encoding='utf-8')
content_parts.append(content)
files_processed.append(md_file)
# Join with appropriate spacing
spacing = '\n' * (options.section_spacing + 1)
full_content = spacing.join(content_parts)
return full_content, files_processed