"""
Semantic variant implementation for explode-implode operations.

This variant creates content-based directory groupings that reflect the
semantic structure of the document, organizing by meaning rather than order.
"""

import re
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple, Set

from .base_variant import (
    BaseVariant,
    ExplodeOptions,
    ImplodeOptions,
    ExplodeResult,
    ImplodeResult
)
from .enums import ExplodeVariant
from .manifest_manager import ManifestManager, StructureEntry
from ..matter_frontmatter.parser import FrontmatterParser


class SemanticVariant(BaseVariant):
    """
    Semantic variant implementation.

    Creates content-based directory groupings that organize content by
    semantic meaning rather than document order. Groups related content
    together based on keywords and content analysis.

    Structure example:
        book.mdd/
        ├── manifest.md
        ├── introduction/
        │   ├── overview.md
        │   ├── scope.md
        │   └── objectives.md
        ├── chapters/
        │   ├── fundamentals.md
        │   ├── advanced_topics.md
        │   └── case_studies.md
        ├── appendices/
        │   ├── references.md
        │   ├── glossary.md
        │   └── index.md
        └── conclusion/
            └── summary.md
    """

    # Semantic group definitions. For each group:
    #   'keywords' are matched as plain substrings of the lower-cased text,
    #   'patterns' are regular expressions searched case-insensitively,
    #   'order' is the position used when sorting group directories.
    SEMANTIC_GROUPS = {
        'introduction': {
            'keywords': ['introduction', 'overview', 'preface', 'foreword', 'abstract',
                         'summary', 'about', 'welcome', 'getting started'],
            'patterns': [r'intro', r'begin', r'start', r'overview'],
            'order': 1
        },
        'chapters': {
            'keywords': ['chapter', 'section', 'part', 'topic', 'lesson',
                         'content', 'main', 'core', 'body', 'details'],
            'patterns': [r'chapter\s*\d+', r'part\s*\d+', r'section\s*\d+'],
            'order': 2
        },
        'tutorials': {
            'keywords': ['tutorial', 'guide', 'howto', 'how-to', 'walkthrough',
                         'example', 'demo', 'practice', 'exercise'],
            'patterns': [r'tutorial', r'guide', r'how\s*to', r'step\s*by\s*step'],
            'order': 3
        },
        'reference': {
            'keywords': ['reference', 'api', 'documentation', 'spec', 'specification',
                         'manual', 'docs', 'command', 'function'],
            'patterns': [r'api', r'reference', r'spec', r'manual'],
            'order': 4
        },
        'appendices': {
            'keywords': ['appendix', 'appendices', 'glossary', 'index', 'bibliography',
                         'references', 'credits', 'acknowledgments', 'notes'],
            'patterns': [r'appendix', r'glossary', r'bibliography'],
            'order': 5
        },
        'conclusion': {
            'keywords': ['conclusion', 'summary', 'final', 'end', 'closing',
                         'wrap-up', 'takeaway', 'results', 'outcome'],
            'patterns': [r'conclusion', r'summary', r'final', r'end'],
            'order': 6
        }
    }

    # ATX heading: one to six '#' characters, whitespace, then the title.
    _HEADING_RE = re.compile(r'^(#{1,6})\s+(.+)')
    # Opening/closing marker of a fenced code block (``` or ~~~, up to three
    # leading spaces). Headings are not recognised between such markers.
    _FENCE_RE = re.compile(r'^ {0,3}(`{3,}|~{3,})')

    def __init__(self):
        """Initialize the semantic variant."""
        super().__init__(ExplodeVariant.SEMANTIC)
        self.manifest_manager = ManifestManager()
        self.frontmatter_parser = FrontmatterParser()

    @property
    def name(self) -> str:
        """Human-readable name of the variant."""
        return "Semantic Structure"

    @property
    def description(self) -> str:
        """Description of the variant's behavior."""
        return ("Creates content-based directory groupings that organize content by "
                "semantic meaning. Groups related content together based on keywords "
                "and content analysis.")

    def explode(
        self,
        input_file: Path,
        options: ExplodeOptions
    ) -> ExplodeResult:
        """
        Explode a markdown file using the semantic structure variant.

        Args:
            input_file: Path to the markdown file to explode
            options: Options controlling the explode operation

        Returns:
            Result of the explode operation. Failures are reported through
            ``success=False`` and ``errors`` rather than raised. Text that
            precedes the first heading is not written to any section file;
            when such text exists a message is added to ``warnings``.
        """
        # Validate input
        validation_errors = self.validate_input_file(input_file)
        if validation_errors:
            return ExplodeResult(
                success=False,
                output_directory=options.output_dir or Path(),
                files_created=[],
                manifest_path=None,
                warnings=[],
                errors=validation_errors,
                variant_used=self.variant_type
            )

        # Determine output directory
        if options.output_dir:
            output_dir = options.output_dir
        else:
            suffix = ".mdd" if options.create_manifest else "_exploded"
            output_dir = input_file.parent / f"{input_file.stem}{suffix}"

        # Create output directory
        creation_errors = self.create_output_directory(output_dir, overwrite=True)
        if creation_errors:
            return ExplodeResult(
                success=False,
                output_directory=output_dir,
                files_created=[],
                manifest_path=None,
                warnings=[],
                errors=creation_errors,
                variant_used=self.variant_type
            )

        try:
            warnings: List[str] = []

            # Parse the markdown content
            content = input_file.read_text(encoding='utf-8')

            # Extract and save front matter if present and preservation is enabled
            files_created = []
            if options.preserve_front_matter:
                frontmatter, content_without_fm = (
                    self.frontmatter_parser.separate_frontmatter_and_content(content)
                )
                if frontmatter:
                    # Save front matter to _frontmatter.yml
                    import yaml
                    fm_file = output_dir / "_frontmatter.yml"
                    fm_content = yaml.dump(frontmatter, default_flow_style=False)
                    fm_file.write_text(fm_content, encoding='utf-8')
                    files_created.append(fm_file)
                    # Use content without front matter for processing
                    content = content_without_fm

            # Analyze document structure and classify sections semantically
            sections = self._parse_semantic_structure(content)

            # The parser only collects text from the first heading onwards.
            # Tell the caller when that leaves non-blank text behind instead
            # of dropping it silently.
            if sections:
                leading_lines = content.split('\n')[:sections[0]['start_line'] - 1]
                leading_text = '\n'.join(leading_lines)
            else:
                leading_text = content
            if leading_text.strip():
                warnings.append(
                    "Content before the first heading is not part of any section "
                    "and was not written to the exploded structure"
                )

            # Group sections by semantic meaning
            semantic_groups = self._group_sections_semantically(sections)

            # Create semantic directory structure
            semantic_files = self._create_semantic_structure(
                output_dir, semantic_groups, options
            )

            # Create manifest if requested
            manifest_path = None
            if options.create_manifest:
                structure = self._build_structure_entries(semantic_groups)
                manifest_path = self.manifest_manager.create_manifest(
                    output_dir=output_dir,
                    original_file=input_file,
                    variant=self.variant_type,
                    structure=structure,
                    preservation_options={
                        "front_matter": options.preserve_front_matter,
                        "section_order": True,
                        "heading_levels": True,
                        "semantic_grouping": True
                    }
                )
                semantic_files.append(manifest_path)

            # Combine all created files
            all_files = files_created + semantic_files

            return ExplodeResult(
                success=True,
                output_directory=output_dir,
                files_created=all_files,
                manifest_path=manifest_path,
                warnings=warnings,
                errors=[],
                variant_used=self.variant_type
            )

        except Exception as e:
            # Top-level boundary: convert any failure into an error result.
            return ExplodeResult(
                success=False,
                output_directory=output_dir,
                files_created=[],
                manifest_path=None,
                warnings=[],
                errors=[f"Error during semantic explosion: {e}"],
                variant_used=self.variant_type
            )

    def implode(
        self,
        input_directory: Path,
        options: ImplodeOptions
    ) -> ImplodeResult:
        """
        Implode a semantic directory structure back into a markdown file.

        Args:
            input_directory: Path to the directory to implode
            options: Options controlling the implode operation

        Returns:
            Result of the implode operation. Failures are reported through
            ``success=False`` and ``errors`` rather than raised. An unreadable
            ``_frontmatter.yml`` does not fail the operation; it is skipped
            and reported in ``warnings``.
        """
        # Validate input
        validation_errors = self.validate_input_directory(input_directory)
        if validation_errors:
            return ImplodeResult(
                success=False,
                output_file=options.output_file or Path(),
                files_processed=[],
                variant_detected=self.variant_type,
                warnings=[],
                errors=validation_errors
            )

        # Determine output file
        if options.output_file:
            output_file = options.output_file
        else:
            output_file = input_directory.parent / f"{input_directory.name}_imploded.md"

        try:
            warnings: List[str] = []

            # Read manifest if available
            manifest_data = self.manifest_manager.read_manifest(input_directory)

            # Reconstruct content from semantic structure
            content, files_processed = self._reconstruct_from_semantics(
                input_directory, manifest_data, options
            )

            # Add front matter if present and preservation is enabled
            if options.preserve_front_matter:
                fm_file = input_directory / '_frontmatter.yml'
                if fm_file.exists():
                    try:
                        frontmatter_content = fm_file.read_text(encoding='utf-8').strip()
                    except (OSError, UnicodeError) as exc:
                        # Best effort: keep imploding, but make the omission visible.
                        warnings.append(
                            f"Could not read front matter file {fm_file}: {exc}"
                        )
                    else:
                        content = f"---\n{frontmatter_content}\n---\n\n{content}"

            # Write output file
            if not options.dry_run:
                output_file.write_text(content, encoding='utf-8')

            return ImplodeResult(
                success=True,
                output_file=output_file,
                files_processed=files_processed,
                variant_detected=self.variant_type,
                warnings=warnings,
                errors=[]
            )

        except Exception as e:
            # Top-level boundary: convert any failure into an error result.
            return ImplodeResult(
                success=False,
                output_file=output_file,
                files_processed=[],
                variant_detected=self.variant_type,
                warnings=[],
                errors=[f"Error during semantic implosion: {e}"]
            )

    def can_handle_directory(self, directory: Path) -> bool:
        """
        Check if this variant can handle the given directory structure.

        Args:
            directory: Path to the directory to check

        Returns:
            True if the manifest declares the "semantic" explosion type, or
            if more than 40% of the immediate subdirectories have a name
            containing one of the semantic group keywords
        """
        if not directory.exists() or not directory.is_dir():
            return False

        # Check for manifest indicating semantic variant
        manifest_data = self.manifest_manager.read_manifest(directory)
        if manifest_data and manifest_data.explosion_type == "semantic":
            return True

        # Check for semantic directory patterns
        subdirs = [d for d in directory.iterdir() if d.is_dir()]
        if not subdirs:
            return False

        # Look for semantic directory names
        semantic_names = set()
        for group_data in self.SEMANTIC_GROUPS.values():
            semantic_names.update(group_data['keywords'])

        semantic_matches = 0
        for subdir in subdirs:
            dir_name_lower = subdir.name.lower()
            if any(keyword in dir_name_lower for keyword in semantic_names):
                semantic_matches += 1

        # High ratio of semantic directories indicates semantic structure
        return semantic_matches / len(subdirs) > 0.4

    def get_detection_patterns(self) -> Dict[str, Any]:
        """
        Get patterns used for auto-detecting this variant.

        Returns:
            Dictionary of detection patterns and weights
        """
        return {
            "manifest_type": "semantic",
            "semantic_directory_ratio": {"min": 0.4, "weight": 0.7},
            "keyword_matches": {"weight": 0.6},
            "numbered_directory_ratio": {"max": 0.2, "weight": 0.4},
            "semantic_patterns": {"weight": 0.8}
        }

    def _parse_semantic_structure(self, content: str) -> List[Dict[str, Any]]:
        """
        Parse markdown content into sections with semantic analysis.

        A section starts at an ATX heading and runs up to the next heading.
        Lines that look like headings inside a fenced code block (``` or ~~~)
        stay part of the surrounding section. Text before the first heading
        is not collected into any section.

        Args:
            content: Markdown content to parse

        Returns:
            List of section dictionaries with the keys 'level', 'title',
            'start_line', 'end_line', 'order', 'parent', 'content' and
            'semantic_info'
        """
        sections: List[Dict[str, Any]] = []
        lines = content.split('\n')

        current_section: Optional[Dict[str, Any]] = None
        current_content: List[str] = []
        section_counter = 1
        open_fence: Optional[str] = None  # marker of the code fence we are inside, if any

        for i, line in enumerate(lines):
            # Track fenced code blocks so '# comment' lines inside them are
            # not mistaken for headings.
            fence_match = self._FENCE_RE.match(line)
            if fence_match:
                marker = fence_match.group(1)
                if open_fence is None:
                    open_fence = marker
                elif (marker[0] == open_fence[0]
                      and len(marker) >= len(open_fence)
                      and not line.strip().strip(marker[0])):
                    # A closing fence uses the same character, is at least as
                    # long as the opener, and carries no other text.
                    open_fence = None

            heading_match = self._HEADING_RE.match(line) if open_fence is None else None

            if heading_match:
                # Save previous section
                if current_section is not None:
                    sections.append(
                        self._finalize_section(current_section, current_content, i)
                    )

                # Start new section
                level = len(heading_match.group(1))
                title = heading_match.group(2).strip()

                current_section = {
                    'level': level,
                    'title': title,
                    'start_line': i + 1,
                    'order': section_counter,
                    'parent': self._find_parent_section(sections, level)
                }
                # The heading line itself is the first line of the section body.
                current_content = [line]
                section_counter += 1
            elif current_section is not None:
                current_content.append(line)

        # Handle last section
        if current_section is not None:
            sections.append(
                self._finalize_section(current_section, current_content, len(lines))
            )

        return sections

    def _finalize_section(
        self,
        section: Dict[str, Any],
        content_lines: List[str],
        end_line: int
    ) -> Dict[str, Any]:
        """Attach the joined content, end line and semantic analysis to a section."""
        section['content'] = '\n'.join(content_lines)
        section['end_line'] = end_line
        section['semantic_info'] = self._analyze_semantic_meaning(
            section['title'], section['content']
        )
        return section

    def _analyze_semantic_meaning(self, title: str, content: str) -> Dict[str, Any]:
        """
        Analyze the semantic meaning of a section.

        Args:
            title: Section title
            content: Section content

        Returns:
            Dictionary with the keys 'best_group', 'confidence',
            'group_scores' and 'features'
        """
        title_lower = title.lower()
        content_lower = content.lower()
        text_combined = f"{title_lower} {content_lower}"

        # Score against each semantic group
        group_scores = {}
        for group_name, group_data in self.SEMANTIC_GROUPS.items():
            score = 0.0

            # Check keyword matches
            for keyword in group_data['keywords']:
                if keyword in title_lower:
                    score += 2.0  # Title matches are weighted higher
                if keyword in content_lower:
                    score += 1.0

            # Check pattern matches
            for pattern in group_data['patterns']:
                if re.search(pattern, text_combined, re.IGNORECASE):
                    score += 1.5

            group_scores[group_name] = score

        # Find best matching group (ties go to the group defined first)
        best_group = max(group_scores, key=group_scores.get)
        best_score = group_scores[best_group]

        # Additional semantic features
        features = {
            'word_count': len(content.split()),
            'has_code_blocks': '```' in content,
            'has_lists': bool(re.search(r'^\s*[-*+]\s', content, re.MULTILINE)),
            'has_numbered_lists': bool(re.search(r'^\s*\d+\.\s', content, re.MULTILINE)),
            'heading_level_1_count': len(re.findall(r'^#\s', content, re.MULTILINE)),
            'heading_level_2_count': len(re.findall(r'^##\s', content, re.MULTILINE))
        }

        return {
            'best_group': best_group if best_score > 0 else 'chapters',  # Default fallback
            'confidence': min(best_score / 3.0, 1.0),  # Normalize to 0-1
            'group_scores': group_scores,
            'features': features
        }

    def _find_parent_section(self, sections: List[Dict[str, Any]], level: int) -> Optional[str]:
        """
        Find the parent section for the current heading level.

        Args:
            sections: Previously parsed sections
            level: Current heading level

        Returns:
            Parent section title or None
        """
        # Look for the most recent section with a lower level
        for section in reversed(sections):
            if section['level'] < level:
                return section['title']
        return None

    def _group_sections_semantically(
        self,
        sections: List[Dict[str, Any]]
    ) -> Dict[str, List[Dict[str, Any]]]:
        """
        Group sections by their semantic meaning.

        Args:
            sections: Parsed sections with semantic analysis

        Returns:
            Dictionary of semantic groups containing sections; groups without
            any section are omitted
        """
        groups: Dict[str, List[Dict[str, Any]]] = {
            group_name: [] for group_name in self.SEMANTIC_GROUPS
        }

        # Add an 'other' group for unclassified content
        groups['other'] = []

        for section in sections:
            semantic_info = section.get('semantic_info', {})
            best_group = semantic_info.get('best_group', 'other')
            confidence = semantic_info.get('confidence', 0.0)

            # Only place in semantic group if confidence is reasonable
            if confidence > 0.2 and best_group in groups:
                groups[best_group].append(section)
            else:
                groups['other'].append(section)

        # Remove empty groups
        return {k: v for k, v in groups.items() if v}

    def _create_semantic_structure(
        self,
        output_dir: Path,
        semantic_groups: Dict[str, List[Dict[str, Any]]],
        options: ExplodeOptions
    ) -> List[Path]:
        """
        Create the semantic directory structure from grouped sections.

        Each section is written to ``<group>/<sanitized title>.md``; when that
        file already exists a numeric suffix is appended. The path actually
        used is stored on the section under 'relative_path' so the manifest
        can reference the real file.

        Args:
            output_dir: Output directory for the structure
            semantic_groups: Sections grouped by semantic meaning
            options: Explode options

        Returns:
            List of created file paths
        """
        files_created = []

        # Process groups in semantic order
        group_order = sorted(
            semantic_groups.keys(),
            key=lambda g: self.SEMANTIC_GROUPS.get(g, {}).get('order', 999)
        )

        for group_name in group_order:
            sections = semantic_groups[group_name]
            if not sections:
                continue

            # Create group directory
            group_dir = output_dir / group_name
            group_dir.mkdir(exist_ok=True)

            # Process sections in this group
            for section in sections:
                # Generate filename from title
                safe_title = self._sanitize_filename(section['title'])
                file_path = group_dir / f"{safe_title}.md"

                # Avoid conflicts
                counter = 1
                while file_path.exists():
                    file_path = group_dir / f"{safe_title}_{counter}.md"
                    counter += 1

                # Write section content
                file_path.write_text(section['content'], encoding='utf-8')
                files_created.append(file_path)

                # Remember where the section really went (the name may carry
                # a conflict suffix).
                section['relative_path'] = f"{group_name}/{file_path.name}"

        return files_created

    def _sanitize_filename(self, title: str) -> str:
        """
        Sanitize a title for use as a filename.

        Args:
            title: Original title

        Returns:
            Lower-case name of at most 50 characters made of ASCII letters,
            digits and underscores, or 'untitled' when nothing remains
        """
        # Remove markdown heading markers
        title = re.sub(r'^#+\s*', '', title)

        # Remove special characters
        safe_title = re.sub(r'[^a-zA-Z0-9\s\-_]', '', title)

        # Replace spaces and hyphens with underscores
        safe_title = re.sub(r'[\s\-]+', '_', safe_title)

        # Convert to lowercase
        safe_title = safe_title.lower()

        # Remove leading/trailing underscores
        safe_title = safe_title.strip('_')

        # Limit length
        if len(safe_title) > 50:
            safe_title = safe_title[:50].rstrip('_')

        return safe_title or 'untitled'

    def _build_structure_entries(
        self,
        semantic_groups: Dict[str, List[Dict[str, Any]]]
    ) -> List[StructureEntry]:
        """
        Build structure entries for manifest from semantic groups.

        Every entry gets a distinct path. The path recorded on the section
        when its file was written ('relative_path') is preferred; otherwise
        the path is derived from the title and suffixed with ``_<n>`` when it
        repeats within a group.

        Args:
            semantic_groups: Sections grouped by semantic meaning

        Returns:
            List of structure entries in original document order
        """
        entries = []

        # Collect all sections from all groups and sort by original document order
        all_sections = []
        for group_name, sections in semantic_groups.items():
            for section in sections:
                section['group_name'] = group_name
                all_sections.append(section)

        # Sort by original document order (using the 'order' field from parsing)
        all_sections.sort(key=lambda s: s.get('order', 0))

        used_paths: Set[str] = set()

        # Create structure entries preserving original document order
        for section in all_sections:
            path = section.get('relative_path')
            if not path:
                # No recorded file path: derive one and keep it unique so two
                # same-titled sections never share a manifest entry path.
                safe_title = self._sanitize_filename(section['title'])
                path = f"{section['group_name']}/{safe_title}.md"
                counter = 1
                while path in used_paths:
                    path = f"{section['group_name']}/{safe_title}_{counter}.md"
                    counter += 1
            used_paths.add(path)

            entry = StructureEntry(
                type=f"h{section['level']}",
                title=section['title'],
                path=path,
                order=section.get('order', 0),  # Use original document order
                parent=section.get('parent'),
                level=section['level'],
                original_line=section.get('start_line')
            )
            entries.append(entry)

        return entries

    def _reconstruct_from_semantics(
        self,
        input_directory: Path,
        manifest_data: Any,
        options: ImplodeOptions
    ) -> Tuple[str, List[Path]]:
        """
        Reconstruct markdown content from semantic directory structure.

        Args:
            input_directory: Directory containing semantic structure
            manifest_data: Manifest data if available
            options: Implode options

        Returns:
            Tuple of (reconstructed_content, files_processed)
        """
        content_parts = []
        files_processed = []

        if manifest_data and hasattr(manifest_data, 'structure'):
            # Use manifest data to reconstruct in original document order
            for entry in sorted(manifest_data.structure, key=lambda x: x.order):
                file_path = input_directory / entry.path
                # Entries whose file is missing are skipped.
                if file_path.exists() and file_path.name != "manifest.md":
                    content_parts.append(file_path.read_text(encoding='utf-8'))
                    files_processed.append(file_path)
        else:
            # Fallback: process directories in semantic order
            subdirs = sorted(
                (d for d in input_directory.iterdir() if d.is_dir()),
                key=lambda d: self.SEMANTIC_GROUPS.get(d.name, {}).get('order', 999)
            )

            for subdir in subdirs:
                # Process markdown files in alphabetical order
                for md_file in sorted(subdir.glob("*.md")):
                    if md_file.name != "manifest.md":
                        content_parts.append(md_file.read_text(encoding='utf-8'))
                        files_processed.append(md_file)

        # Join with appropriate spacing
        spacing = '\n' * (options.section_spacing + 1)
        full_content = spacing.join(content_parts)

        return full_content, files_processed