""" Hierarchical variant implementation for explode-implode operations. This variant creates numbered directory structures with semantic hierarchy, making it easier to understand document organization at a glance. """ import re from pathlib import Path from typing import Dict, List, Any, Optional, Tuple from .base_variant import ( BaseVariant, ExplodeOptions, ImplodeOptions, ExplodeResult, ImplodeResult ) from .enums import ExplodeVariant from .manifest_manager import ManifestManager, StructureEntry from ..matter_frontmatter.parser import FrontmatterParser class HierarchicalVariant(BaseVariant): """ Hierarchical variant implementation. Creates numbered directory structures with nested organization. This provides clear document hierarchy and natural ordering. Structure example: book.mdd/ ├── manifest.md ├── 01_introduction/ │ ├── index.md │ ├── 01_overview.md │ └── 02_scope.md ├── 02_main_content/ │ ├── index.md │ ├── 01_chapter_one.md │ └── 02_chapter_two.md └── 03_conclusion/ └── index.md """ def __init__(self): """Initialize the hierarchical variant.""" super().__init__(ExplodeVariant.HIERARCHICAL) self.manifest_manager = ManifestManager() self.frontmatter_parser = FrontmatterParser() @property def name(self) -> str: """Human-readable name of the variant.""" return "Hierarchical Structure" @property def description(self) -> str: """Description of the variant's behavior.""" return ("Creates numbered directory structures with semantic hierarchy. " "Provides clear document organization and natural ordering.") def explode( self, input_file: Path, options: ExplodeOptions ) -> ExplodeResult: """ Explode a markdown file using the hierarchical structure variant. Args: input_file: Path to the markdown file to explode options: Options controlling the explode operation Returns: Result of the explode operation """ # Validate input validation_errors = self.validate_input_file(input_file) if validation_errors: return ExplodeResult( success=False, output_directory=options.output_dir or Path(), files_created=[], manifest_path=None, warnings=[], errors=validation_errors, variant_used=self.variant_type ) # Determine output directory if options.output_dir: output_dir = options.output_dir else: suffix = ".mdd" if options.create_manifest else "_exploded" output_dir = input_file.parent / f"{input_file.stem}{suffix}" # Create output directory creation_errors = self.create_output_directory(output_dir, overwrite=True) if creation_errors: return ExplodeResult( success=False, output_directory=output_dir, files_created=[], manifest_path=None, warnings=[], errors=creation_errors, variant_used=self.variant_type ) try: # Parse the markdown content content = input_file.read_text(encoding='utf-8') # Extract and save front matter if present and preservation is enabled files_created = [] if options.preserve_front_matter: frontmatter, content_without_fm = self.frontmatter_parser.separate_frontmatter_and_content(content) if frontmatter: # Save front matter to _frontmatter.yml import yaml fm_file = output_dir / "_frontmatter.yml" fm_content = yaml.dump(frontmatter, default_flow_style=False) fm_file.write_text(fm_content, encoding='utf-8') files_created.append(fm_file) # Use content without front matter for processing content = content_without_fm # Analyze document structure sections = self._parse_hierarchical_structure(content) # Create hierarchical directory structure hierarchy_files = self._create_hierarchical_structure( output_dir, sections, options ) # Create manifest if requested manifest_path = None if options.create_manifest: structure = self._build_structure_entries(sections) manifest_path = self.manifest_manager.create_manifest( output_dir=output_dir, original_file=input_file, variant=self.variant_type, structure=structure, preservation_options={ "front_matter": options.preserve_front_matter, "section_order": True, "heading_levels": True, "numbering_scheme": "hierarchical" } ) hierarchy_files.append(manifest_path) # Combine all created files all_files = files_created + hierarchy_files return ExplodeResult( success=True, output_directory=output_dir, files_created=all_files, manifest_path=manifest_path, warnings=[], errors=[], variant_used=self.variant_type ) except Exception as e: return ExplodeResult( success=False, output_directory=output_dir, files_created=[], manifest_path=None, warnings=[], errors=[f"Error during hierarchical explosion: {e}"], variant_used=self.variant_type ) def implode( self, input_directory: Path, options: ImplodeOptions ) -> ImplodeResult: """ Implode a hierarchical directory structure back into a markdown file. Args: input_directory: Path to the directory to implode options: Options controlling the implode operation Returns: Result of the implode operation """ # Validate input validation_errors = self.validate_input_directory(input_directory) if validation_errors: return ImplodeResult( success=False, output_file=options.output_file or Path(), files_processed=[], variant_detected=self.variant_type, warnings=[], errors=validation_errors ) # Determine output file if options.output_file: output_file = options.output_file else: output_file = input_directory.parent / f"{input_directory.name}_imploded.md" try: # Read manifest if available manifest_data = self.manifest_manager.read_manifest(input_directory) # Reconstruct content from hierarchical structure content, files_processed = self._reconstruct_from_hierarchy( input_directory, manifest_data, options ) # Add front matter if present and preservation is enabled if options.preserve_front_matter: fm_file = input_directory / '_frontmatter.yml' if fm_file.exists(): try: import yaml frontmatter_content = fm_file.read_text(encoding='utf-8').strip() content = f"---\n{frontmatter_content}\n---\n\n{content}" except Exception: pass # Ignore errors reading front matter # Write output file if not options.dry_run: output_file.write_text(content, encoding='utf-8') return ImplodeResult( success=True, output_file=output_file, files_processed=files_processed, variant_detected=self.variant_type, warnings=[], errors=[] ) except Exception as e: return ImplodeResult( success=False, output_file=output_file, files_processed=[], variant_detected=self.variant_type, warnings=[], errors=[f"Error during hierarchical implosion: {e}"] ) def can_handle_directory(self, directory: Path) -> bool: """ Check if this variant can handle the given directory structure. Args: directory: Path to the directory to check Returns: True if this variant can handle the directory """ if not directory.exists() or not directory.is_dir(): return False # Check for manifest indicating hierarchical variant manifest_data = self.manifest_manager.read_manifest(directory) if manifest_data and manifest_data.explosion_type == "hierarchical": return True # Check for hierarchical structure patterns subdirs = [d for d in directory.iterdir() if d.is_dir()] # Look for numbered prefixes (strong hierarchical indicator) numbered_dirs = sum(1 for d in subdirs if re.match(r'^\d+_', d.name)) # High ratio of numbered directories indicates hierarchical structure return (numbered_dirs / len(subdirs) if subdirs else 0) > 0.6 def get_detection_patterns(self) -> Dict[str, Any]: """ Get patterns used for auto-detecting this variant. Returns: Dictionary of detection patterns and weights """ return { "manifest_type": "hierarchical", "numbered_directory_ratio": {"min": 0.6, "weight": 0.8}, "index_file_count": {"min": 2, "weight": 0.5}, "max_depth": {"min": 2, "weight": 0.4}, "nested_numbered_dirs": {"weight": 0.7} } def _parse_hierarchical_structure(self, content: str) -> List[Dict[str, Any]]: """ Parse markdown content into hierarchical sections. Args: content: Markdown content to parse Returns: List of section dictionaries with hierarchy information """ sections = [] lines = content.split('\n') current_section = None current_content = [] section_counter = 1 for i, line in enumerate(lines): # Check for headings heading_match = re.match(r'^(#{1,6})\s+(.+)', line) if heading_match: # Save previous section if current_section: current_section['content'] = '\n'.join(current_content) current_section['end_line'] = i sections.append(current_section) # Start new section level = len(heading_match.group(1)) title = heading_match.group(2).strip() current_section = { 'level': level, 'title': title, 'start_line': i + 1, 'order': section_counter, 'parent': self._find_parent_section(sections, level), 'numbering': self._generate_numbering(sections, level, section_counter) } current_content = [line] section_counter += 1 else: if current_content: current_content.append(line) # Handle last section if current_section: current_section['content'] = '\n'.join(current_content) current_section['end_line'] = len(lines) sections.append(current_section) return sections def _find_parent_section(self, sections: List[Dict[str, Any]], level: int) -> Optional[str]: """ Find the parent section for the current heading level. Args: sections: Previously parsed sections level: Current heading level Returns: Parent section title or None """ # Look for the most recent section with a lower level for section in reversed(sections): if section['level'] < level: return section['title'] return None def _generate_numbering(self, sections: List[Dict[str, Any]], level: int, order: int) -> str: """ Generate hierarchical numbering for a section. Args: sections: Previously parsed sections level: Current heading level order: Overall section order Returns: Hierarchical numbering string (e.g., "01", "02_01", etc.) """ if level == 1: # Count h1 sections h1_count = sum(1 for s in sections if s['level'] == 1) + 1 return f"{h1_count:02d}" # Find parent numbering and append subsection number parent_title = self._find_parent_section(sections, level) if parent_title: parent_section = next((s for s in sections if s['title'] == parent_title), None) if parent_section: # Count subsections at this level under the same parent subsection_count = sum( 1 for s in sections if s['level'] == level and s.get('parent') == parent_title ) + 1 return f"{parent_section['numbering']}_{subsection_count:02d}" # Fallback numbering return f"{order:02d}" def _create_hierarchical_structure( self, output_dir: Path, sections: List[Dict[str, Any]], options: ExplodeOptions ) -> List[Path]: """ Create the hierarchical directory structure from parsed sections. Args: output_dir: Output directory for the structure sections: Parsed sections with hierarchy information options: Explode options Returns: List of created file paths """ files_created = [] for section in sections: # Generate directory name safe_title = self._sanitize_filename(section['title']) dir_name = f"{section['numbering']}_{safe_title}" # Create section directory section_dir = output_dir / dir_name section_dir.mkdir(exist_ok=True) # Create index.md for this section index_path = section_dir / "index.md" # Process content - extract subsections if any main_content, subsections = self._extract_subsections( section['content'], section['level'] ) # Write main content to index.md index_path.write_text(main_content, encoding='utf-8') files_created.append(index_path) # Create files for subsections for i, subsection in enumerate(subsections, 1): subsection_title = subsection.get('title', f'subsection_{i}') safe_sub_title = self._sanitize_filename(subsection_title) sub_file_name = f"{i:02d}_{safe_sub_title}.md" sub_file_path = section_dir / sub_file_name sub_file_path.write_text(subsection['content'], encoding='utf-8') files_created.append(sub_file_path) return files_created def _extract_subsections(self, content: str, parent_level: int) -> Tuple[str, List[Dict[str, Any]]]: """ Extract subsections from section content. Args: content: Section content parent_level: Level of the parent section Returns: Tuple of (main_content, subsections_list) """ lines = content.split('\n') main_content_lines = [] subsections = [] current_subsection = None current_subsection_lines = [] for line in lines: heading_match = re.match(r'^(#{1,6})\s+(.+)', line) if heading_match: level = len(heading_match.group(1)) title = heading_match.group(2).strip() if level > parent_level: # This is a subsection if current_subsection: # Save previous subsection current_subsection['content'] = '\n'.join(current_subsection_lines) subsections.append(current_subsection) # Start new subsection current_subsection = { 'level': level, 'title': title } current_subsection_lines = [line] elif level <= parent_level: # This is the main section heading or a peer section if level == parent_level: main_content_lines.append(line) else: # Higher-level heading that shouldn't be here in normal parsing main_content_lines.append(line) else: # Regular content line if current_subsection: current_subsection_lines.append(line) else: main_content_lines.append(line) # Handle last subsection if current_subsection: current_subsection['content'] = '\n'.join(current_subsection_lines) subsections.append(current_subsection) main_content = '\n'.join(main_content_lines) return main_content, subsections def _sanitize_filename(self, title: str) -> str: """ Sanitize a title for use as a filename/directory name. Args: title: Original title Returns: Sanitized filename """ # Remove special characters safe_title = re.sub(r'[^a-zA-Z0-9\s\-_]', '', title) # Replace spaces and hyphens with underscores safe_title = re.sub(r'[\s\-]+', '_', safe_title) # Convert to lowercase safe_title = safe_title.lower() # Remove leading/trailing underscores safe_title = safe_title.strip('_') # Limit length if len(safe_title) > 50: safe_title = safe_title[:50].rstrip('_') return safe_title or 'untitled' def _build_structure_entries(self, sections: List[Dict[str, Any]]) -> List[StructureEntry]: """ Build structure entries for manifest from parsed sections. Args: sections: Parsed sections Returns: List of structure entries """ entries = [] for section in sections: safe_title = self._sanitize_filename(section['title']) dir_name = f"{section['numbering']}_{safe_title}" path = f"{dir_name}/index.md" entry = StructureEntry( type=f"h{section['level']}", title=section['title'], path=path, order=section['order'], parent=section.get('parent'), level=section['level'], original_line=section.get('start_line') ) entries.append(entry) return entries def _reconstruct_from_hierarchy( self, input_directory: Path, manifest_data: Any, options: ImplodeOptions ) -> Tuple[str, List[Path]]: """ Reconstruct markdown content from hierarchical directory structure. Args: input_directory: Directory containing hierarchical structure manifest_data: Manifest data if available options: Implode options Returns: Tuple of (reconstructed_content, files_processed) """ content_parts = [] files_processed = [] # Get all directories and sort them properly if manifest_data and hasattr(manifest_data, 'structure'): # Use manifest data to determine proper order subdirs = [] dir_mapping = {} # Create mapping of directory names to Path objects all_dirs = [d for d in input_directory.iterdir() if d.is_dir() and not d.name.startswith('.')] for d in all_dirs: dir_mapping[d.name] = d # Sort manifest entries by original order for entry in sorted(manifest_data.structure, key=lambda x: x.order): dir_name = Path(entry.path).parts[0] if entry.path else "" if dir_name in dir_mapping and dir_mapping[dir_name] not in subdirs: subdirs.append(dir_mapping[dir_name]) # Add any remaining directories not in manifest (fallback) for d in all_dirs: if d not in subdirs: subdirs.append(d) else: # Fallback: sort by numbering prefix, then by name subdirs = sorted([ d for d in input_directory.iterdir() if d.is_dir() and not d.name.startswith('.') ], key=lambda d: ( int(d.name.split('_')[0]) if re.match(r'^\d+_', d.name) else 999, d.name )) for subdir in subdirs: self._process_directory_recursively(subdir, content_parts, files_processed) # Join with appropriate spacing spacing = '\n' * (options.section_spacing + 1) full_content = spacing.join(content_parts) return full_content, files_processed def _process_directory_recursively(self, directory: Path, content_parts: List[str], files_processed: List[Path]): """ Recursively process a directory and its subdirectories for hierarchical content. Args: directory: Directory to process content_parts: List to append content to files_processed: List to append processed files to """ # Read index.md if it exists index_file = directory / "index.md" if index_file.exists(): index_content = index_file.read_text(encoding='utf-8') content_parts.append(index_content) files_processed.append(index_file) # Read other markdown files in this directory md_files = sorted([ f for f in directory.glob("*.md") if f.name != "index.md" ], key=lambda f: f.name) for md_file in md_files: file_content = md_file.read_text(encoding='utf-8') content_parts.append(file_content) files_processed.append(md_file) # Recursively process subdirectories subdirs = sorted([ d for d in directory.iterdir() if d.is_dir() and not d.name.startswith('.') ], key=lambda d: ( int(d.name.split('_')[0]) if re.match(r'^\d+_', d.name) else 999, d.name )) for subdir in subdirs: self._process_directory_recursively(subdir, content_parts, files_processed)