# markitect-main/markitect/explode_variants/semantic_variant.py

"""
Semantic variant implementation for explode-implode operations.

This variant creates content-based directory groupings that reflect the
semantic structure of the document, organizing by meaning rather than order.
"""

import re
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple, Set

from .base_variant import (
    BaseVariant, ExplodeOptions, ImplodeOptions,
    ExplodeResult, ImplodeResult
)
from .enums import ExplodeVariant
from .manifest_manager import ManifestManager, StructureEntry
from ..matter_frontmatter.parser import FrontmatterParser


class SemanticVariant(BaseVariant):
    """
    Semantic variant implementation.

    Creates content-based directory groupings that organize content by
    semantic meaning rather than document order. Groups related content
    together based on keywords and content analysis.

    Structure example:
    book.mdd/
    ├── manifest.md
    ├── introduction/
    │   ├── overview.md
    │   ├── scope.md
    │   └── objectives.md
    ├── chapters/
    │   ├── fundamentals.md
    │   ├── advanced_topics.md
    │   └── case_studies.md
    ├── appendices/
    │   ├── references.md
    │   ├── glossary.md
    │   └── index.md
    └── conclusion/
        └── summary.md
    """

    # Semantic group definitions.
    #
    # Each entry maps a group name (also used as the output sub-directory name
    # when sections are written) to:
    #   'keywords' - plain strings tested with substring ``in`` checks against
    #                the lower-cased section title and content during scoring
    #   'patterns' - regular expressions searched case-insensitively in the
    #                combined title + content text
    #   'order'    - sort key used when group directories are written and when
    #                they are read back without a manifest (lower comes first)
    # NOTE(review): matching is substring-based, so short keywords such as
    # 'end' or 'api' also hit inside longer words (e.g. "depend", "rapid").
    SEMANTIC_GROUPS = {
        'introduction': {
            'keywords': ['introduction', 'overview', 'preface', 'foreword', 'abstract',
                        'summary', 'about', 'welcome', 'getting started'],
            'patterns': [r'intro', r'begin', r'start', r'overview'],
            'order': 1
        },
        'chapters': {
            'keywords': ['chapter', 'section', 'part', 'topic', 'lesson', 'content',
                        'main', 'core', 'body', 'details'],
            'patterns': [r'chapter\s*\d+', r'part\s*\d+', r'section\s*\d+'],
            'order': 2
        },
        'tutorials': {
            'keywords': ['tutorial', 'guide', 'howto', 'how-to', 'walkthrough',
                        'example', 'demo', 'practice', 'exercise'],
            'patterns': [r'tutorial', r'guide', r'how\s*to', r'step\s*by\s*step'],
            'order': 3
        },
        'reference': {
            'keywords': ['reference', 'api', 'documentation', 'spec', 'specification',
                        'manual', 'docs', 'command', 'function'],
            'patterns': [r'api', r'reference', r'spec', r'manual'],
            'order': 4
        },
        'appendices': {
            'keywords': ['appendix', 'appendices', 'glossary', 'index', 'bibliography',
                        'references', 'credits', 'acknowledgments', 'notes'],
            'patterns': [r'appendix', r'glossary', r'bibliography'],
            'order': 5
        },
        # 'summary' is listed under both 'introduction' and 'conclusion'.
        'conclusion': {
            'keywords': ['conclusion', 'summary', 'final', 'end', 'closing',
                        'wrap-up', 'takeaway', 'results', 'outcome'],
            'patterns': [r'conclusion', r'summary', r'final', r'end'],
            'order': 6
        }
    }

    def __init__(self) -> None:
        """
        Initialize the semantic variant.

        Registers this instance with the base class as
        ``ExplodeVariant.SEMANTIC`` and creates the two helpers the variant
        uses: a ``ManifestManager`` (manifest creation and reading) and a
        ``FrontmatterParser`` (separating front matter from content).
        """
        super().__init__(ExplodeVariant.SEMANTIC)
        self.manifest_manager = ManifestManager()
        self.frontmatter_parser = FrontmatterParser()

    @property
    def name(self) -> str:
        """Human-readable name of the variant."""
        return "Semantic Structure"

    @property
    def description(self) -> str:
        """Description of the variant's behavior."""
        return ("Creates content-based directory groupings that organize content by "
                "semantic meaning. Groups related content together based on keywords "
                "and content analysis.")

    def explode(
        self,
        input_file: Path,
        options: ExplodeOptions
    ) -> ExplodeResult:
        """
        Explode a markdown file using the semantic structure variant.

        The document is parsed into heading-delimited sections, each section
        is assigned to a semantic group, and every section is written to a
        file inside a directory named after its group. When front matter
        preservation is enabled and front matter is found, it is written to
        ``_frontmatter.yml`` in the output directory. A manifest is created
        when ``options.create_manifest`` is set.

        Args:
            input_file: Path to the markdown file to explode
            options: Options controlling the explode operation

        Returns:
            Result of the explode operation. Failures (validation, directory
            creation, or any exception during processing) are reported through
            ``success=False`` and ``errors`` rather than raised.
        """
        def failure(directory: Path, errors: List[str]) -> ExplodeResult:
            # Every failure path reports the same empty payload.
            return ExplodeResult(
                success=False,
                output_directory=directory,
                files_created=[],
                manifest_path=None,
                warnings=[],
                errors=errors,
                variant_used=self.variant_type
            )

        # Validate input
        validation_errors = self.validate_input_file(input_file)
        if validation_errors:
            return failure(options.output_dir or Path(), validation_errors)

        # Determine output directory
        if options.output_dir:
            output_dir = options.output_dir
        else:
            suffix = ".mdd" if options.create_manifest else "_exploded"
            output_dir = input_file.parent / f"{input_file.stem}{suffix}"

        # Create output directory
        creation_errors = self.create_output_directory(output_dir, overwrite=True)
        if creation_errors:
            return failure(output_dir, creation_errors)

        try:
            # Parse the markdown content
            content = input_file.read_text(encoding='utf-8')

            # Extract and save front matter if present and preservation is enabled
            files_created = []
            if options.preserve_front_matter:
                frontmatter, content_without_fm = (
                    self.frontmatter_parser.separate_frontmatter_and_content(content)
                )
                if frontmatter:
                    # Imported lazily so PyYAML is only needed when front
                    # matter actually has to be written.
                    import yaml
                    fm_file = output_dir / "_frontmatter.yml"
                    # sort_keys=False keeps the author's key order and
                    # allow_unicode=True keeps non-ASCII text readable; the
                    # defaults would alphabetize keys and escape such text,
                    # so the front matter would not round-trip unchanged.
                    # (sort_keys requires PyYAML >= 5.1.)
                    fm_content = yaml.dump(
                        frontmatter,
                        default_flow_style=False,
                        sort_keys=False,
                        allow_unicode=True
                    )
                    fm_file.write_text(fm_content, encoding='utf-8')
                    files_created.append(fm_file)
                    # Use content without front matter for processing
                    content = content_without_fm

            # Analyze document structure and classify sections semantically
            sections = self._parse_semantic_structure(content)

            # Group sections by semantic meaning
            semantic_groups = self._group_sections_semantically(sections)

            # Create semantic directory structure
            semantic_files = self._create_semantic_structure(
                output_dir, semantic_groups, options
            )

            # Create manifest if requested
            manifest_path = None
            if options.create_manifest:
                structure = self._build_structure_entries(semantic_groups)
                manifest_path = self.manifest_manager.create_manifest(
                    output_dir=output_dir,
                    original_file=input_file,
                    variant=self.variant_type,
                    structure=structure,
                    preservation_options={
                        "front_matter": options.preserve_front_matter,
                        "section_order": True,
                        "heading_levels": True,
                        "semantic_grouping": True
                    }
                )
                semantic_files.append(manifest_path)

            return ExplodeResult(
                success=True,
                output_directory=output_dir,
                # Front matter file (if any) first, then section files and manifest
                files_created=files_created + semantic_files,
                manifest_path=manifest_path,
                warnings=[],
                errors=[],
                variant_used=self.variant_type
            )

        except Exception as e:
            # Boundary handler: callers receive a result object, never an exception.
            return failure(output_dir, [f"Error during semantic explosion: {e}"])

    def implode(
        self,
        input_directory: Path,
        options: ImplodeOptions
    ) -> ImplodeResult:
        """
        Implode a semantic directory structure back into a markdown file.

        Section files are concatenated (in manifest order when a manifest is
        available). If front matter preservation is enabled and
        ``_frontmatter.yml`` exists, its text is placed between ``---``
        delimiters at the top of the output. Nothing is written when
        ``options.dry_run`` is set.

        Args:
            input_directory: Path to the directory to implode
            options: Options controlling the implode operation

        Returns:
            Result of the implode operation. Failures are reported through
            ``success=False`` and ``errors`` rather than raised; an unreadable
            front matter file is reported in ``warnings`` and does not fail
            the operation.
        """
        # Validate input
        validation_errors = self.validate_input_directory(input_directory)
        if validation_errors:
            return ImplodeResult(
                success=False,
                output_file=options.output_file or Path(),
                files_processed=[],
                variant_detected=self.variant_type,
                warnings=[],
                errors=validation_errors
            )

        # Determine output file
        if options.output_file:
            output_file = options.output_file
        else:
            output_file = input_directory.parent / f"{input_directory.name}_imploded.md"

        warnings: List[str] = []

        try:
            # Read manifest if available
            manifest_data = self.manifest_manager.read_manifest(input_directory)

            # Reconstruct content from semantic structure
            content, files_processed = self._reconstruct_from_semantics(
                input_directory, manifest_data, options
            )

            # Add front matter if present and preservation is enabled
            if options.preserve_front_matter:
                fm_file = input_directory / '_frontmatter.yml'
                if fm_file.exists():
                    try:
                        frontmatter_content = fm_file.read_text(encoding='utf-8').strip()
                    except (OSError, UnicodeError) as fm_error:
                        # Best effort: keep imploding the body, but tell the
                        # caller the front matter was dropped instead of
                        # discarding it silently.
                        warnings.append(
                            f"Could not read front matter from {fm_file.name}: {fm_error}"
                        )
                    else:
                        # The file's text is inserted verbatim; it is not re-parsed.
                        content = f"---\n{frontmatter_content}\n---\n\n{content}"

            # Write output file
            if not options.dry_run:
                output_file.write_text(content, encoding='utf-8')

            return ImplodeResult(
                success=True,
                output_file=output_file,
                files_processed=files_processed,
                variant_detected=self.variant_type,
                warnings=warnings,
                errors=[]
            )

        except Exception as e:
            # Boundary handler: callers receive a result object, never an exception.
            return ImplodeResult(
                success=False,
                output_file=output_file,
                files_processed=[],
                variant_detected=self.variant_type,
                warnings=warnings,
                errors=[f"Error during semantic implosion: {e}"]
            )

    def can_handle_directory(self, directory: Path) -> bool:
        """
        Check if this variant can handle the given directory structure.

        Args:
            directory: Path to the directory to check

        Returns:
            True if this variant can handle the directory
        """
        if not directory.exists() or not directory.is_dir():
            return False

        # Check for manifest indicating semantic variant
        manifest_data = self.manifest_manager.read_manifest(directory)
        if manifest_data and manifest_data.explosion_type == "semantic":
            return True

        # Check for semantic directory patterns
        subdirs = [d for d in directory.iterdir() if d.is_dir()]

        # Look for semantic directory names
        semantic_names = set()
        for group_name, group_data in self.SEMANTIC_GROUPS.items():
            semantic_names.update(group_data['keywords'])

        semantic_matches = 0
        for subdir in subdirs:
            dir_name_lower = subdir.name.lower()
            if any(keyword in dir_name_lower for keyword in semantic_names):
                semantic_matches += 1

        # High ratio of semantic directories indicates semantic structure
        return (semantic_matches / len(subdirs) if subdirs else 0) > 0.4

    def get_detection_patterns(self) -> Dict[str, Any]:
        """
        Get patterns used for auto-detecting this variant.

        Returns:
            Dictionary of detection patterns and weights
        """
        return {
            "manifest_type": "semantic",
            "semantic_directory_ratio": {"min": 0.4, "weight": 0.7},
            "keyword_matches": {"weight": 0.6},
            "numbered_directory_ratio": {"max": 0.2, "weight": 0.4},
            "semantic_patterns": {"weight": 0.8}
        }

    def _parse_semantic_structure(self, content: str) -> List[Dict[str, Any]]:
        """
        Parse markdown content into sections with semantic analysis.

        Args:
            content: Markdown content to parse

        Returns:
            List of section dictionaries with semantic information
        """
        sections = []
        lines = content.split('\n')
        current_section = None
        current_content = []
        section_counter = 1

        for i, line in enumerate(lines):
            # Check for headings
            heading_match = re.match(r'^(#{1,6})\s+(.+)', line)

            if heading_match:
                # Save previous section
                if current_section:
                    current_section['content'] = '\n'.join(current_content)
                    current_section['end_line'] = i
                    # Analyze semantic meaning
                    current_section['semantic_info'] = self._analyze_semantic_meaning(
                        current_section['title'],
                        current_section['content']
                    )
                    sections.append(current_section)

                # Start new section
                level = len(heading_match.group(1))
                title = heading_match.group(2).strip()

                current_section = {
                    'level': level,
                    'title': title,
                    'start_line': i + 1,
                    'order': section_counter,
                    'parent': self._find_parent_section(sections, level)
                }
                current_content = [line]
                section_counter += 1
            else:
                if current_content:
                    current_content.append(line)

        # Handle last section
        if current_section:
            current_section['content'] = '\n'.join(current_content)
            current_section['end_line'] = len(lines)
            current_section['semantic_info'] = self._analyze_semantic_meaning(
                current_section['title'],
                current_section['content']
            )
            sections.append(current_section)

        return sections

    def _analyze_semantic_meaning(self, title: str, content: str) -> Dict[str, Any]:
        """
        Analyze the semantic meaning of a section.

        Args:
            title: Section title
            content: Section content

        Returns:
            Dictionary with semantic analysis results
        """
        title_lower = title.lower()
        content_lower = content.lower()
        text_combined = f"{title_lower} {content_lower}"

        # Score against each semantic group
        group_scores = {}
        for group_name, group_data in self.SEMANTIC_GROUPS.items():
            score = 0.0

            # Check keyword matches
            for keyword in group_data['keywords']:
                if keyword in title_lower:
                    score += 2.0  # Title matches are weighted higher
                if keyword in content_lower:
                    score += 1.0

            # Check pattern matches
            for pattern in group_data['patterns']:
                if re.search(pattern, text_combined, re.IGNORECASE):
                    score += 1.5

            group_scores[group_name] = score

        # Find best matching group
        best_group = max(group_scores.keys(), key=lambda k: group_scores[k])
        best_score = group_scores[best_group]

        # Additional semantic features
        features = {
            'word_count': len(content.split()),
            'has_code_blocks': '```' in content,
            'has_lists': bool(re.search(r'^\s*[-*+]\s', content, re.MULTILINE)),
            'has_numbered_lists': bool(re.search(r'^\s*\d+\.\s', content, re.MULTILINE)),
            'heading_level_1_count': len(re.findall(r'^#\s', content, re.MULTILINE)),
            'heading_level_2_count': len(re.findall(r'^##\s', content, re.MULTILINE))
        }

        return {
            'best_group': best_group if best_score > 0 else 'chapters',  # Default fallback
            'confidence': min(best_score / 3.0, 1.0),  # Normalize to 0-1
            'group_scores': group_scores,
            'features': features
        }

    def _find_parent_section(self, sections: List[Dict[str, Any]], level: int) -> Optional[str]:
        """
        Find the parent section for the current heading level.

        Args:
            sections: Previously parsed sections
            level: Current heading level

        Returns:
            Parent section title or None
        """
        # Look for the most recent section with a lower level
        for section in reversed(sections):
            if section['level'] < level:
                return section['title']
        return None

    def _group_sections_semantically(self, sections: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
        """
        Group sections by their semantic meaning.

        Args:
            sections: Parsed sections with semantic analysis

        Returns:
            Dictionary of semantic groups containing sections
        """
        groups = {group_name: [] for group_name in self.SEMANTIC_GROUPS.keys()}

        # Add an 'other' group for unclassified content
        groups['other'] = []

        for section in sections:
            semantic_info = section.get('semantic_info', {})
            best_group = semantic_info.get('best_group', 'other')
            confidence = semantic_info.get('confidence', 0.0)

            # Only place in semantic group if confidence is reasonable
            if confidence > 0.2 and best_group in groups:
                groups[best_group].append(section)
            else:
                groups['other'].append(section)

        # Remove empty groups
        return {k: v for k, v in groups.items() if v}

    def _create_semantic_structure(
        self,
        output_dir: Path,
        semantic_groups: Dict[str, List[Dict[str, Any]]],
        options: ExplodeOptions
    ) -> List[Path]:
        """
        Create the semantic directory structure from grouped sections.

        Args:
            output_dir: Output directory for the structure
            semantic_groups: Sections grouped by semantic meaning
            options: Explode options

        Returns:
            List of created file paths
        """
        files_created = []

        # Process groups in semantic order
        group_order = sorted(
            semantic_groups.keys(),
            key=lambda g: self.SEMANTIC_GROUPS.get(g, {}).get('order', 999)
        )

        for group_name in group_order:
            sections = semantic_groups[group_name]
            if not sections:
                continue

            # Create group directory
            group_dir = output_dir / group_name
            group_dir.mkdir(exist_ok=True)

            # Process sections in this group
            for section in sections:
                # Generate filename from title
                safe_title = self._sanitize_filename(section['title'])
                filename = f"{safe_title}.md"

                # Avoid conflicts
                file_path = group_dir / filename
                counter = 1
                while file_path.exists():
                    base_name = safe_title
                    filename = f"{base_name}_{counter}.md"
                    file_path = group_dir / filename
                    counter += 1

                # Write section content
                file_path.write_text(section['content'], encoding='utf-8')
                files_created.append(file_path)

        return files_created

    def _sanitize_filename(self, title: str) -> str:
        """
        Sanitize a title for use as a filename.

        Args:
            title: Original title

        Returns:
            Sanitized filename
        """
        # Remove markdown heading markers
        title = re.sub(r'^#+\s*', '', title)

        # Remove special characters
        safe_title = re.sub(r'[^a-zA-Z0-9\s\-_]', '', title)

        # Replace spaces and hyphens with underscores
        safe_title = re.sub(r'[\s\-]+', '_', safe_title)

        # Convert to lowercase
        safe_title = safe_title.lower()

        # Remove leading/trailing underscores
        safe_title = safe_title.strip('_')

        # Limit length
        if len(safe_title) > 50:
            safe_title = safe_title[:50].rstrip('_')

        return safe_title or 'untitled'

    def _build_structure_entries(self, semantic_groups: Dict[str, List[Dict[str, Any]]]) -> List[StructureEntry]:
        """
        Build structure entries for manifest from semantic groups.

        Each entry's path is ``<group>/<sanitized title>.md``. When several
        sections of one group sanitize to the same title, later ones receive
        the same ``_1``, ``_2``, ... suffix that the file writer applies on a
        name collision, so every entry points at its own file instead of all
        duplicates sharing the first file's path.

        The input section dictionaries are not modified.

        Args:
            semantic_groups: Sections grouped by semantic meaning

        Returns:
            List of structure entries sorted by original document order
        """
        # First pass: assign filenames group by group, in each group's list
        # order, because that is the order in which files are written and
        # collision suffixes are handed out.
        # NOTE: this assumes the group directories start out empty; files
        # already present on disk are not visible here.
        placed: List[Tuple[str, Dict[str, Any]]] = []
        for group_name, sections in semantic_groups.items():
            taken: Set[str] = set()
            for section in sections:
                safe_title = self._sanitize_filename(section['title'])
                filename = f"{safe_title}.md"
                counter = 1
                while filename in taken:
                    filename = f"{safe_title}_{counter}.md"
                    counter += 1
                taken.add(filename)
                placed.append((f"{group_name}/{filename}", section))

        # Second pass: restore original document order (the 'order' field
        # assigned during parsing); the sort is stable for equal values.
        placed.sort(key=lambda item: item[1].get('order', 0))

        return [
            StructureEntry(
                type=f"h{section['level']}",
                title=section['title'],
                path=path,
                order=section.get('order', 0),  # Use original document order
                parent=section.get('parent'),
                level=section['level'],
                original_line=section.get('start_line')
            )
            for path, section in placed
        ]

    def _reconstruct_from_semantics(
        self,
        input_directory: Path,
        manifest_data: Any,
        options: ImplodeOptions
    ) -> Tuple[str, List[Path]]:
        """
        Reconstruct markdown content from semantic directory structure.

        Args:
            input_directory: Directory containing semantic structure
            manifest_data: Manifest data if available
            options: Implode options

        Returns:
            Tuple of (reconstructed_content, files_processed)
        """
        content_parts = []
        files_processed = []

        # Get all directories and files and use manifest order to preserve original structure
        if manifest_data and hasattr(manifest_data, 'structure'):
            # Use manifest data to reconstruct in original document order
            for entry in sorted(manifest_data.structure, key=lambda x: x.order):
                file_path = input_directory / entry.path
                if file_path.exists() and file_path.name != "manifest.md":
                    content = file_path.read_text(encoding='utf-8')
                    content_parts.append(content)
                    files_processed.append(file_path)
        else:
            # Fallback: process directories in semantic order
            subdirs = [d for d in input_directory.iterdir() if d.is_dir()]
            subdirs = sorted(subdirs,
                           key=lambda d: self.SEMANTIC_GROUPS.get(d.name, {}).get('order', 999))

            for subdir in subdirs:
                # Process markdown files in alphabetical order
                md_files = sorted(subdir.glob("*.md"))

                for md_file in md_files:
                    if md_file.name != "manifest.md":
                        content = md_file.read_text(encoding='utf-8')
                        content_parts.append(content)
                        files_processed.append(md_file)

        # Join with appropriate spacing
        spacing = '\n' * (options.section_spacing + 1)
        full_content = spacing.join(content_parts)

        return full_content, files_processed