markitect-main/markitect/explode_variants/variant_detector.py

"""
Variant detection utilities for auto-detecting explode variants.

This module analyzes directory structures to determine which variant was
used during explosion, enabling automatic implode operations.
"""

import re
from pathlib import Path
from typing import Dict, List, Tuple, Optional
from dataclasses import dataclass

from .enums import ExplodeVariant, DetectionConfidence
from .manifest_manager import ManifestManager, ManifestData


@dataclass
class DetectionResult:
    """
    Result of variant detection analysis.

    Instances are produced by the individual detection strategies of
    VariantDetector and by its combined detect_variant entry point.
    """

    # Detected variant, or None when no variant could be determined
    variant: Optional[ExplodeVariant]
    # Qualitative confidence level attached to this detection
    confidence: DetectionConfidence
    # Numeric strength of the detection; the strategies in this module
    # produce values between 0.0 and 1.0
    score: float
    # Human-readable notes describing what the detection was based on
    evidence: List[str]
    # True when this result was built from a manifest that could be read
    manifest_found: bool
    # Manifest contents when one was read; None otherwise
    manifest_data: Optional[ManifestData] = None


class VariantDetector:
    """
    Detects explode variants from directory structures.

    Uses multiple detection strategies:
    1. Manifest file analysis (highest confidence)
    2. Directory naming pattern recognition
    3. Semantic directory structure analysis
    4. File organization heuristics
    """

    def __init__(self) -> None:
        """Initialize the variant detector with its own ManifestManager."""
        # Used by _detect_from_manifest to read a directory's manifest
        self.manifest_manager = ManifestManager()

    def detect_variant(self, directory: Path) -> DetectionResult:
        """
        Detect the explode variant used for a directory structure.

        Args:
            directory: Path to the exploded directory to analyze

        Returns:
            Detection result with variant, confidence, and evidence
        """
        if not directory.exists() or not directory.is_dir():
            return DetectionResult(
                variant=None,
                confidence=DetectionConfidence.UNKNOWN,
                score=0.0,
                evidence=["Directory does not exist or is not a directory"],
                manifest_found=False
            )

        # Strategy 1: Check for manifest file (highest priority)
        manifest_result = self._detect_from_manifest(directory)
        if manifest_result.manifest_found and manifest_result.variant:
            return manifest_result

        # Strategy 2: Pattern-based detection
        pattern_result = self._detect_from_patterns(directory)

        # Strategy 3: Semantic analysis
        semantic_result = self._detect_from_semantics(directory)

        # Combine results and return best match
        return self._combine_detection_results([
            manifest_result,
            pattern_result,
            semantic_result
        ])

    def _detect_from_manifest(self, directory: Path) -> DetectionResult:
        """
        Detect variant from manifest file.

        Args:
            directory: Directory to check for manifest

        Returns:
            Detection result based on manifest analysis
        """
        manifest_data = self.manifest_manager.read_manifest(directory)

        if not manifest_data:
            return DetectionResult(
                variant=None,
                confidence=DetectionConfidence.UNKNOWN,
                score=0.0,
                evidence=["No manifest.md file found"],
                manifest_found=False
            )

        try:
            variant = ExplodeVariant(manifest_data.explosion_type)
            return DetectionResult(
                variant=variant,
                confidence=DetectionConfidence.HIGH,
                score=1.0,
                evidence=[f"Manifest indicates {variant.value} variant"],
                manifest_found=True,
                manifest_data=manifest_data
            )
        except ValueError:
            return DetectionResult(
                variant=None,
                confidence=DetectionConfidence.LOW,
                score=0.1,
                evidence=[f"Invalid variant in manifest: {manifest_data.explosion_type}"],
                manifest_found=True,
                manifest_data=manifest_data
            )

    def _detect_from_patterns(self, directory: Path) -> DetectionResult:
        """
        Detect variant from directory naming patterns.

        Args:
            directory: Directory to analyze

        Returns:
            Detection result based on naming patterns
        """
        subdirs = [d for d in directory.iterdir() if d.is_dir()]
        evidence = []
        scores = {variant: 0.0 for variant in ExplodeVariant}

        # Count numbered prefixes (hierarchical indicator)
        numbered_dirs = 0
        for subdir in subdirs:
            if re.match(r'^\d+_', subdir.name):
                numbered_dirs += 1

        if numbered_dirs > 0:
            ratio = numbered_dirs / len(subdirs) if subdirs else 0
            scores[ExplodeVariant.HIERARCHICAL] += ratio * 0.8
            evidence.append(f"Found {numbered_dirs}/{len(subdirs)} directories with numbered prefixes")

        # Check for semantic directory names
        semantic_indicators = ['parts', 'chapters', 'sections', 'appendices', 'references']
        semantic_matches = 0
        for subdir in subdirs:
            if any(indicator in subdir.name.lower() for indicator in semantic_indicators):
                semantic_matches += 1

        if semantic_matches > 0:
            scores[ExplodeVariant.SEMANTIC] += (semantic_matches / len(subdirs)) * 0.7
            evidence.append(f"Found {semantic_matches} semantic directory names")

        # Default to flat if no strong patterns
        if max(scores.values()) < 0.3:
            scores[ExplodeVariant.FLAT] = 0.6
            evidence.append("No strong hierarchical or semantic patterns detected")

        # Determine best match
        best_variant = max(scores.keys(), key=lambda k: scores[k])
        best_score = scores[best_variant]

        confidence = DetectionConfidence.HIGH if best_score > 0.7 else \
                    DetectionConfidence.MEDIUM if best_score > 0.4 else \
                    DetectionConfidence.LOW

        return DetectionResult(
            variant=best_variant,
            confidence=confidence,
            score=best_score,
            evidence=evidence,
            manifest_found=False
        )

    def _detect_from_semantics(self, directory: Path) -> DetectionResult:
        """
        Detect variant from semantic analysis of content organization.

        Args:
            directory: Directory to analyze

        Returns:
            Detection result based on semantic analysis
        """
        evidence = []
        scores = {variant: 0.0 for variant in ExplodeVariant}

        # Analyze directory depth and organization
        max_depth = self._calculate_max_depth(directory)
        total_dirs = len(list(directory.glob("**/")))

        evidence.append(f"Maximum depth: {max_depth}, Total directories: {total_dirs}")

        # Deep nesting suggests hierarchical
        if max_depth > 3:
            scores[ExplodeVariant.HIERARCHICAL] += 0.6
            evidence.append("Deep nesting suggests hierarchical organization")

        # Analyze file distribution
        md_files = list(directory.glob("**/*.md"))
        if md_files:
            # Exclude manifest from count
            content_files = [f for f in md_files if f.name != "manifest.md"]

            # Many files at root level suggests flat
            root_files = [f for f in content_files if f.parent == directory]
            if len(root_files) > len(content_files) * 0.6:
                scores[ExplodeVariant.FLAT] += 0.5
                evidence.append("Many files at root level suggests flat organization")

        # Check for index.md files (hierarchical indicator)
        index_files = list(directory.glob("**/index.md"))
        if len(index_files) > 2:  # More than just root index
            scores[ExplodeVariant.HIERARCHICAL] += 0.4
            evidence.append(f"Found {len(index_files)} index.md files")

        # Determine best match
        best_variant = max(scores.keys(), key=lambda k: scores[k])
        best_score = scores[best_variant]

        confidence = DetectionConfidence.MEDIUM if best_score > 0.5 else \
                    DetectionConfidence.LOW

        return DetectionResult(
            variant=best_variant,
            confidence=confidence,
            score=best_score,
            evidence=evidence,
            manifest_found=False
        )

    def _combine_detection_results(self, results: List[DetectionResult]) -> DetectionResult:
        """
        Combine multiple detection results into a single best result.

        Args:
            results: List of detection results to combine

        Returns:
            Combined detection result
        """
        # If we have a manifest result, prioritize it
        manifest_result = next((r for r in results if r.manifest_found), None)
        if manifest_result and manifest_result.variant:
            return manifest_result

        # Otherwise find result with highest score (ignoring manifest results without variants)
        non_manifest_results = [r for r in results if not r.manifest_found]
        if non_manifest_results:
            best_result = max(non_manifest_results, key=lambda r: r.score)
            if best_result.score > 0:
                return best_result

        # Fallback to flat variant if no good detection
        return DetectionResult(
            variant=ExplodeVariant.FLAT,
            confidence=DetectionConfidence.LOW,
            score=0.1,
            evidence=["No clear patterns detected, defaulting to flat variant"],
            manifest_found=False
        )

    def _calculate_max_depth(self, directory: Path) -> int:
        """
        Calculate the maximum depth of subdirectories.

        Args:
            directory: Directory to analyze

        Returns:
            Maximum depth (root = 0)
        """
        max_depth = 0
        for path in directory.glob("**/"):
            try:
                depth = len(path.relative_to(directory).parts)
                max_depth = max(max_depth, depth)
            except ValueError:
                continue
        return max_depth

    def is_exploded_directory(self, directory: Path) -> bool:
        """
        Check if a directory appears to be an exploded markdown structure.

        Args:
            directory: Directory to check

        Returns:
            True if directory appears to be exploded markdown content
        """
        if not directory.exists() or not directory.is_dir():
            return False

        # Check for manifest file
        if (directory / "manifest.md").exists():
            return True

        # Check for markdown files
        md_files = list(directory.glob("**/*.md"))
        if not md_files:
            return False

        # Check for typical exploded patterns
        subdirs = [d for d in directory.iterdir() if d.is_dir()]

        # Look for index.md files
        if any((d / "index.md").exists() for d in subdirs):
            return True

        # Look for numbered directories
        if any(re.match(r'^\d+_', d.name) for d in subdirs):
            return True

        # Look for semantic directories
        semantic_names = ['parts', 'chapters', 'sections']
        if any(any(name in d.name.lower() for name in semantic_names) for d in subdirs):
            return True

        # If we have multiple markdown files in organized subdirectories
        if len(md_files) > 2 and len(subdirs) > 1:
            return True

        return False