# markitect-main/markitect/assets/analytics.py

"""
Asset analytics functionality for Issue #144.

This module provides asset usage analytics, reporting, and insights
for optimizing asset management workflows.
"""

from pathlib import Path
from typing import Dict, Any, List, Optional, Tuple
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from collections import defaultdict

from .manager import AssetManager


@dataclass
class UsageReport:
    """Snapshot of asset usage across the registry.

    The three counters are required; every collection field defaults to an
    empty container and ``report_generated_at`` defaults to the moment the
    report object is created.
    """
    total_assets: int
    used_assets: int
    unused_assets: int
    # Usage count per asset filename.
    usage_frequency: Dict[str, int] = field(default_factory=dict)
    # Dict entries describing the most frequently used assets.
    popular_assets: List[Dict[str, Any]] = field(default_factory=list)
    # Dict entries describing assets without recorded usage.
    unused_assets_list: List[Dict[str, Any]] = field(default_factory=list)
    # Number of assets per size bucket.
    size_distribution: Dict[str, int] = field(default_factory=dict)
    # Number of assets per file format.
    format_distribution: Dict[str, int] = field(default_factory=dict)
    report_generated_at: datetime = field(default_factory=datetime.now)

    @property
    def utilization_rate(self) -> float:
        """Share of assets that are used, as a percentage.

        Returns ``0.0`` when the report covers no assets, which avoids a
        division by zero.
        """
        return (
            0.0
            if self.total_assets == 0
            else (self.used_assets / self.total_assets) * 100
        )


@dataclass
class AssetUsageMetrics:
    """Metrics for individual asset usage.

    Instances are built by ``AssetAnalytics.get_asset_usage_metrics`` from the
    usage events recorded in memory for a single asset.
    """
    content_hash: str  # hash used to look the asset up in the registry
    filename: str
    total_references: int  # number of recorded usage events
    unique_documents: int  # distinct document paths among those events
    first_used: datetime  # earliest recorded usage timestamp
    last_used: datetime  # latest recorded usage timestamp
    # 'increasing', 'stable', 'decreasing', or 'insufficient_data' when fewer
    # than three usage events have been recorded.
    usage_trend: str
    size_bytes: int
    format: str  # lower-cased suffix of ``filename`` (includes the leading dot)


@dataclass
class ProjectInsights:
    """High-level insights about asset usage in a project.

    The field comments describe how ``AssetAnalytics.analyze_project_assets``
    fills each value.
    """
    # Sum of ``size_bytes`` over every asset in the registry.
    total_size_bytes: int
    # Heuristic estimate of bytes that could be saved: 30% of images larger
    # than 100000 bytes plus 20% of PDFs larger than 1000000 bytes.
    optimization_potential_bytes: int
    # Count of assets beyond the first in each group of equal byte size
    # (a size-based heuristic, not a content comparison).
    duplicate_assets: int
    # Always 0 as produced in this module; it is not computed here.
    broken_references: int
    # Up to five lower-cased file suffixes, most common first.
    most_used_formats: List[str]
    # Filenames of assets larger than 50000 bytes with no recorded usage,
    # truncated to at most ten entries.
    underutilized_assets: List[str]
    # Human-readable suggestions derived from the values above.
    recommendations: List[str] = field(default_factory=list)


class AssetAnalytics:
    """Asset analytics and reporting engine."""

    def __init__(self, asset_manager: AssetManager):
        """Initialize analytics engine."""
        self.asset_manager = asset_manager
        self._usage_history: Dict[str, List[Tuple[datetime, str]]] = defaultdict(list)

    def record_usage(self, content_hash: str, document_path: Path):
        """Record asset usage event."""
        self._usage_history[content_hash].append((datetime.now(), str(document_path)))

        # Also record in database if available
        if hasattr(self.asset_manager, 'database'):
            self.asset_manager.database.record_asset_usage(content_hash, str(document_path))

    def generate_usage_report(self, start_date: Optional[datetime] = None,
                            end_date: Optional[datetime] = None,
                            include_unused: bool = True) -> UsageReport:
        """Generate comprehensive usage report."""
        # Get all assets
        all_assets = self.asset_manager.registry.list_assets()
        total_assets = len(all_assets)

        # Analyze usage patterns
        used_assets = 0
        usage_frequency = {}
        popular_assets = []
        unused_assets_list = []
        size_distribution = {"small": 0, "medium": 0, "large": 0}
        format_distribution = defaultdict(int)

        for asset in all_assets:
            # Check if asset has usage history
            usage_count = len(self._usage_history.get(asset.content_hash, []))

            if usage_count > 0:
                used_assets += 1
                usage_frequency[asset.filename] = usage_count

                # Popular assets (top usage)
                popular_assets.append({
                    "filename": asset.filename,
                    "usage_count": usage_count,
                    "size_bytes": asset.size_bytes
                })
            else:
                if include_unused:
                    unused_assets_list.append({
                        "filename": asset.filename,
                        "size_bytes": asset.size_bytes,
                        "content_hash": asset.content_hash
                    })

            # Size distribution
            if asset.size_bytes < 10000:  # < 10KB
                size_distribution["small"] += 1
            elif asset.size_bytes < 1000000:  # < 1MB
                size_distribution["medium"] += 1
            else:
                size_distribution["large"] += 1

            # Format distribution
            format_ext = Path(asset.filename).suffix.lower()
            format_distribution[format_ext] += 1

        # Sort popular assets by usage
        popular_assets.sort(key=lambda x: x["usage_count"], reverse=True)

        return UsageReport(
            total_assets=total_assets,
            used_assets=used_assets,
            unused_assets=total_assets - used_assets,
            usage_frequency=usage_frequency,
            popular_assets=popular_assets[:10],  # Top 10
            unused_assets_list=unused_assets_list,
            size_distribution=size_distribution,
            format_distribution=dict(format_distribution)
        )

    def get_asset_usage_metrics(self, content_hash: str) -> Optional[AssetUsageMetrics]:
        """Get detailed usage metrics for a specific asset."""
        # Get asset info
        asset = self.asset_manager.registry.get_asset(content_hash)
        if not asset:
            return None

        # Get usage history
        usage_history = self._usage_history.get(content_hash, [])

        if not usage_history:
            return None

        # Analyze usage pattern
        timestamps = [entry[0] for entry in usage_history]
        documents = set(entry[1] for entry in usage_history)

        first_used = min(timestamps)
        last_used = max(timestamps)

        # Determine usage trend (simplified)
        if len(usage_history) >= 3:
            recent_usage = len([ts for ts in timestamps if ts > datetime.now() - timedelta(days=7)])
            older_usage = len([ts for ts in timestamps if ts <= datetime.now() - timedelta(days=7)])

            if recent_usage > older_usage:
                trend = "increasing"
            elif recent_usage < older_usage:
                trend = "decreasing"
            else:
                trend = "stable"
        else:
            trend = "insufficient_data"

        return AssetUsageMetrics(
            content_hash=content_hash,
            filename=asset.filename,
            total_references=len(usage_history),
            unique_documents=len(documents),
            first_used=first_used,
            last_used=last_used,
            usage_trend=trend,
            size_bytes=asset.size_bytes,
            format=Path(asset.filename).suffix.lower()
        )

    def analyze_project_assets(self, project_path: Path) -> ProjectInsights:
        """Analyze assets across an entire project."""
        # Get all assets
        all_assets = self.asset_manager.registry.list_assets()

        total_size = sum(asset.size_bytes for asset in all_assets)

        # Estimate optimization potential
        optimization_potential = 0
        for asset in all_assets:
            format_ext = Path(asset.filename).suffix.lower()
            if format_ext in ['.png', '.jpg', '.jpeg'] and asset.size_bytes > 100000:
                optimization_potential += int(asset.size_bytes * 0.3)  # 30% potential
            elif format_ext == '.pdf' and asset.size_bytes > 1000000:
                optimization_potential += int(asset.size_bytes * 0.2)  # 20% potential

        # Find duplicate assets (simplified - by size)
        size_groups = defaultdict(list)
        for asset in all_assets:
            size_groups[asset.size_bytes].append(asset)

        duplicate_count = sum(len(group) - 1 for group in size_groups.values() if len(group) > 1)

        # Most used formats
        format_counts = defaultdict(int)
        for asset in all_assets:
            format_ext = Path(asset.filename).suffix.lower()
            format_counts[format_ext] += 1

        most_used_formats = sorted(format_counts.items(), key=lambda x: x[1], reverse=True)
        most_used_formats = [fmt for fmt, count in most_used_formats[:5]]

        # Underutilized assets
        underutilized = []
        for asset in all_assets:
            usage_count = len(self._usage_history.get(asset.content_hash, []))
            if usage_count == 0 and asset.size_bytes > 50000:  # Large unused assets
                underutilized.append(asset.filename)

        # Generate recommendations
        recommendations = []
        if optimization_potential > 1000000:  # > 1MB potential savings
            recommendations.append("Consider optimizing large images to reduce storage usage")

        if duplicate_count > 5:
            recommendations.append(f"Found {duplicate_count} potential duplicate assets - consider deduplication")

        if len(underutilized) > 10:
            recommendations.append(f"Found {len(underutilized)} large unused assets - consider cleanup")

        if format_counts.get('.png', 0) > format_counts.get('.jpg', 0) * 2:
            recommendations.append("Consider converting some PNG images to JPEG for better compression")

        return ProjectInsights(
            total_size_bytes=total_size,
            optimization_potential_bytes=optimization_potential,
            duplicate_assets=duplicate_count,
            broken_references=0,  # Would be calculated by discovery engine
            most_used_formats=most_used_formats,
            underutilized_assets=underutilized[:10],  # Top 10
            recommendations=recommendations
        )

    def get_usage_trends(self, days: int = 30) -> Dict[str, List[Tuple[datetime, int]]]:
        """Get usage trends over time for all assets."""
        cutoff_date = datetime.now() - timedelta(days=days)
        trends = {}

        for content_hash, usage_history in self._usage_history.items():
            # Filter recent usage
            recent_usage = [entry for entry in usage_history if entry[0] > cutoff_date]

            if recent_usage:
                # Group by day
                daily_usage = defaultdict(int)
                for timestamp, _ in recent_usage:
                    day = timestamp.date()
                    daily_usage[day] += 1

                # Convert to timeline
                timeline = []
                for day, count in sorted(daily_usage.items()):
                    timeline.append((datetime.combine(day, datetime.min.time()), count))

                if timeline:
                    asset = self.asset_manager.registry.get_asset(content_hash)
                    if asset:
                        trends[asset.filename] = timeline

        return trends

    def export_analytics_data(self, export_path: Path, format: str = "json"):
        """Export analytics data for external analysis."""
        import json

        # Generate comprehensive analytics
        usage_report = self.generate_usage_report()

        # Prepare export data
        export_data = {
            "export_timestamp": datetime.now().isoformat(),
            "usage_report": {
                "total_assets": usage_report.total_assets,
                "used_assets": usage_report.used_assets,
                "unused_assets": usage_report.unused_assets,
                "utilization_rate": usage_report.utilization_rate,
                "popular_assets": usage_report.popular_assets,
                "size_distribution": usage_report.size_distribution,
                "format_distribution": usage_report.format_distribution
            },
            "usage_history": {
                content_hash: [
                    {"timestamp": ts.isoformat(), "document": doc}
                    for ts, doc in history
                ]
                for content_hash, history in self._usage_history.items()
            }
        }

        if format.lower() == "json":
            export_path.write_text(json.dumps(export_data, indent=2))
        elif format.lower() == "csv":
            # Simple CSV export of usage data
            import csv
            with open(export_path, 'w', newline='') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow(['Asset', 'Usage Count', 'Size Bytes', 'Format'])

                for asset in usage_report.popular_assets:
                    writer.writerow([
                        asset['filename'],
                        asset['usage_count'],
                        asset['size_bytes'],
                        Path(asset['filename']).suffix
                    ])

    def clear_analytics_data(self):
        """Clear all collected analytics data."""
        self._usage_history.clear()