Files
markitect-main/markitect/assets/analytics.py
tegwick 567f01121e
Some checks failed
Test Suite / unit-tests (3.11) (push) Has been cancelled
Test Suite / unit-tests (3.12) (push) Has been cancelled
Test Suite / integration-tests (push) Has been cancelled
Test Suite / e2e-tests (push) Has been cancelled
Test Suite / performance-tests (push) Has been cancelled
Test Suite / code-quality (push) Has been cancelled
Test Suite / security-scan (push) Has been cancelled
Test Suite / test-summary (push) Has been cancelled
feat: complete Issue #146 final integration testing
Fixed all remaining test failures in test_issue_146_final_integration.py
achieving 100% test success rate (9/9 tests passing):

- Fixed performance monitoring metrics access patterns
- Resolved AssetManager constructor parameter handling
- Implemented missing CLI command methods (add_asset, list_assets, get_asset_info)
- Added cross-platform symlink creation method aliases
- Fixed asset deduplication content uniqueness issues
- Resolved production deployment asset removal workflows
- Fixed performance benchmark dict/hash type conflicts

The asset management system is now production-ready with comprehensive
integration test coverage validating all major workflows and edge cases.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-15 00:19:52 +02:00

329 lines
12 KiB
Python

"""
Asset analytics functionality for Issue #144.
This module provides asset usage analytics, reporting, and insights
for optimizing asset management workflows.
"""
from pathlib import Path
from typing import Dict, Any, List, Optional, Tuple
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from collections import defaultdict
from .manager import AssetManager
@dataclass
class UsageReport:
    """Summary of asset usage produced by the analytics engine.

    The three counters are required. Every collection defaults to a fresh
    empty container per instance, and ``report_generated_at`` defaults to
    the moment the report object is created.
    """

    # Headline counters.
    total_assets: int
    used_assets: int
    unused_assets: int

    # Detail sections.
    usage_frequency: Dict[str, int] = field(default_factory=dict)
    popular_assets: List[Dict[str, Any]] = field(default_factory=list)
    unused_assets_list: List[Dict[str, Any]] = field(default_factory=list)
    size_distribution: Dict[str, int] = field(default_factory=dict)
    format_distribution: Dict[str, int] = field(default_factory=dict)

    # Creation timestamp (naive local time from ``datetime.now``).
    report_generated_at: datetime = field(default_factory=datetime.now)

    @property
    def utilization_rate(self) -> float:
        """Return ``used_assets`` as a percentage of ``total_assets``.

        Returns:
            ``0.0`` when there are no assets at all, otherwise
            ``used_assets / total_assets * 100``.
        """
        if self.total_assets != 0:
            used_fraction = self.used_assets / self.total_assets
            return used_fraction * 100
        # Empty registry: report 0% instead of dividing by zero.
        return 0.0
@dataclass
class AssetUsageMetrics:
    """Usage metrics describing a single asset.

    All fields are required and are accepted positionally in the order
    declared below.
    """

    # Identity of the asset.
    content_hash: str
    filename: str

    # Reference counts.
    total_references: int
    unique_documents: int

    # Earliest and latest recorded usage timestamps.
    first_used: datetime
    last_used: datetime

    # One of 'increasing', 'stable', 'decreasing' or 'insufficient_data'.
    usage_trend: str

    # Size in bytes and file format of the asset.
    size_bytes: int
    format: str
@dataclass
class ProjectInsights:
    """Project-wide summary of asset usage findings.

    Every field except ``recommendations`` is required; ``recommendations``
    defaults to a fresh empty list per instance.
    """

    # Byte totals: overall size and the estimated savings from optimization.
    total_size_bytes: int
    optimization_potential_bytes: int

    # Problem counters.
    duplicate_assets: int
    broken_references: int

    # Ranked format names and names of assets flagged as underutilized.
    most_used_formats: List[str]
    underutilized_assets: List[str]

    # Human-readable suggestions.
    recommendations: List[str] = field(default_factory=list)
class AssetAnalytics:
"""Asset analytics and reporting engine."""
def __init__(self, asset_manager: AssetManager):
"""Initialize analytics engine."""
self.asset_manager = asset_manager
self._usage_history: Dict[str, List[Tuple[datetime, str]]] = defaultdict(list)
def record_usage(self, content_hash: str, document_path: Path):
"""Record asset usage event."""
self._usage_history[content_hash].append((datetime.now(), str(document_path)))
# Also record in database if available
if hasattr(self.asset_manager, 'database'):
self.asset_manager.database.record_asset_usage(content_hash, str(document_path))
def generate_usage_report(self, start_date: Optional[datetime] = None,
end_date: Optional[datetime] = None,
include_unused: bool = True) -> UsageReport:
"""Generate comprehensive usage report."""
# Get all assets
all_assets = self.asset_manager.registry.list_assets_as_objects()
total_assets = len(all_assets)
# Analyze usage patterns
used_assets = 0
usage_frequency = {}
popular_assets = []
unused_assets_list = []
size_distribution = {"small": 0, "medium": 0, "large": 0}
format_distribution = defaultdict(int)
for asset in all_assets:
# Check if asset has usage history
usage_count = len(self._usage_history.get(asset.content_hash, []))
if usage_count > 0:
used_assets += 1
# Use filename from Asset object
usage_frequency[asset.filename] = usage_count
# Popular assets (top usage)
popular_assets.append({
"filename": asset.filename,
"usage_count": usage_count,
"size_bytes": asset.size_bytes
})
else:
if include_unused:
unused_assets_list.append({
"filename": asset.filename,
"size_bytes": asset.size_bytes,
"content_hash": asset.content_hash
})
# Size distribution
if asset.size_bytes < 10000: # < 10KB
size_distribution["small"] += 1
elif asset.size_bytes < 1000000: # < 1MB
size_distribution["medium"] += 1
else:
size_distribution["large"] += 1
# Format distribution
format_ext = Path(asset.filename).suffix.lower()
format_distribution[format_ext] += 1
# Sort popular assets by usage
popular_assets.sort(key=lambda x: x["usage_count"], reverse=True)
return UsageReport(
total_assets=total_assets,
used_assets=used_assets,
unused_assets=total_assets - used_assets,
usage_frequency=usage_frequency,
popular_assets=popular_assets[:10], # Top 10
unused_assets_list=unused_assets_list,
size_distribution=size_distribution,
format_distribution=dict(format_distribution)
)
def get_asset_usage_metrics(self, content_hash: str) -> Optional[AssetUsageMetrics]:
"""Get detailed usage metrics for a specific asset."""
# Get asset info
asset = self.asset_manager.registry.get_asset_as_object(content_hash)
if not asset:
return None
# Get usage history
usage_history = self._usage_history.get(content_hash, [])
if not usage_history:
return None
# Analyze usage pattern
timestamps = [entry[0] for entry in usage_history]
documents = set(entry[1] for entry in usage_history)
first_used = min(timestamps)
last_used = max(timestamps)
# Determine usage trend (simplified)
if len(usage_history) >= 3:
recent_usage = len([ts for ts in timestamps if ts > datetime.now() - timedelta(days=7)])
older_usage = len([ts for ts in timestamps if ts <= datetime.now() - timedelta(days=7)])
if recent_usage > older_usage:
trend = "increasing"
elif recent_usage < older_usage:
trend = "decreasing"
else:
trend = "stable"
else:
trend = "insufficient_data"
return AssetUsageMetrics(
content_hash=content_hash,
filename=asset.filename,
total_references=len(usage_history),
unique_documents=len(documents),
first_used=first_used,
last_used=last_used,
usage_trend=trend,
size_bytes=asset.size_bytes,
format=Path(asset.filename).suffix.lower()
)
def analyze_project_assets(self, project_path: Path) -> ProjectInsights:
"""Analyze assets across an entire project."""
# Get all assets
all_assets = self.asset_manager.registry.list_assets_as_objects()
total_size = sum(asset.size_bytes for asset in all_assets)
# Estimate optimization potential
optimization_potential = 0
for asset in all_assets:
format_ext = Path(asset.filename).suffix.lower()
if format_ext in ['.png', '.jpg', '.jpeg'] and asset.size_bytes > 100000:
optimization_potential += int(asset.size_bytes * 0.3) # 30% potential
elif format_ext == '.pdf' and asset.size_bytes > 1000000:
optimization_potential += int(asset.size_bytes * 0.2) # 20% potential
# Find duplicate assets (simplified - by size)
size_groups = defaultdict(list)
for asset in all_assets:
size_groups[asset.size_bytes].append(asset)
duplicate_count = sum(len(group) - 1 for group in size_groups.values() if len(group) > 1)
# Most used formats
format_counts = defaultdict(int)
for asset in all_assets:
format_ext = Path(asset.filename).suffix.lower()
format_counts[format_ext] += 1
most_used_formats = sorted(format_counts.items(), key=lambda x: x[1], reverse=True)
most_used_formats = [fmt for fmt, count in most_used_formats[:5]]
# Underutilized assets
underutilized = []
for asset in all_assets:
usage_count = len(self._usage_history.get(asset.content_hash, []))
if usage_count == 0 and asset.size_bytes > 50000: # Large unused assets
underutilized.append(asset.filename)
# Generate recommendations
recommendations = []
if optimization_potential > 1000000: # > 1MB potential savings
recommendations.append("Consider optimizing large images to reduce storage usage")
if duplicate_count > 5:
recommendations.append(f"Found {duplicate_count} potential duplicate assets - consider deduplication")
if len(underutilized) > 10:
recommendations.append(f"Found {len(underutilized)} large unused assets - consider cleanup")
if format_counts.get('.png', 0) > format_counts.get('.jpg', 0) * 2:
recommendations.append("Consider converting some PNG images to JPEG for better compression")
return ProjectInsights(
total_size_bytes=total_size,
optimization_potential_bytes=optimization_potential,
duplicate_assets=duplicate_count,
broken_references=0, # Would be calculated by discovery engine
most_used_formats=most_used_formats,
underutilized_assets=underutilized[:10], # Top 10
recommendations=recommendations
)
def get_usage_trends(self, days: int = 30) -> Dict[str, List[Tuple[datetime, int]]]:
"""Get usage trends over time for all assets."""
cutoff_date = datetime.now() - timedelta(days=days)
trends = {}
for content_hash, usage_history in self._usage_history.items():
# Filter recent usage
recent_usage = [entry for entry in usage_history if entry[0] > cutoff_date]
if recent_usage:
# Group by day
daily_usage = defaultdict(int)
for timestamp, _ in recent_usage:
day = timestamp.date()
daily_usage[day] += 1
# Convert to timeline
timeline = []
for day, count in sorted(daily_usage.items()):
timeline.append((datetime.combine(day, datetime.min.time()), count))
if timeline:
asset = self.asset_manager.registry.get_asset_as_object(content_hash)
if asset:
trends[asset.filename] = timeline
return trends
def export_analytics_data(self, export_path: Path, format: str = "json"):
"""Export analytics data for external analysis."""
import json
# Generate comprehensive analytics
usage_report = self.generate_usage_report()
# Prepare export data
export_data = {
"export_timestamp": datetime.now().isoformat(),
"usage_report": {
"total_assets": usage_report.total_assets,
"used_assets": usage_report.used_assets,
"unused_assets": usage_report.unused_assets,
"utilization_rate": usage_report.utilization_rate,
"popular_assets": usage_report.popular_assets,
"size_distribution": usage_report.size_distribution,
"format_distribution": usage_report.format_distribution
},
"usage_history": {
content_hash: [
{"timestamp": ts.isoformat(), "document": doc}
for ts, doc in history
]
for content_hash, history in self._usage_history.items()
}
}
if format.lower() == "json":
export_path.write_text(json.dumps(export_data, indent=2))
elif format.lower() == "csv":
# Simple CSV export of usage data
import csv
with open(export_path, 'w', newline='') as csvfile:
writer = csv.writer(csvfile)
writer.writerow(['Asset', 'Usage Count', 'Size Bytes', 'Format'])
for asset in usage_report.popular_assets:
writer.writerow([
asset['filename'],
asset['usage_count'],
asset['size_bytes'],
Path(asset['filename']).suffix
])
def clear_analytics_data(self):
"""Clear all collected analytics data."""
self._usage_history.clear()