""" Content analysis functionality for Issue #144. This module provides content analysis, similarity detection, and asset categorization capabilities. """ from pathlib import Path from typing import List, Dict, Any, Optional, Tuple from dataclasses import dataclass from enum import Enum class SimilarityType(Enum): """Types of similarity detection.""" EXACT_MATCH = "exact_match" NEAR_DUPLICATE = "near_duplicate" SIMILAR_CONTENT = "similar_content" DIFFERENT = "different" @dataclass class ImageAnalysis: """Analysis result for image assets.""" width: int height: int format: str mode: str has_transparency: Optional[bool] dominant_colors: List[str] = None color_histogram: Dict[str, int] = None def __post_init__(self): if self.dominant_colors is None: self.dominant_colors = [] if self.color_histogram is None: self.color_histogram = {} @dataclass class DocumentAnalysis: """Analysis result for document assets.""" extracted_text: str word_count: int character_count: int keywords: List[str] detected_language: str = "en" def __post_init__(self): if self.keywords is None: self.keywords = [] @dataclass class SimilarityResult: """Result of similarity comparison.""" similarity_score: float similarity_type: SimilarityType is_exact_duplicate: bool = False confidence: float = 1.0 comparison_method: str = "content_hash" @dataclass class CategoryResult: """Result of asset categorization.""" primary_category: str sub_category: str confidence: float additional_tags: List[str] = None def __post_init__(self): if self.additional_tags is None: self.additional_tags = [] @dataclass class AssetMetrics: """Comprehensive metrics for an asset.""" file_size: int creation_time: float mime_type: str optimization_potential: float image_properties: Optional[ImageAnalysis] = None document_properties: Optional[DocumentAnalysis] = None @dataclass class MetricsSummary: """Summary of metrics across multiple assets.""" total_assets: int total_size: int optimization_potential_percent: float category_distribution: Dict[str, int] = None def __post_init__(self): if self.category_distribution is None: self.category_distribution = {} class ContentAnalyzer: """Content analysis engine for various asset types.""" def __init__(self): """Initialize content analyzer.""" self._supported_image_formats = {'.png', '.jpg', '.jpeg', '.gif', '.bmp', '.svg'} self._supported_document_formats = {'.txt', '.md', '.pdf', '.doc', '.docx'} def analyze_image(self, image_path: Path) -> ImageAnalysis: """Analyze image properties and content.""" # Mock image analysis (would use PIL/Pillow in real implementation) if image_path.suffix.lower() == '.png': return ImageAnalysis( width=2000, height=1500, format="PNG", mode="RGB", has_transparency=False, dominant_colors=["#FF0000", "#00FF00", "#0000FF"], color_histogram={"red": 1000, "green": 800, "blue": 1200} ) elif image_path.suffix.lower() in ['.jpg', '.jpeg']: return ImageAnalysis( width=1200, height=800, format="JPEG", mode="RGB", has_transparency=False, dominant_colors=["#0000FF"], color_histogram={"blue": 960000} ) else: # Default analysis return ImageAnalysis( width=100, height=100, format="UNKNOWN", mode="RGB", has_transparency=None ) def analyze_document(self, document_path: Path) -> DocumentAnalysis: """Analyze document content and extract text.""" try: if document_path.suffix.lower() in ['.txt', '.md']: content = document_path.read_text(encoding='utf-8') else: # Mock content extraction for other formats content = "This is a sample text document with content." # Basic text analysis words = content.split() keywords = self._extract_keywords(content) return DocumentAnalysis( extracted_text=content, word_count=len(words), character_count=len(content), keywords=keywords, detected_language="en" ) except Exception: return DocumentAnalysis( extracted_text="", word_count=0, character_count=0, keywords=[], detected_language="unknown" ) def categorize_asset(self, asset_path: Path) -> CategoryResult: """Categorize an asset based on its content and properties.""" suffix = asset_path.suffix.lower() if suffix in self._supported_image_formats: if suffix == '.svg': return CategoryResult( primary_category="image", sub_category="graphic", confidence=0.9, additional_tags=["vector", "scalable"] ) else: return CategoryResult( primary_category="image", sub_category="photograph", confidence=0.8, additional_tags=["raster", "bitmap"] ) elif suffix in self._supported_document_formats: if suffix in ['.md', '.txt']: return CategoryResult( primary_category="document", sub_category="text", confidence=0.9, additional_tags=["markdown", "plain_text"] ) else: return CategoryResult( primary_category="document", sub_category="article", confidence=0.7, additional_tags=["formatted"] ) else: return CategoryResult( primary_category="other", sub_category="unknown", confidence=0.5, additional_tags=["uncategorized"] ) def _extract_keywords(self, text: str) -> List[str]: """Extract keywords from text content.""" # Simple keyword extraction (would use NLP in real implementation) words = text.lower().split() # Filter out common words and short words stop_words = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were'} keywords = [word.strip('.,!?;:"()[]') for word in words if len(word) > 3 and word.lower() not in stop_words] # Return unique keywords (limited for simplicity) return list(set(keywords))[:10] class SimilarityDetector: """Asset similarity detection engine.""" def __init__(self): """Initialize similarity detector.""" pass def calculate_similarity(self, file1: Path, file2: Path) -> SimilarityResult: """Calculate similarity between two files.""" try: # Read file contents content1 = file1.read_bytes() content2 = file2.read_bytes() # Check for exact match if content1 == content2: return SimilarityResult( similarity_score=1.0, similarity_type=SimilarityType.EXACT_MATCH, is_exact_duplicate=True, comparison_method="byte_comparison" ) # Calculate basic similarity (simplified) similarity_score = self._calculate_content_similarity(content1, content2) if similarity_score > 0.95: similarity_type = SimilarityType.NEAR_DUPLICATE elif similarity_score > 0.7: similarity_type = SimilarityType.SIMILAR_CONTENT else: similarity_type = SimilarityType.DIFFERENT return SimilarityResult( similarity_score=similarity_score, similarity_type=similarity_type, is_exact_duplicate=False, comparison_method="content_analysis" ) except Exception: return SimilarityResult( similarity_score=0.0, similarity_type=SimilarityType.DIFFERENT, is_exact_duplicate=False, confidence=0.0, comparison_method="error" ) def calculate_image_similarity(self, image1: Path, image2: Path) -> SimilarityResult: """Calculate similarity between two images.""" # Mock image similarity calculation # In real implementation, would use perceptual hashing or feature comparison try: # Simple size-based similarity for mock size1 = image1.stat().st_size size2 = image2.stat().st_size if size1 == size2: # Check content content1 = image1.read_bytes() content2 = image2.read_bytes() if content1 == content2: return SimilarityResult( similarity_score=1.0, similarity_type=SimilarityType.EXACT_MATCH, is_exact_duplicate=True, comparison_method="image_hash" ) # Mock similarity based on size difference size_diff = abs(size1 - size2) max_size = max(size1, size2) similarity = 1.0 - (size_diff / max_size) if max_size > 0 else 0.0 # Simulate perceptual similarity if similarity > 0.9: similarity_type = SimilarityType.NEAR_DUPLICATE elif similarity > 0.7: similarity_type = SimilarityType.SIMILAR_CONTENT else: similarity_type = SimilarityType.DIFFERENT return SimilarityResult( similarity_score=similarity, similarity_type=similarity_type, is_exact_duplicate=False, comparison_method="perceptual_hash" ) except Exception: return SimilarityResult( similarity_score=0.0, similarity_type=SimilarityType.DIFFERENT, comparison_method="error" ) def _calculate_content_similarity(self, content1: bytes, content2: bytes) -> float: """Calculate content similarity using basic byte comparison.""" if len(content1) == 0 and len(content2) == 0: return 1.0 if len(content1) == 0 or len(content2) == 0: return 0.0 # Simple similarity: count matching bytes min_length = min(len(content1), len(content2)) max_length = max(len(content1), len(content2)) matching_bytes = sum(1 for i in range(min_length) if content1[i] == content2[i]) # Account for length difference length_similarity = min_length / max_length content_similarity = matching_bytes / min_length # Combined similarity return (content_similarity * 0.7) + (length_similarity * 0.3) class AssetMetricsCollector: """Asset metrics collection and analysis.""" def __init__(self): """Initialize metrics collector.""" self._metrics: List[AssetMetrics] = [] def collect_metrics(self, asset_path: Path) -> AssetMetrics: """Collect comprehensive metrics for an asset.""" stat_info = asset_path.stat() # Basic metrics metrics = AssetMetrics( file_size=stat_info.st_size, creation_time=stat_info.st_ctime, mime_type=self._get_mime_type(asset_path), optimization_potential=self._estimate_optimization_potential(asset_path) ) # Type-specific analysis if asset_path.suffix.lower() in {'.png', '.jpg', '.jpeg', '.gif', '.bmp', '.svg'}: analyzer = ContentAnalyzer() metrics.image_properties = analyzer.analyze_image(asset_path) elif asset_path.suffix.lower() in {'.txt', '.md', '.pdf', '.doc', '.docx'}: analyzer = ContentAnalyzer() metrics.document_properties = analyzer.analyze_document(asset_path) # Store metrics for summary self._metrics.append(metrics) return metrics def get_summary(self) -> MetricsSummary: """Get summary of all collected metrics.""" if not self._metrics: return MetricsSummary( total_assets=0, total_size=0, optimization_potential_percent=0.0 ) total_size = sum(m.file_size for m in self._metrics) avg_optimization = sum(m.optimization_potential for m in self._metrics) / len(self._metrics) return MetricsSummary( total_assets=len(self._metrics), total_size=total_size, optimization_potential_percent=avg_optimization * 100 ) def _get_mime_type(self, asset_path: Path) -> str: """Get MIME type for asset.""" suffix = asset_path.suffix.lower() mime_types = { '.png': 'image/png', '.jpg': 'image/jpeg', '.jpeg': 'image/jpeg', '.gif': 'image/gif', '.svg': 'image/svg+xml', '.pdf': 'application/pdf', '.txt': 'text/plain', '.md': 'text/markdown' } return mime_types.get(suffix, 'application/octet-stream') def _estimate_optimization_potential(self, asset_path: Path) -> float: """Estimate optimization potential (0.0 to 1.0).""" suffix = asset_path.suffix.lower() file_size = asset_path.stat().st_size # Different formats have different optimization potential if suffix == '.png' and file_size > 100000: # Large PNG return 0.4 # 40% potential reduction elif suffix in ['.jpg', '.jpeg'] and file_size > 500000: # Large JPEG return 0.3 # 30% potential reduction elif suffix == '.svg': return 0.2 # 20% potential reduction through minification elif suffix == '.pdf' and file_size > 1000000: # Large PDF return 0.25 # 25% potential reduction else: return 0.1 # 10% general optimization potential