#!/usr/bin/env python3
"""
Datamodel Optimization Tool

A practical implementation of the Datamodel Optimization Specialist Agent for Claude Code.
This tool analyzes dataclasses and models in a codebase, identifies optimization opportunities,
and provides enhancement suggestions.

Based on the successful IssueActivity optimization (Issue #126).
"""

import ast
import argparse
import json
import re
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple, Any
from collections import defaultdict


# Directory names whose contents are never analyzed (caches, VCS data,
# build output, virtual environments).
_SKIP_DIR_NAMES = frozenset({
    "__pycache__", ".git", "build", "dist", ".venv", "venv", ".pytest_cache",
})


def _relative_parts(file_path: Path, root_path: Path) -> Tuple[str, ...]:
    """Return the path components of *file_path* below *root_path*.

    Falls back to all components of *file_path* when it is not located
    under *root_path*.
    """
    try:
        return file_path.relative_to(root_path).parts
    except ValueError:
        return file_path.parts


def _is_skipped_path(file_path: Path, root_path: Path) -> bool:
    """Tell whether *file_path* lives in an excluded directory below the root.

    Only directory components *inside* the analyzed root are considered, so
    a project that itself sits under e.g. ``/home/ci/build/`` is still
    analyzed, and ``.github`` is not mistaken for ``.git``.
    """
    directories = _relative_parts(file_path, root_path)[:-1]
    return any(part in _SKIP_DIR_NAMES for part in directories)


def _is_test_path(file_path: Path, root_path: Path) -> bool:
    """Tell whether *file_path* looks like test code.

    The check is a case-insensitive search for ``test`` in the root
    directory's own name (as given) plus the path below it; directories
    above the root are deliberately ignored.
    """
    parts = (root_path.name,) + _relative_parts(file_path, root_path)
    return 'test' in '/'.join(parts).lower()


def _dotted_name(node: ast.AST) -> str:
    """Render a decorator or base-class expression as readable text.

    ``Name`` and ``Attribute`` chains become dotted names; a call such as
    ``dataclass(frozen=True)`` is reduced to the name being called.
    """
    if isinstance(node, ast.Name):
        return node.id
    if isinstance(node, ast.Attribute):
        return f"{_dotted_name(node.value)}.{node.attr}"
    if isinstance(node, ast.Call):
        return _dotted_name(node.func)
    unparse = getattr(ast, "unparse", None)  # only available on Python 3.9+
    return unparse(node) if unparse else ast.dump(node)


def _last_component(dotted: str) -> str:
    """Return the final segment of a dotted name (``a.b.c`` -> ``c``)."""
    return dotted.rsplit('.', 1)[-1]


@dataclass
class DatamodelInfo:
    """Information about a discovered datamodel."""
    name: str
    file_path: str
    line_number: int
    fields: List[str]
    methods: List[str]
    properties: List[str]
    is_dataclass: bool
    is_pydantic: bool
    base_classes: List[str]


@dataclass
class UsagePattern:
    """Pattern of how a datamodel is used."""
    file_path: str
    line_number: int
    pattern_type: str  # 'attribute_access', 'dict_building', 'formatting', etc.
    code_snippet: str
    complexity_score: int


@dataclass
class OptimizationOpportunity:
    """An identified optimization opportunity."""
    datamodel_name: str
    opportunity_type: str  # 'property', 'method', 'serialization', 'test_alignment'
    description: str
    current_pattern: str
    suggested_improvement: str
    impact_score: int  # 1-10, higher = more impact
    loc_reduction_estimate: int


class DatamodelDiscovery:
    """Discovers datamodels in the codebase."""

    # Class-name fragments that suggest a class is a data model.
    _MODEL_INDICATORS = ('Model', 'Entity', 'Data', 'Info', 'Record', 'Item', 'Entry')

    def __init__(self, root_path: Path):
        self.root_path = root_path
        self.datamodels: Dict[str, DatamodelInfo] = {}

    def discover_datamodels(self) -> Dict[str, DatamodelInfo]:
        """Discover all datamodels in the codebase.

        Returns:
            Mapping of class name to its ``DatamodelInfo``. When two classes
            share a name, the one visited last wins.
        """
        # Sorted so that results do not depend on filesystem listing order.
        for file_path in sorted(self.root_path.rglob("*.py")):
            if self._should_skip_file(file_path):
                continue
            try:
                content = file_path.read_text(encoding='utf-8')
                tree = ast.parse(content)
            except (OSError, SyntaxError, UnicodeDecodeError, ValueError):
                # Unreadable or unparsable files are skipped, not fatal.
                continue
            self._analyze_ast(tree, file_path)
        return self.datamodels

    def _should_skip_file(self, file_path: Path) -> bool:
        """Check if file should be skipped."""
        return _is_skipped_path(file_path, self.root_path)

    def _analyze_ast(self, tree: ast.AST, file_path: Path) -> None:
        """Analyze AST for datamodel classes (nested classes included)."""
        for node in ast.walk(tree):
            if isinstance(node, ast.ClassDef):
                self._analyze_class(node, file_path)

    def _analyze_class(self, node: ast.ClassDef, file_path: Path) -> None:
        """Record *node* in ``self.datamodels`` if it looks like a datamodel.

        A class qualifies when it is decorated with ``dataclass`` (bare,
        called, or module-qualified), derives from ``BaseModel``, or has a
        model-like name.
        """
        decorator_names = [_dotted_name(d) for d in node.decorator_list]
        base_classes = [_dotted_name(base) for base in node.bases]

        is_dataclass = any(_last_component(n) == 'dataclass' for n in decorator_names)
        is_pydantic = any(_last_component(n) == 'BaseModel' for n in base_classes)

        # Skip if not a datamodel
        if not (is_dataclass or is_pydantic or self._has_model_pattern(node)):
            return

        fields: List[str] = []
        methods: List[str] = []
        properties: List[str] = []

        for item in node.body:
            if isinstance(item, ast.AnnAssign) and isinstance(item.target, ast.Name):
                fields.append(item.target.id)
            elif isinstance(item, (ast.FunctionDef, ast.AsyncFunctionDef)):
                tails = {_last_component(_dotted_name(d)) for d in item.decorator_list}
                if 'property' in tails:
                    properties.append(item.name)
                elif tails & {'setter', 'getter', 'deleter'}:
                    # Accessor of a property, not an additional method.
                    continue
                elif not item.name.startswith('_'):
                    methods.append(item.name)

        self.datamodels[node.name] = DatamodelInfo(
            name=node.name,
            file_path=str(file_path),
            line_number=node.lineno,
            fields=fields,
            methods=methods,
            properties=properties,
            is_dataclass=is_dataclass,
            is_pydantic=is_pydantic,
            base_classes=base_classes,
        )

    def _has_model_pattern(self, node: ast.ClassDef) -> bool:
        """Check if class follows model naming patterns."""
        return any(indicator in node.name for indicator in self._MODEL_INDICATORS)


class UsageAnalyzer:
    """Analyzes how datamodels are used across the codebase."""

    # (regex, pattern_type) pairs for repetitive inline formatting.
    _FORMATTING_PATTERNS = [
        (re.compile(r'\.strftime\('), 'date_formatting'),
        (re.compile(r'\.value\s*\.\s*title\(\)'), 'enum_formatting'),
        (re.compile(r'\.value\s*\.\s*replace\('), 'string_formatting'),
        (re.compile(r'\[:40\]\s*\+\s*[\'"]\.\.\.'), 'truncation'),
        (re.compile(r'if.*else.*[\'"]N/A[\'"]'), 'null_formatting'),
    ]
    # Whole-word match so identifiers such as "formatted" are not counted.
    _BRANCH_KEYWORD_RE = re.compile(r'\b(?:if|else|and|or)\b')
    _INLINE_DICT_RE = re.compile(r'\{\s*[\'"][^\'\"]+[\'"]:\s*\w+\.\w+')
    _DATA_INIT_RE = re.compile(r'data\s*=\s*\[\]')
    _ITEM_DICT_RE = re.compile(r'item\s*=\s*\{')
    _DICT_FIELD_RE = re.compile(r'[\'"][^\'\"]+[\'"]:\s*\w+\.\w+')
    _DATA_APPEND_RE = re.compile(r'data\.append\(')
    _TEST_DICT_ASSIGN_RE = re.compile(r'(?:mock|test)_\w+\s*=\s*\{')
    _STRING_PAIR_RE = re.compile(r'[\'"][^\'\"]+[\'"]:\s*[\'"][^\'\"]+[\'"]')
    # Number of lines after a "data = []" line that are searched.
    _DICT_BUILD_WINDOW = 15

    def __init__(self, root_path: Path, datamodels: Dict[str, DatamodelInfo]):
        self.root_path = root_path
        self.datamodels = datamodels
        self.usage_patterns: List[UsagePattern] = []

    def analyze_usage_patterns(self) -> List[UsagePattern]:
        """Analyze usage patterns for all datamodels.

        Returns:
            All patterns found, ordered by file path and then line number.
            Each call starts from an empty result list.
        """
        self.usage_patterns = []
        for file_path in sorted(self.root_path.rglob("*.py")):
            if self._should_skip_file(file_path):
                continue
            try:
                content = file_path.read_text(encoding='utf-8')
            except (OSError, UnicodeDecodeError):
                continue
            self._analyze_file_usage(content, file_path)
        return self.usage_patterns

    def _should_skip_file(self, file_path: Path) -> bool:
        """Check if file should be skipped."""
        return _is_skipped_path(file_path, self.root_path)

    def _analyze_file_usage(self, content: str, file_path: Path) -> None:
        """Run every line-based check over the text of one file."""
        lines = content.split('\n')
        for line_num, line in enumerate(lines, 1):
            self._check_formatting_patterns(line, file_path, line_num)
            self._check_serialization_patterns(line, file_path, line_num)
            self._check_dict_building_patterns(lines, line_num, file_path)
            self._check_test_patterns(line, file_path, line_num)

    def _check_formatting_patterns(self, line: str, file_path: Path, line_num: int) -> None:
        """Check for repetitive formatting patterns.

        One pattern is recorded per matching formatting category; its
        complexity is 1 plus the number of if/else/and/or keywords on the line.
        """
        complexity = None
        for regex, pattern_type in self._FORMATTING_PATTERNS:
            if not regex.search(line):
                continue
            if complexity is None:
                complexity = len(self._BRANCH_KEYWORD_RE.findall(line)) + 1
            self.usage_patterns.append(UsagePattern(
                file_path=str(file_path),
                line_number=line_num,
                pattern_type=pattern_type,
                code_snippet=line.strip(),
                complexity_score=complexity,
            ))

    def _check_serialization_patterns(self, line: str, file_path: Path, line_num: int) -> None:
        """Check for inline dict literals built from attribute accesses."""
        if self._INLINE_DICT_RE.search(line):
            self.usage_patterns.append(UsagePattern(
                file_path=str(file_path),
                line_number=line_num,
                pattern_type='dict_building',
                code_snippet=line.strip(),
                complexity_score=2,
            ))

    def _check_dict_building_patterns(self, lines: List[str], current_line: int,
                                      file_path: Path) -> None:
        """Check for verbose dictionary building patterns.

        Args:
            lines: All lines of the file.
            current_line: 1-based number of the line being inspected.
            file_path: File the lines came from.

        A pattern is recorded when a ``...data = []`` line is followed, within
        the next 15 lines, by at least three lines that create an item dict,
        fill its fields from attributes, or append it to the data list.
        """
        if not 1 <= current_line <= len(lines):
            return
        if not self._DATA_INIT_RE.search(lines[current_line - 1]):
            return

        pattern_lines: List[str] = []
        dict_pattern_found = False
        window = lines[current_line:current_line + self._DICT_BUILD_WINDOW]
        for next_line in window:
            if self._ITEM_DICT_RE.search(next_line):
                dict_pattern_found = True
                pattern_lines.append(next_line.strip())
            elif dict_pattern_found and self._DICT_FIELD_RE.search(next_line):
                pattern_lines.append(next_line.strip())
            elif self._DATA_APPEND_RE.search(next_line):
                # The append closes the building sequence.
                pattern_lines.append(next_line.strip())
                break

        if len(pattern_lines) >= 3:  # Verbose pattern found
            self.usage_patterns.append(UsagePattern(
                file_path=str(file_path),
                line_number=current_line,
                pattern_type='verbose_serialization',
                code_snippet='\n'.join(pattern_lines[:5]),
                complexity_score=len(pattern_lines),
            ))

    def _check_test_patterns(self, line: str, file_path: Path, line_num: int) -> None:
        """Check for dictionary-based test data that could use real models.

        Only files whose path below (and including) the analyzed root
        contains ``test`` are considered.
        """
        if not _is_test_path(file_path, self.root_path):
            return

        lowered = line.lower()
        is_dict_assignment = bool(self._TEST_DICT_ASSIGN_RE.search(line))
        is_string_pair = (
            bool(self._STRING_PAIR_RE.search(line))
            and ('mock' in lowered or 'test' in lowered)
        )
        if is_dict_assignment or is_string_pair:
            self.usage_patterns.append(UsagePattern(
                file_path=str(file_path),
                line_number=line_num,
                pattern_type='dict_test_data',
                code_snippet=line.strip(),
                complexity_score=1,
            ))


class OptimizationAnalyzer:
    """Analyzes optimization opportunities based on discovered patterns."""

    _FORMATTING_TYPES = frozenset({
        'date_formatting', 'enum_formatting', 'string_formatting',
        'truncation', 'null_formatting',
    })
    _SERIALIZATION_TYPES = frozenset({'verbose_serialization', 'dict_building'})
    _COMMON_METHODS = ('to_dict', 'from_dict', 'contains_keyword')
    _ATTRIBUTE_FORMATTING_RE = re.compile(r'\w+\.\w+\.(strftime|value|title|replace)')
    _ATTRIBUTE_ACCESS_RE = re.compile(r'\w+\.\w+')

    def __init__(self, datamodels: Dict[str, DatamodelInfo], patterns: List[UsagePattern]):
        self.datamodels = datamodels
        self.patterns = patterns
        self.opportunities: List[OptimizationOpportunity] = []

    def analyze_opportunities(self) -> List[OptimizationOpportunity]:
        """Analyze and generate optimization opportunities.

        Returns:
            Opportunities sorted by descending impact score. Each call
            recomputes the list from scratch.
        """
        self.opportunities = []
        self._analyze_property_opportunities()
        self._analyze_method_opportunities()
        self._analyze_serialization_opportunities()
        self._analyze_test_alignment_opportunities()
        return sorted(self.opportunities, key=lambda op: op.impact_score, reverse=True)

    def _first_model(self) -> Optional[str]:
        """Return the first known model name, or None when there is none."""
        return next(iter(self.datamodels), None)

    def _model_named_in(self, snippet: str) -> Optional[str]:
        """Return the first model whose name occurs (case-insensitively) in *snippet*."""
        lowered = snippet.lower()
        for model_name in self.datamodels:
            if model_name.lower() in lowered:
                return model_name
        return None

    def _analyze_property_opportunities(self) -> None:
        """Find opportunities for adding formatting properties."""
        pattern_groups: Dict[str, List[UsagePattern]] = defaultdict(list)
        for pattern in self.patterns:
            if pattern.pattern_type not in self._FORMATTING_TYPES:
                continue
            matched_model = self._model_named_in(pattern.code_snippet)
            # Heuristic: attribute-based formatting with no model name in the
            # snippet is attributed to the first available model.
            if not matched_model and self._ATTRIBUTE_FORMATTING_RE.search(pattern.code_snippet):
                matched_model = self._first_model()
            if matched_model:
                pattern_groups[matched_model].append(pattern)

        # Even a single pattern is reported as an opportunity.
        for model_name, model_patterns in pattern_groups.items():
            count = len(model_patterns)
            self.opportunities.append(OptimizationOpportunity(
                datamodel_name=model_name,
                opportunity_type='property',
                description=f'Add formatting properties to {model_name}',
                current_pattern=f'{count} scattered formatting operations',
                suggested_improvement=(
                    'Add properties like formatted_date, display_name, truncated_details'
                ),
                impact_score=min(8, count * 2),
                loc_reduction_estimate=count * 2,
            ))

    def _analyze_method_opportunities(self) -> None:
        """Find models with at least three fields that lack common helper methods."""
        for model_name, model_info in self.datamodels.items():
            missing_methods = [m for m in self._COMMON_METHODS if m not in model_info.methods]
            if missing_methods and len(model_info.fields) >= 3:
                missing = ", ".join(missing_methods)
                self.opportunities.append(OptimizationOpportunity(
                    datamodel_name=model_name,
                    opportunity_type='method',
                    description=f'Add convenience methods to {model_name}',
                    current_pattern=f'Missing methods: {missing}',
                    suggested_improvement=f'Add methods: {missing}',
                    impact_score=6,
                    loc_reduction_estimate=5,
                ))

    def _analyze_serialization_opportunities(self) -> None:
        """Find opportunities for serialization optimization."""
        for pattern in self.patterns:
            if pattern.pattern_type not in self._SERIALIZATION_TYPES:
                continue
            # NOTE: single-line 'dict_building' patterns are recorded with a
            # fixed score of 2, so only multi-line 'verbose_serialization'
            # patterns can currently reach this threshold.
            if pattern.complexity_score < 3:
                continue
            model_name = self._infer_model_from_pattern(pattern)
            if model_name:
                self.opportunities.append(OptimizationOpportunity(
                    datamodel_name=model_name,
                    opportunity_type='serialization',
                    description=f'Optimize serialization in {model_name}',
                    current_pattern=f'Verbose dict building ({pattern.complexity_score} lines)',
                    suggested_improvement='Replace with single to_dict() method call',
                    impact_score=min(9, pattern.complexity_score),
                    loc_reduction_estimate=max(0, pattern.complexity_score - 1),
                ))

    def _analyze_test_alignment_opportunities(self) -> None:
        """Find opportunities for test alignment improvements."""
        for pattern in self.patterns:
            if pattern.pattern_type != 'dict_test_data':
                continue
            model_name = self._infer_model_from_pattern(pattern)
            if model_name:
                self.opportunities.append(OptimizationOpportunity(
                    datamodel_name=model_name,
                    opportunity_type='test_alignment',
                    description=f'Align test data for {model_name}',
                    current_pattern='Using dictionary mocks in tests',
                    suggested_improvement='Replace with proper dataclass instances',
                    impact_score=7,
                    loc_reduction_estimate=2,
                ))

    def _infer_model_from_pattern(self, pattern: UsagePattern) -> Optional[str]:
        """Try to infer which datamodel a pattern relates to.

        Returns:
            The model whose name appears in the snippet; otherwise the first
            known model when the pattern is test data or contains an
            attribute access; otherwise None.
        """
        named = self._model_named_in(pattern.code_snippet)
        if named:
            return named
        if not self.datamodels:
            return None
        # Heuristic fallbacks: attribute to the first available model.
        if pattern.pattern_type == 'dict_test_data':
            return self._first_model()
        if self._ATTRIBUTE_ACCESS_RE.search(pattern.code_snippet):
            return self._first_model()
        return None


class OptimizationReporter:
    """Generates optimization reports."""

    def __init__(self, datamodels: Dict[str, DatamodelInfo], patterns: List[UsagePattern],
                 opportunities: List[OptimizationOpportunity]):
        self.datamodels = datamodels
        self.patterns = patterns
        self.opportunities = opportunities

    @staticmethod
    def _title(opportunity_type: str) -> str:
        """Render an opportunity type as a heading ('test_alignment' -> 'Test Alignment')."""
        return opportunity_type.replace('_', ' ').title()

    @staticmethod
    def _preview(names: List[str], limit: int = 5) -> str:
        """Join the first *limit* names, appending '...' when more exist."""
        suffix = '...' if len(names) > limit else ''
        return ', '.join(names[:limit]) + suffix

    @staticmethod
    def _opportunity_details(opportunity: OptimizationOpportunity) -> str:
        """Render the bullet list shared by the summary and detailed reports."""
        return (
            f"- **Impact Score**: {opportunity.impact_score}/10\n"
            f"- **Description**: {opportunity.description}\n"
            f"- **Current Pattern**: {opportunity.current_pattern}\n"
            f"- **Suggested Improvement**: {opportunity.suggested_improvement}\n"
            f"- **Estimated LOC Reduction**: {opportunity.loc_reduction_estimate} lines\n"
        )

    def generate_summary_report(self) -> str:
        """Generate a Markdown summary listing the first five opportunities."""
        total_models = len(self.datamodels)
        total_patterns = len(self.patterns)
        total_opportunities = len(self.opportunities)
        estimated_loc_reduction = sum(op.loc_reduction_estimate for op in self.opportunities)

        report = f"""
# Datamodel Optimization Analysis Report

## Summary
- **Total Datamodels Found**: {total_models}
- **Usage Patterns Analyzed**: {total_patterns}
- **Optimization Opportunities**: {total_opportunities}
- **Estimated LOC Reduction**: {estimated_loc_reduction} lines

## Top Optimization Opportunities
"""
        sections = [report]
        for i, opportunity in enumerate(self.opportunities[:5], 1):
            heading = (
                f"\n### {i}. {opportunity.datamodel_name} - "
                f"{self._title(opportunity.opportunity_type)}\n"
            )
            sections.append(heading + self._opportunity_details(opportunity))
        return ''.join(sections)

    def generate_detailed_report(self, model_name: str) -> str:
        """Generate a detailed Markdown report for one model.

        Returns:
            The report, or a "not found" message when *model_name* is unknown.
        """
        if model_name not in self.datamodels:
            return f"Model '{model_name}' not found."

        model = self.datamodels[model_name]
        model_opportunities = [op for op in self.opportunities
                               if op.datamodel_name == model_name]
        needle = model_name.lower()
        model_patterns = [p for p in self.patterns if needle in p.code_snippet.lower()]

        if model.is_dataclass:
            model_type = "Dataclass"
        elif model.is_pydantic:
            model_type = "Pydantic Model"
        else:
            model_type = "Class"

        report = f"""
# Detailed Analysis: {model_name}

## Model Information
- **File**: {model.file_path}:{model.line_number}
- **Type**: {model_type}
- **Fields**: {len(model.fields)} ({self._preview(model.fields)})
- **Methods**: {len(model.methods)} ({self._preview(model.methods)})
- **Properties**: {len(model.properties)} ({self._preview(model.properties)})

## Optimization Opportunities ({len(model_opportunities)})
"""
        sections = [report]
        for opportunity in model_opportunities:
            heading = f"\n### {self._title(opportunity.opportunity_type)} Optimization\n"
            sections.append(heading + self._opportunity_details(opportunity))

        if model_patterns:
            sections.append(f"\n## Usage Patterns Found ({len(model_patterns)})\n")
            for pattern in model_patterns[:5]:  # Show top 5
                # Keep multi-line snippets aligned inside the list item.
                snippet = pattern.code_snippet.replace('\n', '\n  ')
                sections.append(f"""
- **{pattern.pattern_type}** in {Path(pattern.file_path).name}:{pattern.line_number}
  ```python
  {snippet}
  ```
""")
        return ''.join(sections)

    def generate_json_report(self) -> str:
        """Generate JSON report for programmatic use."""
        data = {
            'summary': {
                'total_datamodels': len(self.datamodels),
                'total_patterns': len(self.patterns),
                'total_opportunities': len(self.opportunities),
                'estimated_loc_reduction': sum(
                    op.loc_reduction_estimate for op in self.opportunities
                ),
            },
            'datamodels': [
                {
                    'name': model.name,
                    'file_path': model.file_path,
                    'fields_count': len(model.fields),
                    'methods_count': len(model.methods),
                    'properties_count': len(model.properties),
                    'is_dataclass': model.is_dataclass,
                    'is_pydantic': model.is_pydantic,
                }
                for model in self.datamodels.values()
            ],
            'opportunities': [
                {
                    'datamodel_name': op.datamodel_name,
                    'type': op.opportunity_type,
                    'description': op.description,
                    'impact_score': op.impact_score,
                    'loc_reduction_estimate': op.loc_reduction_estimate,
                }
                for op in self.opportunities
            ],
        }
        return json.dumps(data, indent=2)


def main(argv: Optional[List[str]] = None) -> None:
    """Main entry point.

    Args:
        argv: Command-line arguments without the program name; ``None``
            (the default) makes argparse read ``sys.argv``.

    Progress messages are written to stderr so that stdout carries only the
    report, which keeps ``--format json`` output machine-readable.
    """
    parser = argparse.ArgumentParser(description='Datamodel Optimization Analysis Tool')
    parser.add_argument('--root', type=Path, default=Path('.'),
                        help='Root directory to analyze (default: current directory)')
    parser.add_argument('--format', choices=['summary', 'detailed', 'json'],
                        default='summary', help='Report format (default: summary)')
    parser.add_argument('--model', type=str,
                        help='Specific model to analyze (for detailed format)')
    parser.add_argument('--min-impact', type=int, default=0,
                        help='Minimum impact score to include (0-10, default: 0)')

    args = parser.parse_args(argv)

    def progress(message: str) -> None:
        print(message, file=sys.stderr)

    progress("🔍 Discovering datamodels...")
    discovery = DatamodelDiscovery(args.root)
    datamodels = discovery.discover_datamodels()

    if not datamodels:
        progress("❌ No datamodels found in the codebase.")
        if args.format != 'json':
            return
        # JSON consumers still receive a well-formed (empty) report below.
    else:
        progress(f"✅ Found {len(datamodels)} datamodels")

    progress("📊 Analyzing usage patterns...")
    analyzer = UsageAnalyzer(args.root, datamodels)
    patterns = analyzer.analyze_usage_patterns()
    progress(f"✅ Analyzed {len(patterns)} usage patterns")

    progress("🎯 Identifying optimization opportunities...")
    optimizer = OptimizationAnalyzer(datamodels, patterns)
    opportunities = optimizer.analyze_opportunities()

    # Filter by impact score
    opportunities = [op for op in opportunities if op.impact_score >= args.min_impact]
    progress(f"✅ Found {len(opportunities)} optimization opportunities")

    # Generate report
    reporter = OptimizationReporter(datamodels, patterns, opportunities)

    if args.format == 'json':
        print(reporter.generate_json_report())
    elif args.format == 'detailed' and args.model:
        print(reporter.generate_detailed_report(args.model))
    else:
        if args.format == 'detailed':
            progress("ℹ️ --format detailed requires --model; showing the summary instead.")
        print(reporter.generate_summary_report())


if __name__ == '__main__':
    main()