""" SQLite repository implementation with transaction support. Provides efficient database operations with connection pooling, transaction management, and proper error handling. """ import sqlite3 import json import uuid from infrastructure.logging import get_logger from typing import List, Optional, Dict, Any from datetime import datetime, timezone from pathlib import Path from contextlib import asynccontextmanager from infrastructure.repositories.interfaces import DocumentRepository, CacheRepository from infrastructure.connection_manager import ConnectionManager from infrastructure.exceptions import ( ErrorContext, OperationType, DatabaseError, ConnectionError, ResourceNotFoundError, DuplicateResourceError, ValidationError, TransactionError, QueryError ) logger = get_logger(__name__) class SqliteDocumentRepository(DocumentRepository): """ SQLite implementation of DocumentRepository with transaction support. Provides efficient document storage and retrieval with proper transaction handling and optimized database operations. """ def __init__(self, connection_manager: ConnectionManager): self.connection_manager = connection_manager self._initialize_schema() def _initialize_schema(self): """Initialize database schema for documents.""" try: conn = self.connection_manager.get_database_connection() # Create documents table conn.execute(""" CREATE TABLE IF NOT EXISTS documents ( id TEXT PRIMARY KEY, filename TEXT NOT NULL, content TEXT NOT NULL, ast_json TEXT NOT NULL, content_hash TEXT NOT NULL, file_size INTEGER NOT NULL, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, UNIQUE(filename, content_hash) ) """) # Create cache table conn.execute(""" CREATE TABLE IF NOT EXISTS ast_cache ( id TEXT PRIMARY KEY, document_id TEXT NOT NULL, cache_path TEXT NOT NULL, cache_size INTEGER NOT NULL, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, accessed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, FOREIGN KEY (document_id) REFERENCES documents (id) ON DELETE CASCADE ) """) # Create indexes for performance conn.execute("CREATE INDEX IF NOT EXISTS idx_documents_filename ON documents(filename)") conn.execute("CREATE INDEX IF NOT EXISTS idx_documents_created_at ON documents(created_at)") conn.execute("CREATE INDEX IF NOT EXISTS idx_cache_document_id ON ast_cache(document_id)") conn.execute("CREATE INDEX IF NOT EXISTS idx_cache_accessed_at ON ast_cache(accessed_at)") conn.commit() logger.info("Database schema initialized successfully") except Exception as e: logger.error(f"Failed to initialize database schema: {e}") raise ConnectionError("markitect.db", e) async def store_document( self, filename: str, content: str, ast: Dict[str, Any], context: Optional[ErrorContext] = None ) -> str: """Store a document with its AST representation.""" if context is None: context = ErrorContext( operation_id=f"store_document_{filename}", operation_type=OperationType.WRITE, resource_type="Document", request_data={ "filename": filename, "content_length": len(content), "ast_keys": list(ast.keys()) if ast else [] } ) # Validate input if not filename or not filename.strip(): raise ValidationError("filename", filename, "Filename cannot be empty", context) if not content: raise ValidationError("content", content, "Content cannot be empty", context) if not ast: raise ValidationError("ast", ast, "AST cannot be empty", context) try: async with self.connection_manager.transaction() as conn: # Generate unique document ID document_id = str(uuid.uuid4()) # Calculate content hash for deduplication import hashlib content_hash = hashlib.sha256(content.encode()).hexdigest() # Check for duplicate content cursor = conn.execute( "SELECT id FROM documents WHERE filename = ? AND content_hash = ?", (filename, content_hash) ) existing = cursor.fetchone() if existing: raise DuplicateResourceError("Document", filename, context) # Store document ast_json = json.dumps(ast) file_size = len(content) now = datetime.now(timezone.utc).isoformat() conn.execute(""" INSERT INTO documents (id, filename, content, ast_json, content_hash, file_size, created_at, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?) """, (document_id, filename, content, ast_json, content_hash, file_size, now, now)) logger.info(f"Stored document {filename} with ID {document_id}") return document_id except sqlite3.IntegrityError as e: if "UNIQUE constraint failed" in str(e): raise DuplicateResourceError("Document", filename, context) else: raise DatabaseError(f"Integrity error storing document {filename}", e, context) except Exception as e: logger.error(f"Error storing document {filename}: {e}") raise TransactionError(f"store document {filename}", e, context) async def get_document( self, document_id: str, context: Optional[ErrorContext] = None ) -> Dict[str, Any]: """Retrieve a document by its ID.""" if context is None: context = ErrorContext( operation_id=f"get_document_{document_id}", operation_type=OperationType.READ, resource_type="Document", resource_id=document_id ) try: conn = self.connection_manager.get_database_connection() cursor = conn.execute(""" SELECT id, filename, content, ast_json, content_hash, file_size, created_at, updated_at FROM documents WHERE id = ? """, (document_id,)) row = cursor.fetchone() if not row: raise ResourceNotFoundError("Document", document_id, context) # Parse the row data return { "id": row[0], "filename": row[1], "content": row[2], "ast": json.loads(row[3]), "content_hash": row[4], "file_size": row[5], "created_at": row[6], "updated_at": row[7] } except ResourceNotFoundError: # Re-raise ResourceNotFoundError as-is raise except json.JSONDecodeError as e: logger.error(f"Failed to parse AST JSON for document {document_id}: {e}") raise QueryError( f"SELECT * FROM documents WHERE id = '{document_id}'", {"document_id": document_id}, e, context ) except Exception as e: logger.error(f"Error retrieving document {document_id}: {e}") raise QueryError( f"SELECT * FROM documents WHERE id = '{document_id}'", {"document_id": document_id}, e, context ) async def get_documents( self, filename_pattern: Optional[str] = None, limit: int = 100, offset: int = 0, context: Optional[ErrorContext] = None ) -> List[Dict[str, Any]]: """Retrieve multiple documents with filtering and pagination.""" if context is None: context = ErrorContext( operation_id=f"get_documents_{filename_pattern or 'all'}", operation_type=OperationType.READ, resource_type="Document", metadata={ "filename_pattern": filename_pattern, "limit": limit, "offset": offset } ) try: conn = self.connection_manager.get_database_connection() # Build query based on filter if filename_pattern: query = """ SELECT id, filename, content, ast_json, content_hash, file_size, created_at, updated_at FROM documents WHERE filename LIKE ? ORDER BY created_at DESC LIMIT ? OFFSET ? """ params = (f"%{filename_pattern}%", limit, offset) else: query = """ SELECT id, filename, content, ast_json, content_hash, file_size, created_at, updated_at FROM documents ORDER BY created_at DESC LIMIT ? OFFSET ? """ params = (limit, offset) cursor = conn.execute(query, params) rows = cursor.fetchall() documents = [] for row in rows: try: document = { "id": row[0], "filename": row[1], "content": row[2], "ast": json.loads(row[3]), "content_hash": row[4], "file_size": row[5], "created_at": row[6], "updated_at": row[7] } documents.append(document) except json.JSONDecodeError as e: logger.warning(f"Skipping document {row[0]} due to invalid AST JSON: {e}") continue return documents except Exception as e: logger.error(f"Error retrieving documents: {e}") raise QueryError("SELECT documents with pagination", {"limit": limit, "offset": offset}, e, context) async def update_document( self, document_id: str, content: Optional[str] = None, ast: Optional[Dict[str, Any]] = None, context: Optional[ErrorContext] = None ) -> Dict[str, Any]: """Update an existing document.""" if context is None: context = ErrorContext( operation_id=f"update_document_{document_id}", operation_type=OperationType.UPDATE, resource_type="Document", resource_id=document_id, request_data={ "content_length": len(content) if content else None, "ast_keys": list(ast.keys()) if ast else None } ) try: async with self.connection_manager.transaction() as conn: # Check if document exists cursor = conn.execute("SELECT id FROM documents WHERE id = ?", (document_id,)) if not cursor.fetchone(): raise ResourceNotFoundError("Document", document_id, context) # Build update query updates = [] params = [] if content is not None: # Recalculate content hash import hashlib content_hash = hashlib.sha256(content.encode()).hexdigest() file_size = len(content) updates.extend(["content = ?", "content_hash = ?", "file_size = ?"]) params.extend([content, content_hash, file_size]) if ast is not None: ast_json = json.dumps(ast) updates.append("ast_json = ?") params.append(ast_json) if not updates: # No changes to make return await self.get_document(document_id, context) # Add updated timestamp updates.append("updated_at = ?") params.append(datetime.now(timezone.utc).isoformat()) # Add document_id for WHERE clause params.append(document_id) query = f"UPDATE documents SET {', '.join(updates)} WHERE id = ?" conn.execute(query, params) logger.info(f"Updated document {document_id}") # Return updated document return await self.get_document(document_id, context) except Exception as e: logger.error(f"Error updating document {document_id}: {e}") raise TransactionError(f"update document {document_id}", e, context) async def delete_document( self, document_id: str, context: Optional[ErrorContext] = None ) -> bool: """Delete a document.""" if context is None: context = ErrorContext( operation_id=f"delete_document_{document_id}", operation_type=OperationType.DELETE, resource_type="Document", resource_id=document_id ) try: async with self.connection_manager.transaction() as conn: # Check if document exists cursor = conn.execute("SELECT id FROM documents WHERE id = ?", (document_id,)) if not cursor.fetchone(): raise ResourceNotFoundError("Document", document_id, context) # Delete associated cache entries first (due to foreign key) conn.execute("DELETE FROM ast_cache WHERE document_id = ?", (document_id,)) # Delete document cursor = conn.execute("DELETE FROM documents WHERE id = ?", (document_id,)) deleted = cursor.rowcount > 0 if deleted: logger.info(f"Deleted document {document_id}") return deleted except Exception as e: logger.error(f"Error deleting document {document_id}: {e}") raise TransactionError(f"delete document {document_id}", e, context) async def get_cache_path( self, document_id: str, context: Optional[ErrorContext] = None ) -> Path: """Get the cache file path for a document.""" if context is None: context = ErrorContext( operation_id=f"get_cache_path_{document_id}", operation_type=OperationType.READ, resource_type="CachePath", resource_id=document_id ) try: conn = self.connection_manager.get_database_connection() cursor = conn.execute(""" SELECT cache_path FROM ast_cache WHERE document_id = ? """, (document_id,)) row = cursor.fetchone() if not row: raise ResourceNotFoundError("Cache", document_id, context) return Path(row[0]) except Exception as e: logger.error(f"Error getting cache path for document {document_id}: {e}") raise QueryError( f"SELECT cache_path FROM ast_cache WHERE document_id = '{document_id}'", {"document_id": document_id}, e, context ) class SqliteCacheRepository(CacheRepository): """ SQLite implementation of CacheRepository. Provides efficient caching operations using SQLite as storage backend. """ def __init__(self, connection_manager: ConnectionManager): self.connection_manager = connection_manager self._initialize_cache_schema() def _initialize_cache_schema(self): """Initialize database schema for cache operations.""" try: conn = self.connection_manager.get_database_connection() # Create cache entries table conn.execute(""" CREATE TABLE IF NOT EXISTS cache_entries ( key TEXT PRIMARY KEY, value_json TEXT NOT NULL, ttl_expires_at TIMESTAMP, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, accessed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ) """) # Create index for TTL cleanup conn.execute("CREATE INDEX IF NOT EXISTS idx_cache_ttl ON cache_entries(ttl_expires_at)") conn.execute("CREATE INDEX IF NOT EXISTS idx_cache_accessed ON cache_entries(accessed_at)") conn.commit() logger.info("Cache schema initialized successfully") except Exception as e: logger.error(f"Failed to initialize cache schema: {e}") raise ConnectionError("markitect.db", e) async def get( self, key: str, context: Optional[ErrorContext] = None ) -> Optional[Any]: """Retrieve a value from cache.""" if context is None: context = ErrorContext( operation_id=f"cache_get_{key}", operation_type=OperationType.READ, resource_type="Cache", resource_id=key ) try: conn = self.connection_manager.get_database_connection() # Clean up expired entries first await self._cleanup_expired_entries(conn) cursor = conn.execute(""" SELECT value_json FROM cache_entries WHERE key = ? AND (ttl_expires_at IS NULL OR ttl_expires_at > CURRENT_TIMESTAMP) """, (key,)) row = cursor.fetchone() if row: # Update access time conn.execute(""" UPDATE cache_entries SET accessed_at = CURRENT_TIMESTAMP WHERE key = ? """, (key,)) conn.commit() return json.loads(row[0]) return None except json.JSONDecodeError as e: logger.error(f"Failed to parse cached value for key {key}: {e}") # Remove corrupted cache entry conn.execute("DELETE FROM cache_entries WHERE key = ?", (key,)) conn.commit() return None except Exception as e: logger.error(f"Error getting cache value for key {key}: {e}") return None async def set( self, key: str, value: Any, ttl: Optional[int] = None, context: Optional[ErrorContext] = None ) -> bool: """Store a value in cache.""" if context is None: context = ErrorContext( operation_id=f"cache_set_{key}", operation_type=OperationType.WRITE, resource_type="Cache", resource_id=key, request_data={"ttl": ttl} ) try: conn = self.connection_manager.get_database_connection() # Calculate expiration time expires_at = None if ttl: from datetime import timedelta expires_at = (datetime.now(timezone.utc) + timedelta(seconds=ttl)).isoformat() # Serialize value value_json = json.dumps(value) # Upsert cache entry conn.execute(""" INSERT OR REPLACE INTO cache_entries (key, value_json, ttl_expires_at, created_at, accessed_at) VALUES (?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP) """, (key, value_json, expires_at)) conn.commit() return True except Exception as e: logger.error(f"Error setting cache value for key {key}: {e}") return False async def delete( self, key: str, context: Optional[ErrorContext] = None ) -> bool: """Delete a value from cache.""" if context is None: context = ErrorContext( operation_id=f"cache_delete_{key}", operation_type=OperationType.DELETE, resource_type="Cache", resource_id=key ) try: conn = self.connection_manager.get_database_connection() cursor = conn.execute("DELETE FROM cache_entries WHERE key = ?", (key,)) conn.commit() return cursor.rowcount > 0 except Exception as e: logger.error(f"Error deleting cache value for key {key}: {e}") return False async def invalidate_pattern( self, pattern: str, context: Optional[ErrorContext] = None ) -> int: """Invalidate cache entries matching a pattern.""" if context is None: context = ErrorContext( operation_id=f"cache_invalidate_{pattern}", operation_type=OperationType.DELETE, resource_type="Cache", metadata={"pattern": pattern} ) try: conn = self.connection_manager.get_database_connection() # Convert pattern to SQL LIKE pattern sql_pattern = pattern.replace("*", "%") cursor = conn.execute("DELETE FROM cache_entries WHERE key LIKE ?", (sql_pattern,)) conn.commit() deleted_count = cursor.rowcount logger.info(f"Invalidated {deleted_count} cache entries matching pattern '{pattern}'") return deleted_count except Exception as e: logger.error(f"Error invalidating cache pattern {pattern}: {e}") raise QueryError(f"DELETE FROM cache_entries WHERE key LIKE '{pattern}'", {"pattern": pattern}, e, context) async def store_ast_cache( self, document_id: str, ast: Dict[str, Any], context: Optional[ErrorContext] = None ) -> bool: """Store AST cache for a document.""" if context is None: context = ErrorContext( operation_id=f"store_ast_cache_{document_id}", operation_type=OperationType.WRITE, resource_type="ASTCache", resource_id=document_id ) try: conn = self.connection_manager.get_database_connection() # Generate cache file path cache_id = str(uuid.uuid4()) cache_path = f".cache/ast/{document_id}/{cache_id}.json" # Create cache directory cache_dir = Path(cache_path).parent cache_dir.mkdir(parents=True, exist_ok=True) # Write AST to cache file with open(cache_path, 'w') as f: json.dump(ast, f, indent=2) cache_size = Path(cache_path).stat().st_size # Store cache metadata in database conn.execute(""" INSERT OR REPLACE INTO ast_cache (id, document_id, cache_path, cache_size, created_at, accessed_at) VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP, CURRENT_TIMESTAMP) """, (cache_id, document_id, cache_path, cache_size)) conn.commit() logger.info(f"Stored AST cache for document {document_id} at {cache_path}") return True except Exception as e: logger.error(f"Error storing AST cache for document {document_id}: {e}") return False async def _cleanup_expired_entries(self, conn: sqlite3.Connection): """Clean up expired cache entries.""" try: cursor = conn.execute("DELETE FROM cache_entries WHERE ttl_expires_at < CURRENT_TIMESTAMP") deleted_count = cursor.rowcount if deleted_count > 0: logger.debug(f"Cleaned up {deleted_count} expired cache entries") except Exception as e: logger.warning(f"Error cleaning up expired cache entries: {e}")