"""
Filesystem repository implementation with atomic operations.

Provides reliable file operations with proper error handling,
atomic writes, and workspace management.
"""

import errno
import fnmatch
import json
import os
import re
import shutil
import tempfile
import uuid
from infrastructure.logging import get_logger
from typing import List, Optional
from pathlib import Path
from datetime import datetime, timedelta, timezone

from infrastructure.repositories.interfaces import WorkspaceRepository
from infrastructure.exceptions import (
    ErrorContext, OperationType, ResourceNotFoundError,
    DuplicateResourceError, ValidationError, DatabaseError
)

logger = get_logger(__name__)


class FilesystemWorkspaceRepository(WorkspaceRepository):
    """
    Filesystem implementation of WorkspaceRepository.

    Each workspace is a directory directly below ``self.base_path`` that
    contains ``files``, ``temp`` and ``logs`` subdirectories plus a
    ``.workspace_meta.json`` metadata file. User files live under ``files``
    and are written atomically (temporary file + rename).
    """

    # File extensions accepted by write_file (paths without a suffix are
    # accepted as well).
    ALLOWED_EXTENSIONS = frozenset(
        {".md", ".txt", ".py", ".js", ".json", ".yaml", ".yml", ".rst", ".csv"}
    )
    # Upper bound on the UTF-8 encoded size of content passed to write_file.
    MAX_FILE_SIZE_BYTES = 100 * 1024 * 1024  # 100MB
    # Name of the per-workspace metadata file.
    _METADATA_FILENAME = ".workspace_meta.json"
    # Workspace IDs: alphanumeric, dash, underscore. Used with fullmatch()
    # so a trailing newline cannot slip through as it would with '$'.
    _WORKSPACE_ID_PATTERN = re.compile(r'[a-zA-Z0-9_-]+')
    # Characters never allowed anywhere in a workspace-relative file path.
    _UNSAFE_PATH_CHARS = frozenset({"<", ">", ":", "\"", "|", "?", "*", "\0"})

    def __init__(self, base_workspace_dir: str = ".markitect_workspace"):
        """
        Resolve the base directory and create it if it does not exist.

        Args:
            base_workspace_dir: Directory that holds all workspaces.
        """
        self.base_path = Path(base_workspace_dir).resolve()
        self.base_path.mkdir(parents=True, exist_ok=True)
        logger.info(f"Initialized workspace repository at {self.base_path}")

    async def create_workspace(
        self,
        workspace_id: str,
        base_path: Path,
        context: Optional[ErrorContext] = None
    ) -> Path:
        """
        Create a new workspace directory.

        Args:
            workspace_id: Identifier of the workspace; must pass
                ``_is_valid_workspace_id``.
            base_path: Accepted for interface compatibility but not used;
                the workspace is always created below ``self.base_path``.
            context: Error context attached to raised exceptions; a default
                one is built when omitted.

        Returns:
            Path of the newly created workspace directory.

        Raises:
            ValidationError: If ``workspace_id`` is malformed.
            DuplicateResourceError: If the workspace path already exists.
            Exception: The domain exception produced by
                ``_map_os_error_to_exception`` for any other OS failure.
        """
        if context is None:
            context = ErrorContext(
                operation_id=f"create_workspace_{workspace_id}",
                operation_type=OperationType.WRITE,
                resource_type="Workspace",
                resource_id=workspace_id
            )

        # Validate workspace ID
        if not self._is_valid_workspace_id(workspace_id):
            raise ValidationError(
                "workspace_id",
                workspace_id,
                "Workspace ID must be alphanumeric with optional dashes and underscores",
                context
            )

        workspace_path = self.base_path / workspace_id

        # Let mkdir itself detect an existing path instead of checking
        # exists() first: this closes the check-then-create race, and keeps
        # the cleanup below from ever deleting a workspace that another
        # caller created.
        try:
            workspace_path.mkdir(parents=True, exist_ok=False, mode=0o755)
        except FileExistsError as e:
            raise DuplicateResourceError("Workspace", workspace_id, context) from e
        except OSError as e:
            logger.error(f"Failed to create workspace {workspace_id}: {e}")
            raise self._map_os_error_to_exception(
                e, f"create workspace {workspace_id}", context
            ) from e

        try:
            # Create standard subdirectories
            for subdir in ("files", "temp", "logs"):
                (workspace_path / subdir).mkdir(exist_ok=True)

            # Create workspace metadata file
            metadata = {
                "id": workspace_id,
                "created_at": datetime.now(timezone.utc).isoformat(),
                "version": "1.0",
                "type": "markitect_workspace"
            }
            await self._write_json_file(
                workspace_path / self._METADATA_FILENAME,
                metadata,
                context
            )
        except OSError as e:
            logger.error(f"Failed to create workspace {workspace_id}: {e}")
            # Cleanup partial creation (the directory was created by this call)
            shutil.rmtree(workspace_path, ignore_errors=True)
            raise self._map_os_error_to_exception(
                e, f"create workspace {workspace_id}", context
            ) from e

        logger.info(f"Created workspace: {workspace_id}")
        return workspace_path

    async def get_workspace_path(
        self,
        workspace_id: str,
        context: Optional[ErrorContext] = None
    ) -> Path:
        """
        Get the path to an existing workspace.

        Args:
            workspace_id: Identifier of the workspace.
            context: Error context attached to raised exceptions; a default
                one is built when omitted.

        Returns:
            Path of the workspace directory.

        Raises:
            ResourceNotFoundError: If the ID is malformed or no directory
                exists for it.
        """
        if context is None:
            context = ErrorContext(
                operation_id=f"get_workspace_path_{workspace_id}",
                operation_type=OperationType.READ,
                resource_type="Workspace",
                resource_id=workspace_id
            )

        # An ID such as ".." or "a/../.." would resolve outside base_path
        # (and delete_workspace would then remove that directory). A
        # malformed ID cannot name a workspace created by this repository,
        # so it is reported as not found.
        if not self._is_valid_workspace_id(workspace_id):
            raise ResourceNotFoundError("Workspace", workspace_id, context)

        workspace_path = self.base_path / workspace_id

        if not workspace_path.is_dir():
            raise ResourceNotFoundError("Workspace", workspace_id, context)

        return workspace_path

    async def list_workspaces(
        self,
        context: Optional[ErrorContext] = None
    ) -> List[str]:
        """
        List all available workspaces.

        A directory counts as a workspace when its name is a valid workspace
        ID and it contains the metadata file.

        Returns:
            Sorted list of workspace IDs (empty if the base directory is
            missing).

        Raises:
            Exception: The domain exception produced by
                ``_map_os_error_to_exception`` when the directory cannot be
                read.
        """
        if context is None:
            context = ErrorContext(
                operation_id="list_workspaces",
                operation_type=OperationType.READ,
                resource_type="Workspace"
            )

        try:
            if not self.base_path.exists():
                return []

            return sorted(
                item.name
                for item in self.base_path.iterdir()
                if item.is_dir()
                and self._is_valid_workspace_id(item.name)
                # Verify it's a valid workspace by checking for metadata
                and (item / self._METADATA_FILENAME).exists()
            )

        except OSError as e:
            logger.error(f"Failed to list workspaces: {e}")
            raise self._map_os_error_to_exception(e, "list workspaces", context) from e

    async def write_file(
        self,
        workspace_id: str,
        file_path: str,
        content: str,
        context: Optional[ErrorContext] = None
    ) -> Path:
        """
        Write content to a file in the workspace using atomic operations.

        Args:
            workspace_id: Identifier of an existing workspace.
            file_path: Path relative to the workspace's ``files`` directory.
            content: Text to write; stored as UTF-8.
            context: Error context attached to raised exceptions; a default
                one is built when omitted.

        Returns:
            Path of the written file.

        Raises:
            ResourceNotFoundError: If the workspace does not exist.
            ValidationError: If the path is unsafe, the extension is not in
                ``ALLOWED_EXTENSIONS``, or the encoded content exceeds
                ``MAX_FILE_SIZE_BYTES``.
            Exception: The domain exception produced by
                ``_map_os_error_to_exception`` for OS failures.
        """
        if context is None:
            context = ErrorContext(
                operation_id=f"write_file_{workspace_id}_{file_path}",
                operation_type=OperationType.WRITE,
                resource_type="WorkspaceFile",
                resource_id=f"{workspace_id}/{file_path}",
                request_data={"content_length": len(content)}
            )

        # Validate inputs
        workspace_path = await self.get_workspace_path(workspace_id, context)

        if not self._is_safe_file_path(file_path):
            raise ValidationError(
                "file_path",
                file_path,
                "File path contains invalid characters or attempts directory traversal",
                context
            )

        # Validate file extension (paths without a suffix are allowed)
        file_ext = Path(file_path).suffix.lower()
        if file_ext and file_ext not in self.ALLOWED_EXTENSIONS:
            raise ValidationError(
                "file_path",
                file_path,
                f"File extension {file_ext} is not allowed",
                context
            )

        # Validate content size, measured in encoded bytes
        max_size = self.MAX_FILE_SIZE_BYTES
        if len(content.encode('utf-8')) > max_size:
            raise ValidationError(
                "content",
                f"{len(content)} characters",
                f"File content exceeds maximum size of {max_size} bytes",
                context
            )

        target_path = workspace_path / "files" / file_path

        try:
            # Ensure parent directory exists
            target_path.parent.mkdir(parents=True, exist_ok=True)

            # Atomic write using temporary file
            await self._atomic_write_file(target_path, content, context)

            logger.info(f"Wrote file {file_path} in workspace {workspace_id}")
            return target_path

        except OSError as e:
            logger.error(f"Failed to write file {file_path} in workspace {workspace_id}: {e}")
            raise self._map_os_error_to_exception(e, f"write file {file_path}", context) from e

    async def read_file(
        self,
        workspace_id: str,
        file_path: str,
        context: Optional[ErrorContext] = None
    ) -> str:
        """
        Read content from a file in the workspace.

        Args:
            workspace_id: Identifier of an existing workspace.
            file_path: Path relative to the workspace's ``files`` directory.
            context: Error context attached to raised exceptions; a default
                one is built when omitted.

        Returns:
            The file's content decoded as UTF-8.

        Raises:
            ResourceNotFoundError: If the workspace or the file is missing.
            ValidationError: If the path is unsafe, is not a regular file,
                or the file is not valid UTF-8.
            Exception: The domain exception produced by
                ``_map_os_error_to_exception`` for OS failures.
        """
        if context is None:
            context = ErrorContext(
                operation_id=f"read_file_{workspace_id}_{file_path}",
                operation_type=OperationType.READ,
                resource_type="WorkspaceFile",
                resource_id=f"{workspace_id}/{file_path}"
            )

        # Validate inputs
        workspace_path = await self.get_workspace_path(workspace_id, context)

        if not self._is_safe_file_path(file_path):
            raise ValidationError(
                "file_path",
                file_path,
                "File path contains invalid characters or attempts directory traversal",
                context
            )

        target_path = workspace_path / "files" / file_path

        if not target_path.exists():
            raise ResourceNotFoundError("File", f"{workspace_id}/{file_path}", context)

        if not target_path.is_file():
            raise ValidationError(
                "file_path",
                file_path,
                "Path exists but is not a regular file",
                context
            )

        try:
            content = target_path.read_text(encoding='utf-8')
            logger.debug(f"Read file {file_path} from workspace {workspace_id}")
            return content

        except UnicodeDecodeError as e:
            logger.error(f"Failed to decode file {file_path} as UTF-8: {e}")
            raise ValidationError(
                "file_content",
                "binary data",
                "File does not contain valid UTF-8 text",
                context
            ) from e
        except OSError as e:
            logger.error(f"Failed to read file {file_path} from workspace {workspace_id}: {e}")
            raise self._map_os_error_to_exception(e, f"read file {file_path}", context) from e

    async def delete_workspace(
        self,
        workspace_id: str,
        context: Optional[ErrorContext] = None
    ) -> bool:
        """
        Delete a workspace and all its contents.

        Args:
            workspace_id: Identifier of an existing workspace.
            context: Error context attached to raised exceptions; a default
                one is built when omitted.

        Returns:
            True when the workspace directory was removed.

        Raises:
            ResourceNotFoundError: If the workspace does not exist.
            Exception: The domain exception produced by
                ``_map_os_error_to_exception`` for OS failures.
        """
        if context is None:
            context = ErrorContext(
                operation_id=f"delete_workspace_{workspace_id}",
                operation_type=OperationType.DELETE,
                resource_type="Workspace",
                resource_id=workspace_id
            )

        workspace_path = await self.get_workspace_path(workspace_id, context)

        try:
            # Use shutil.rmtree for recursive deletion
            shutil.rmtree(workspace_path)
            logger.info(f"Deleted workspace: {workspace_id}")
            return True

        except OSError as e:
            logger.error(f"Failed to delete workspace {workspace_id}: {e}")
            raise self._map_os_error_to_exception(
                e, f"delete workspace {workspace_id}", context
            ) from e

    async def list_files(
        self,
        workspace_id: str,
        pattern: Optional[str] = None,
        context: Optional[ErrorContext] = None
    ) -> List[str]:
        """
        List files in a workspace.

        Args:
            workspace_id: Identifier of an existing workspace.
            pattern: Optional case-insensitive ``fnmatch`` pattern applied to
                each path relative to the ``files`` directory.
            context: Error context attached to raised exceptions; a default
                one is built when omitted.

        Returns:
            Sorted list of relative file paths (empty if the ``files``
            directory is missing).

        Raises:
            ResourceNotFoundError: If the workspace does not exist.
            Exception: The domain exception produced by
                ``_map_os_error_to_exception`` for OS failures.
        """
        if context is None:
            context = ErrorContext(
                operation_id=f"list_files_{workspace_id}",
                operation_type=OperationType.READ,
                resource_type="WorkspaceFile",
                metadata={"workspace_id": workspace_id, "pattern": pattern}
            )

        workspace_path = await self.get_workspace_path(workspace_id, context)
        files_dir = workspace_path / "files"

        if not files_dir.exists():
            return []

        try:
            files = []

            # Walk through all files in the workspace
            for item in files_dir.rglob("*"):
                if not item.is_file():
                    continue
                # Get relative path from files directory
                relative_path = str(item.relative_to(files_dir))
                # Apply pattern filter if provided
                if pattern is None or self._matches_pattern(relative_path, pattern):
                    files.append(relative_path)

            return sorted(files)

        except OSError as e:
            logger.error(f"Failed to list files in workspace {workspace_id}: {e}")
            raise self._map_os_error_to_exception(
                e, f"list files in workspace {workspace_id}", context
            ) from e

    async def cleanup_old_workspaces(self, days_threshold: int = 30) -> int:
        """
        Clean up workspaces older than specified days.

        Age is taken from the ``created_at`` field of each workspace's
        metadata file. Directories without metadata, without that field, or
        whose name is not a valid workspace ID are left untouched. Failures
        on one workspace are logged and do not stop the sweep.

        Args:
            days_threshold: Workspaces created more than this many days ago
                are deleted.

        Returns:
            Number of workspaces actually deleted, including when the sweep
            is aborted part-way by an unexpected error.
        """
        logger.info(f"Starting cleanup of workspaces older than {days_threshold} days")

        deleted_count = 0
        try:
            cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_threshold)

            if not self.base_path.exists():
                return 0

            # Snapshot the listing: entries are removed while we iterate.
            for workspace_dir in list(self.base_path.iterdir()):
                if not workspace_dir.is_dir():
                    continue
                if not self._is_valid_workspace_id(workspace_dir.name):
                    continue

                try:
                    # Check workspace metadata for creation date
                    metadata_file = workspace_dir / self._METADATA_FILENAME
                    if not metadata_file.exists():
                        continue

                    metadata = await self._read_json_file(metadata_file)
                    created_at_str = metadata.get("created_at")

                    if not created_at_str:
                        continue

                    created_at = datetime.fromisoformat(created_at_str.replace("Z", "+00:00"))
                    # A timestamp without an offset cannot be compared with
                    # the timezone-aware cutoff (TypeError); treat it as UTC.
                    if created_at.tzinfo is None:
                        created_at = created_at.replace(tzinfo=timezone.utc)

                    if created_at < cutoff_date:
                        await self.delete_workspace(workspace_dir.name)
                        deleted_count += 1
                        logger.info(f"Cleaned up old workspace: {workspace_dir.name}")

                except Exception as e:
                    # Best effort: one broken workspace must not stop the sweep.
                    logger.warning(f"Failed to process workspace {workspace_dir.name} during cleanup: {e}")
                    continue

            logger.info(f"Cleanup completed: deleted {deleted_count} old workspaces")
            return deleted_count

        except Exception as e:
            logger.error(f"Error during workspace cleanup: {e}")
            # Report what was really deleted before the failure.
            return deleted_count

    # Helper methods

    def _is_valid_workspace_id(self, workspace_id: str) -> bool:
        """
        Validate workspace ID format.

        A valid ID is 1-100 characters long and consists only of ASCII
        letters, digits, dashes and underscores.
        """
        if not workspace_id or len(workspace_id) > 100:
            return False

        # fullmatch, not match with '$': '$' also matches just before a
        # trailing newline, which would let "abc\n" through.
        return self._WORKSPACE_ID_PATTERN.fullmatch(workspace_id) is not None

    def _is_safe_file_path(self, file_path: str) -> bool:
        """
        Check if file path is safe (no directory traversal).

        Rejects empty paths, paths containing any character in
        ``_UNSAFE_PATH_CHARS``, absolute paths, paths that still contain a
        ``..`` component after normalisation, and paths that normalise to
        the current directory.
        """
        if not file_path:
            return False

        # Check for unsafe characters (done first so a NUL byte never
        # reaches the path functions below)
        if any(char in file_path for char in self._UNSAFE_PATH_CHARS):
            return False

        # Normalize path
        normalized = os.path.normpath(file_path)

        # Check for absolute paths
        if os.path.isabs(normalized):
            return False

        # Check for directory traversal attempts. Compare whole components,
        # splitting on both separator styles, so names that merely start
        # with two dots (e.g. "..notes.md") are not rejected.
        if ".." in re.split(r"[\\/]", normalized):
            return False

        # "." (or e.g. "a/..") would address the files directory itself.
        if normalized == os.curdir:
            return False

        return True

    def _matches_pattern(self, file_path: str, pattern: str) -> bool:
        """Check if file path matches the given pattern, ignoring case."""
        return fnmatch.fnmatch(file_path.lower(), pattern.lower())

    async def _atomic_write_file(
        self,
        target_path: Path,
        content: str,
        context: Optional[ErrorContext] = None
    ):
        """
        Write file atomically using temporary file.

        The content is written to a temporary file in the same directory as
        the target, flushed and fsynced, and then moved over the target with
        a rename. The temporary file is removed if anything fails.

        Args:
            target_path: Final location of the file; its parent directory
                must already exist.
            content: Text to write; stored as UTF-8.
            context: Currently unused; accepted so callers can pass their
                error context through.
        """
        # Create the temporary file directly next to the target so the
        # rename stays on one filesystem. No shared scratch directory is
        # used, so concurrent writers cannot remove it from under each other.
        temp_fd, temp_path = tempfile.mkstemp(
            dir=target_path.parent,
            prefix=f".tmp_{target_path.name}_",
            suffix=".tmp"
        )

        try:
            # Write content to temporary file
            with os.fdopen(temp_fd, 'w', encoding='utf-8') as f:
                f.write(content)
                f.flush()
                os.fsync(f.fileno())  # Ensure data is written to disk

            # Atomic move to final location
            os.replace(temp_path, target_path)

        except BaseException:
            # Clean up temporary file on error
            try:
                os.unlink(temp_path)
            except OSError:
                pass
            raise

    async def _write_json_file(
        self,
        file_path: Path,
        data: dict,
        context: Optional[ErrorContext] = None
    ):
        """Write JSON data (2-space indented) to file atomically."""
        json_content = json.dumps(data, indent=2)
        await self._atomic_write_file(file_path, json_content, context)

    async def _read_json_file(self, file_path: Path) -> dict:
        """Read a UTF-8 file and return its parsed JSON content."""
        content = file_path.read_text(encoding='utf-8')
        return json.loads(content)

    def _map_os_error_to_exception(
        self,
        os_error: OSError,
        operation: str,
        context: ErrorContext
    ) -> Exception:
        """
        Map OS errors to appropriate domain exceptions.

        The exception is returned, not raised, so callers write
        ``raise self._map_os_error_to_exception(...)``.

        Args:
            os_error: The OS error that occurred.
            operation: Short description of the failed operation, used in
                the resulting exception.
            context: Error context attached to the resulting exception.

        Returns:
            ResourceNotFoundError for ENOENT, ValidationError for EACCES,
            DuplicateResourceError for EEXIST, and DatabaseError for ENOSPC
            and every other errno.
        """
        code = os_error.errno

        if code == errno.ENOENT:  # No such file or directory
            return ResourceNotFoundError("File", operation, context)
        if code == errno.EACCES:  # Permission denied
            return ValidationError("permissions", operation, "Permission denied", context)
        if code == errno.ENOSPC:  # No space left on device
            return DatabaseError(f"Insufficient disk space for {operation}", os_error, context)
        if code == errno.EEXIST:  # File exists
            return DuplicateResourceError("File", operation, context)
        return DatabaseError(f"Filesystem error during {operation}", os_error, context)