# markitect-main/infrastructure/repositories/filesystem_repository.py

"""
Filesystem repository implementation with atomic operations.

Provides reliable file operations with proper error handling,
atomic writes, and workspace management.
"""

import os
import shutil
import tempfile
import uuid
from infrastructure.logging import get_logger
from typing import List, Optional
from pathlib import Path
from datetime import datetime, timedelta, timezone

from infrastructure.repositories.interfaces import WorkspaceRepository
from infrastructure.exceptions import (
    ErrorContext, OperationType, ResourceNotFoundError,
    DuplicateResourceError, ValidationError
)

logger = get_logger(__name__)


class FilesystemWorkspaceRepository(WorkspaceRepository):
    """
    Filesystem implementation of WorkspaceRepository.

    Provides reliable workspace and file operations with atomic writes,
    proper validation, and comprehensive error handling.
    """

    def __init__(self, base_workspace_dir: str = ".markitect_workspace"):
        """
        Set up the repository rooted at ``base_workspace_dir``.

        The directory is resolved to an absolute path, stored as
        ``self.base_path`` and created (including missing parents) when it
        does not exist yet.
        """
        root = Path(base_workspace_dir).resolve()
        self.base_path = root
        root.mkdir(parents=True, exist_ok=True)
        logger.info(f"Initialized workspace repository at {self.base_path}")

    async def create_workspace(
        self,
        workspace_id: str,
        base_path: Path,
        context: Optional[ErrorContext] = None
    ) -> Path:
        """
        Create a new workspace directory with its standard layout.

        The workspace is created as ``self.base_path / workspace_id`` and
        populated with ``files``, ``temp`` and ``logs`` subdirectories plus a
        ``.workspace_meta.json`` file recording the ID, a UTC creation
        timestamp, a version and a type marker.

        Args:
            workspace_id: Identifier of the workspace; must satisfy
                ``_is_valid_workspace_id``.
            base_path: Not used by this implementation; the workspace is
                always placed under ``self.base_path``.
                NOTE(review): presumably kept to satisfy the
                WorkspaceRepository interface - confirm.
            context: Error context attached to raised exceptions; a default
                one is built when omitted.

        Returns:
            Path of the newly created workspace directory.

        Raises:
            ValidationError: If ``workspace_id`` is malformed.
            DuplicateResourceError: If the workspace already exists,
                including when another caller creates it concurrently.
            Exception: The domain exception produced by
                ``_map_os_error_to_exception`` for any other OS failure.
        """
        if context is None:
            context = ErrorContext(
                operation_id=f"create_workspace_{workspace_id}",
                operation_type=OperationType.WRITE,
                resource_type="Workspace",
                resource_id=workspace_id
            )

        # Validate workspace ID
        if not self._is_valid_workspace_id(workspace_id):
            raise ValidationError(
                "workspace_id",
                workspace_id,
                "Workspace ID must be alphanumeric with optional dashes and underscores",
                context
            )

        workspace_path = self.base_path / workspace_id

        # Check if workspace already exists
        if workspace_path.exists():
            raise DuplicateResourceError("Workspace", workspace_id, context)

        # Step 1: claim the directory. This is kept separate from the
        # population step so that a failure here never triggers cleanup of a
        # directory this call did not create.
        try:
            workspace_path.mkdir(parents=True, exist_ok=False, mode=0o755)
        except FileExistsError as e:
            # Another caller created the workspace between the exists() check
            # and mkdir(). The directory is theirs, so it must not be removed.
            raise DuplicateResourceError("Workspace", workspace_id, context) from e
        except OSError as e:
            logger.error(f"Failed to create workspace {workspace_id}: {e}")
            raise self._map_os_error_to_exception(e, f"create workspace {workspace_id}", context) from e

        # Step 2: populate the directory we now own.
        try:
            # Create standard subdirectories
            (workspace_path / "files").mkdir(exist_ok=True)
            (workspace_path / "temp").mkdir(exist_ok=True)
            (workspace_path / "logs").mkdir(exist_ok=True)

            # Create workspace metadata file
            metadata = {
                "id": workspace_id,
                "created_at": datetime.now(timezone.utc).isoformat(),
                "version": "1.0",
                "type": "markitect_workspace"
            }

            await self._write_json_file(
                workspace_path / ".workspace_meta.json",
                metadata,
                context
            )

            logger.info(f"Created workspace: {workspace_id}")
            return workspace_path

        except OSError as e:
            logger.error(f"Failed to create workspace {workspace_id}: {e}")
            # Roll back the partially built workspace; safe because the
            # directory was created by this call in step 1.
            shutil.rmtree(workspace_path, ignore_errors=True)

            raise self._map_os_error_to_exception(e, f"create workspace {workspace_id}", context) from e

    async def get_workspace_path(
        self,
        workspace_id: str,
        context: Optional[ErrorContext] = None
    ) -> Path:
        """Get the path to a workspace."""
        if context is None:
            context = ErrorContext(
                operation_id=f"get_workspace_path_{workspace_id}",
                operation_type=OperationType.READ,
                resource_type="Workspace",
                resource_id=workspace_id
            )

        workspace_path = self.base_path / workspace_id

        if not workspace_path.exists() or not workspace_path.is_dir():
            raise ResourceNotFoundError("Workspace", workspace_id, context)

        return workspace_path

    async def list_workspaces(
        self,
        context: Optional[ErrorContext] = None
    ) -> List[str]:
        """List all available workspaces."""
        if context is None:
            context = ErrorContext(
                operation_id="list_workspaces",
                operation_type=OperationType.READ,
                resource_type="Workspace"
            )

        try:
            workspaces = []

            if not self.base_path.exists():
                return workspaces

            for item in self.base_path.iterdir():
                if item.is_dir() and self._is_valid_workspace_id(item.name):
                    # Verify it's a valid workspace by checking for metadata
                    metadata_file = item / ".workspace_meta.json"
                    if metadata_file.exists():
                        workspaces.append(item.name)

            return sorted(workspaces)

        except OSError as e:
            logger.error(f"Failed to list workspaces: {e}")
            raise self._map_os_error_to_exception(e, "list workspaces", context)

    async def write_file(
        self,
        workspace_id: str,
        file_path: str,
        content: str,
        context: Optional[ErrorContext] = None
    ) -> Path:
        """
        Store ``content`` as ``file_path`` inside a workspace's ``files`` directory.

        Checks run in this order: the workspace must exist, the relative
        path must pass ``_is_safe_file_path``, a non-empty extension must be
        in the allow-list, and the UTF-8 encoded content must not exceed
        100MB. Missing parent directories are created and the write itself
        is delegated to ``_atomic_write_file``.

        Returns:
            Path of the written file.

        Raises:
            ValidationError: For an unsafe path, a disallowed extension or
                oversized content.
            Exception: Whatever ``get_workspace_path`` raises, or the domain
                exception mapped from an ``OSError`` during the write.
        """
        if context is None:
            context = ErrorContext(
                operation_id=f"write_file_{workspace_id}_{file_path}",
                operation_type=OperationType.WRITE,
                resource_type="WorkspaceFile",
                resource_id=f"{workspace_id}/{file_path}",
                request_data={"content_length": len(content)}
            )

        workspace_root = await self.get_workspace_path(workspace_id, context)

        path_is_safe = self._is_safe_file_path(file_path)
        if not path_is_safe:
            raise ValidationError(
                "file_path",
                file_path,
                "File path contains invalid characters or attempts directory traversal",
                context
            )

        # Extension-less names are accepted; only a present extension is
        # checked against the allow-list.
        suffix = Path(file_path).suffix.lower()
        permitted_suffixes = {".md", ".txt", ".py", ".js", ".json", ".yaml", ".yml", ".rst", ".csv"}
        if suffix and suffix not in permitted_suffixes:
            raise ValidationError(
                "file_path",
                file_path,
                f"File extension {suffix} is not allowed",
                context
            )

        # The limit applies to the encoded byte length, not the character count.
        size_limit = 100 * 1024 * 1024  # 100MB
        encoded_length = len(content.encode('utf-8'))
        if encoded_length > size_limit:
            raise ValidationError(
                "content",
                f"{len(content)} characters",
                f"File content exceeds maximum size of {size_limit} bytes",
                context
            )

        destination = workspace_root / "files" / file_path

        try:
            destination.parent.mkdir(parents=True, exist_ok=True)
            await self._atomic_write_file(destination, content, context)

            logger.info(f"Wrote file {file_path} in workspace {workspace_id}")
            return destination

        except OSError as err:
            logger.error(f"Failed to write file {file_path} in workspace {workspace_id}: {err}")
            raise self._map_os_error_to_exception(err, f"write file {file_path}", context)

    async def read_file(
        self,
        workspace_id: str,
        file_path: str,
        context: Optional[ErrorContext] = None
    ) -> str:
        """
        Return the UTF-8 text of ``file_path`` inside a workspace's ``files`` directory.

        Raises:
            ValidationError: If the path fails ``_is_safe_file_path``, exists
                but is not a regular file, or does not decode as UTF-8.
            ResourceNotFoundError: If the file does not exist.
            Exception: Whatever ``get_workspace_path`` raises, or the domain
                exception mapped from an ``OSError`` during the read.
        """
        if context is None:
            context = ErrorContext(
                operation_id=f"read_file_{workspace_id}_{file_path}",
                operation_type=OperationType.READ,
                resource_type="WorkspaceFile",
                resource_id=f"{workspace_id}/{file_path}"
            )

        workspace_root = await self.get_workspace_path(workspace_id, context)

        path_is_safe = self._is_safe_file_path(file_path)
        if not path_is_safe:
            raise ValidationError(
                "file_path",
                file_path,
                "File path contains invalid characters or attempts directory traversal",
                context
            )

        source = workspace_root / "files" / file_path

        if not source.exists():
            raise ResourceNotFoundError("File", f"{workspace_id}/{file_path}", context)

        if not source.is_file():
            raise ValidationError(
                "file_path",
                file_path,
                "Path exists but is not a regular file",
                context
            )

        try:
            # The file is always decoded as UTF-8; no other encoding is tried.
            text = source.read_text(encoding='utf-8')

            logger.debug(f"Read file {file_path} from workspace {workspace_id}")
            return text

        except UnicodeDecodeError as err:
            logger.error(f"Failed to decode file {file_path} as UTF-8: {err}")
            raise ValidationError(
                "file_content",
                "binary data",
                "File does not contain valid UTF-8 text",
                context
            )

        except OSError as err:
            logger.error(f"Failed to read file {file_path} from workspace {workspace_id}: {err}")
            raise self._map_os_error_to_exception(err, f"read file {file_path}", context)

    async def delete_workspace(
        self,
        workspace_id: str,
        context: Optional[ErrorContext] = None
    ) -> bool:
        """Delete a workspace and all its contents."""
        if context is None:
            context = ErrorContext(
                operation_id=f"delete_workspace_{workspace_id}",
                operation_type=OperationType.DELETE,
                resource_type="Workspace",
                resource_id=workspace_id
            )

        workspace_path = await self.get_workspace_path(workspace_id, context)

        try:
            # Use shutil.rmtree for recursive deletion
            shutil.rmtree(workspace_path)

            logger.info(f"Deleted workspace: {workspace_id}")
            return True

        except OSError as e:
            logger.error(f"Failed to delete workspace {workspace_id}: {e}")
            raise self._map_os_error_to_exception(e, f"delete workspace {workspace_id}", context)

    async def list_files(
        self,
        workspace_id: str,
        pattern: Optional[str] = None,
        context: Optional[ErrorContext] = None
    ) -> List[str]:
        """List files in a workspace."""
        if context is None:
            context = ErrorContext(
                operation_id=f"list_files_{workspace_id}",
                operation_type=OperationType.READ,
                resource_type="WorkspaceFile",
                metadata={"workspace_id": workspace_id, "pattern": pattern}
            )

        workspace_path = await self.get_workspace_path(workspace_id, context)
        files_dir = workspace_path / "files"

        if not files_dir.exists():
            return []

        try:
            files = []

            # Walk through all files in the workspace
            for item in files_dir.rglob("*"):
                if item.is_file():
                    # Get relative path from files directory
                    relative_path = str(item.relative_to(files_dir))

                    # Apply pattern filter if provided
                    if pattern is None or self._matches_pattern(relative_path, pattern):
                        files.append(relative_path)

            return sorted(files)

        except OSError as e:
            logger.error(f"Failed to list files in workspace {workspace_id}: {e}")
            raise self._map_os_error_to_exception(e, f"list files in workspace {workspace_id}", context)

    async def cleanup_old_workspaces(self, days_threshold: int = 30) -> int:
        """
        Delete workspaces whose recorded creation time is older than the threshold.

        Every directory under the base path that has a
        ``.workspace_meta.json`` with a ``created_at`` value is considered.
        Cleanup is best-effort: a failure on one workspace is logged as a
        warning and the scan continues, and a failure of the scan itself is
        logged as an error instead of being raised.

        Args:
            days_threshold: Age in days beyond which a workspace is deleted.

        Returns:
            The number of workspaces actually deleted, including those
            removed before an unexpected failure interrupted the scan.
        """
        logger.info(f"Starting cleanup of workspaces older than {days_threshold} days")

        # Defined before the try so the error path can still report the
        # deletions that already happened.
        deleted_count = 0

        try:
            cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_threshold)

            if not self.base_path.exists():
                return 0

            for workspace_dir in self.base_path.iterdir():
                if not workspace_dir.is_dir():
                    continue

                try:
                    # Check workspace metadata for creation date
                    metadata_file = workspace_dir / ".workspace_meta.json"
                    if not metadata_file.exists():
                        continue

                    metadata = await self._read_json_file(metadata_file)
                    created_at_str = metadata.get("created_at")

                    if not created_at_str:
                        continue

                    # fromisoformat() on older Pythons rejects a trailing
                    # "Z", so it is rewritten as an explicit UTC offset.
                    created_at = datetime.fromisoformat(created_at_str.replace("Z", "+00:00"))

                    # Comparing a naive timestamp with the aware cutoff
                    # raises TypeError, which would leave the workspace
                    # skipped on every run. Naive values are assumed to be
                    # UTC, matching what create_workspace writes.
                    if created_at.tzinfo is None:
                        created_at = created_at.replace(tzinfo=timezone.utc)

                    if created_at < cutoff_date:
                        await self.delete_workspace(workspace_dir.name)
                        deleted_count += 1
                        logger.info(f"Cleaned up old workspace: {workspace_dir.name}")

                except Exception as e:
                    # Deliberately broad: one bad workspace must not abort
                    # the cleanup of the others.
                    logger.warning(f"Failed to process workspace {workspace_dir.name} during cleanup: {e}")
                    continue

            logger.info(f"Cleanup completed: deleted {deleted_count} old workspaces")
            return deleted_count

        except Exception as e:
            logger.error(f"Error during workspace cleanup: {e}")
            # Report what was really deleted rather than claiming nothing was.
            return deleted_count

    # Helper methods

    def _is_valid_workspace_id(self, workspace_id: str) -> bool:
        """Validate workspace ID format."""
        if not workspace_id or len(workspace_id) > 100:
            return False

        # Allow alphanumeric, dash, underscore
        import re
        return re.match(r'^[a-zA-Z0-9_-]+$', workspace_id) is not None

    def _is_safe_file_path(self, file_path: str) -> bool:
        """Check if file path is safe (no directory traversal)."""
        if not file_path:
            return False

        # Normalize path
        normalized = os.path.normpath(file_path)

        # Check for directory traversal attempts
        if normalized.startswith("..") or "/.." in normalized or "\\.." in normalized:
            return False

        # Check for absolute paths
        if os.path.isabs(normalized):
            return False

        # Check for unsafe characters
        unsafe_chars = {"<", ">", ":", "\"", "|", "?", "*", "\0"}
        if any(char in file_path for char in unsafe_chars):
            return False

        return True

    def _matches_pattern(self, file_path: str, pattern: str) -> bool:
        """Check if file path matches the given pattern."""
        import fnmatch
        return fnmatch.fnmatch(file_path.lower(), pattern.lower())

    async def _atomic_write_file(self, target_path: Path, content: str, context: ErrorContext):
        """Write file atomically using temporary file."""
        temp_dir = target_path.parent / ".tmp"
        temp_dir.mkdir(exist_ok=True)

        # Create temporary file in same directory as target
        temp_fd, temp_path = tempfile.mkstemp(
            dir=temp_dir,
            prefix=f".tmp_{target_path.name}_",
            suffix=".tmp"
        )

        try:
            # Write content to temporary file
            with os.fdopen(temp_fd, 'w', encoding='utf-8') as f:
                f.write(content)
                f.flush()
                os.fsync(f.fileno())  # Ensure data is written to disk

            # Atomic move to final location
            temp_path_obj = Path(temp_path)
            temp_path_obj.replace(target_path)

        except Exception:
            # Clean up temporary file on error
            try:
                os.unlink(temp_path)
            except OSError:
                pass
            raise

        finally:
            # Clean up temp directory if empty
            try:
                temp_dir.rmdir()
            except OSError:
                pass  # Directory not empty or doesn't exist

    async def _write_json_file(self, file_path: Path, data: dict, context: Optional[ErrorContext] = None):
        """Write JSON data to file atomically."""
        import json
        json_content = json.dumps(data, indent=2)
        await self._atomic_write_file(file_path, json_content, context)

    async def _read_json_file(self, file_path: Path) -> dict:
        """Read JSON data from file."""
        import json
        content = file_path.read_text(encoding='utf-8')
        return json.loads(content)

    def _map_os_error_to_exception(self, os_error: OSError, operation: str, context: ErrorContext):
        """
        Translate an ``OSError`` into the matching domain exception.

        The exception is returned, not raised, so callers write
        ``raise self._map_os_error_to_exception(...)``.

        Mapping by ``os_error.errno``:
            ENOENT (no such file or directory) -> ResourceNotFoundError
            EACCES (permission denied)         -> ValidationError
            ENOSPC (no space left on device)   -> DatabaseError
            EEXIST (file exists)               -> DuplicateResourceError
            anything else                      -> DatabaseError

        Args:
            os_error: The OS-level error that was caught.
            operation: Short description of the failed operation, embedded
                in the resulting exception.
            context: Error context passed through to the exception.
        """
        # Named errno constants replace the bare numbers 2/13/28/17.
        import errno
        from infrastructure.exceptions import (
            ResourceNotFoundError, ValidationError, DatabaseError
        )

        code = os_error.errno

        if code == errno.ENOENT:
            return ResourceNotFoundError("File", operation, context)
        elif code == errno.EACCES:
            return ValidationError("permissions", operation, "Permission denied", context)
        elif code == errno.ENOSPC:
            return DatabaseError(f"Insufficient disk space for {operation}", os_error, context)
        elif code == errno.EEXIST:
            return DuplicateResourceError("File", operation, context)
        else:
            return DatabaseError(f"Filesystem error during {operation}", os_error, context)