Files
markitect-main/infrastructure/repositories/filesystem_repository.py
tegwick 1fa0f1e84a
Some checks failed
Test Suite / unit-tests (3.11) (push) Has been cancelled
Test Suite / unit-tests (3.12) (push) Has been cancelled
Test Suite / integration-tests (push) Has been cancelled
Test Suite / e2e-tests (push) Has been cancelled
Test Suite / performance-tests (push) Has been cancelled
Test Suite / code-quality (push) Has been cancelled
Test Suite / security-scan (push) Has been cancelled
Test Suite / test-summary (push) Has been cancelled
fix: Eliminate all 111 test warnings by fixing root causes
- Replace deprecated datetime.utcnow() with datetime.now(timezone.utc)
  across all domain models, services, infrastructure, and test files
- Add missing timezone imports to all affected files
- Fix pytest.ini configuration format from [tool:pytest] to [pytest]
- Remove warning suppressions to expose actual issues
- Ensure proper pytest marker registration for smoke tests

Results:
- 305 passed, 2 skipped, 0 warnings (down from 111 warnings)
- All functionality preserved with modern datetime API usage
- Improved code quality by addressing root causes vs suppression

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-09-27 20:14:22 +02:00

495 lines
18 KiB
Python

"""
Filesystem repository implementation with atomic operations.
Provides reliable file operations with proper error handling,
atomic writes, and workspace management.
"""
import os
import shutil
import tempfile
import uuid
from infrastructure.logging import get_logger
from typing import List, Optional
from pathlib import Path
from datetime import datetime, timedelta, timezone
from infrastructure.repositories.interfaces import WorkspaceRepository
from infrastructure.exceptions import (
ErrorContext, OperationType, ResourceNotFoundError,
DuplicateResourceError, ValidationError
)
# Module-level logger, named after this module via get_logger(__name__).
logger = get_logger(__name__)
class FilesystemWorkspaceRepository(WorkspaceRepository):
"""
Filesystem implementation of WorkspaceRepository.
Provides reliable workspace and file operations with atomic writes,
proper validation, and comprehensive error handling.
"""
def __init__(self, base_workspace_dir: str = ".markitect_workspace"):
    """Set up the repository root, creating the directory when missing.

    Args:
        base_workspace_dir: Directory that will contain every workspace.
            It is resolved to an absolute path before being stored on
            ``self.base_path``.
    """
    root = Path(base_workspace_dir).resolve()
    self.base_path = root
    # parents=True / exist_ok=True: fine if the root (or its parents)
    # already exist or have to be created from scratch.
    root.mkdir(parents=True, exist_ok=True)
    logger.info(f"Initialized workspace repository at {self.base_path}")
async def create_workspace(
    self,
    workspace_id: str,
    base_path: Path,
    context: Optional[ErrorContext] = None
) -> Path:
    """Create a new workspace directory with its standard layout.

    The workspace is created as ``self.base_path / workspace_id`` and gets
    the subdirectories ``files``, ``temp`` and ``logs`` plus a
    ``.workspace_meta.json`` metadata file.

    Args:
        workspace_id: Identifier of the workspace; must pass
            ``_is_valid_workspace_id``.
        base_path: Accepted for interface compatibility; this
            implementation does not read it and always uses
            ``self.base_path``.
        context: Error context attached to raised exceptions. A default
            WRITE context is built when omitted.

    Returns:
        Path of the newly created workspace directory.

    Raises:
        ValidationError: If ``workspace_id`` has an invalid format.
        DuplicateResourceError: If the workspace path already exists.
        Exception: The domain exception produced by
            ``_map_os_error_to_exception`` for any ``OSError`` raised while
            creating the layout.
    """
    if context is None:
        context = ErrorContext(
            operation_id=f"create_workspace_{workspace_id}",
            operation_type=OperationType.WRITE,
            resource_type="Workspace",
            resource_id=workspace_id
        )

    # Validate workspace ID
    if not self._is_valid_workspace_id(workspace_id):
        raise ValidationError(
            "workspace_id",
            workspace_id,
            "Workspace ID must be alphanumeric with optional dashes and underscores",
            context
        )

    workspace_path = self.base_path / workspace_id

    # Check if workspace already exists
    if workspace_path.exists():
        raise DuplicateResourceError("Workspace", workspace_id, context)

    # Tracks whether *this call* created the directory. The exists() check
    # above can race with another creator; in that case mkdir() below fails
    # and the cleanup must not remove the other creator's workspace.
    created = False
    try:
        # Create workspace directory with proper permissions
        workspace_path.mkdir(parents=True, exist_ok=False, mode=0o755)
        created = True

        # Create standard subdirectories
        for subdirectory in ("files", "temp", "logs"):
            (workspace_path / subdirectory).mkdir(exist_ok=True)

        # Create workspace metadata file
        metadata = {
            "id": workspace_id,
            "created_at": datetime.now(timezone.utc).isoformat(),
            "version": "1.0",
            "type": "markitect_workspace"
        }
        await self._write_json_file(
            workspace_path / ".workspace_meta.json",
            metadata,
            context
        )

        logger.info(f"Created workspace: {workspace_id}")
        return workspace_path
    except OSError as e:
        logger.error(f"Failed to create workspace {workspace_id}: {e}")
        # Cleanup partial creation, but only of a directory we made.
        if created:
            shutil.rmtree(workspace_path, ignore_errors=True)
        raise self._map_os_error_to_exception(
            e, f"create workspace {workspace_id}", context
        ) from e
async def get_workspace_path(
    self,
    workspace_id: str,
    context: Optional[ErrorContext] = None
) -> Path:
    """Return the directory of an existing workspace.

    Args:
        workspace_id: Identifier of the workspace to look up.
        context: Error context attached to raised exceptions. A default
            READ context is built when omitted.

    Returns:
        ``self.base_path / workspace_id`` when that path is a directory.

    Raises:
        ResourceNotFoundError: If ``workspace_id`` fails
            ``_is_valid_workspace_id`` or no directory exists for it.
    """
    if context is None:
        context = ErrorContext(
            operation_id=f"get_workspace_path_{workspace_id}",
            operation_type=OperationType.READ,
            resource_type="Workspace",
            resource_id=workspace_id
        )

    # Reject malformed IDs before touching the filesystem. Values such as
    # "", "." or "../x" would otherwise resolve to the repository root or to
    # a directory outside it, and callers (e.g. delete_workspace) act on the
    # returned path. create_workspace applies the same validation, so such
    # an ID can never name a workspace created through this repository.
    if not self._is_valid_workspace_id(workspace_id):
        raise ResourceNotFoundError("Workspace", workspace_id, context)

    workspace_path = self.base_path / workspace_id
    # is_dir() is already False for a missing path.
    if not workspace_path.is_dir():
        raise ResourceNotFoundError("Workspace", workspace_id, context)
    return workspace_path
async def list_workspaces(
    self,
    context: Optional[ErrorContext] = None
) -> List[str]:
    """Return the sorted names of all workspaces under ``self.base_path``.

    A directory counts as a workspace when its name passes
    ``_is_valid_workspace_id`` and it contains ``.workspace_meta.json``.

    Args:
        context: Error context attached to raised exceptions. A default
            READ context is built when omitted.

    Returns:
        Workspace names in ascending order; an empty list when the base
        directory does not exist.

    Raises:
        Exception: The domain exception produced by
            ``_map_os_error_to_exception`` for any ``OSError``.
    """
    if context is None:
        context = ErrorContext(
            operation_id="list_workspaces",
            operation_type=OperationType.READ,
            resource_type="Workspace"
        )

    try:
        if not self.base_path.exists():
            return []

        names = [
            entry.name
            for entry in self.base_path.iterdir()
            if entry.is_dir()
            and self._is_valid_workspace_id(entry.name)
            # The metadata file is what marks a directory as a workspace.
            and (entry / ".workspace_meta.json").exists()
        ]
        names.sort()
        return names
    except OSError as e:
        logger.error(f"Failed to list workspaces: {e}")
        raise self._map_os_error_to_exception(e, "list workspaces", context)
async def write_file(
    self,
    workspace_id: str,
    file_path: str,
    content: str,
    context: Optional[ErrorContext] = None
) -> Path:
    """Validate and atomically write ``content`` to a workspace file.

    The file is stored at ``<workspace>/files/<file_path>``; missing parent
    directories are created.

    Args:
        workspace_id: Identifier of an existing workspace.
        file_path: Path of the file relative to the workspace's ``files``
            directory.
        content: Text to write.
        context: Error context attached to raised exceptions. A default
            WRITE context is built when omitted.

    Returns:
        Path of the written file.

    Raises:
        ValidationError: If ``file_path`` is unsafe, has a non-empty
            extension outside the allowed set, or the UTF-8 encoded content
            exceeds 100 MB.
        Exception: Whatever ``get_workspace_path`` raises for an unknown
            workspace, or the domain exception produced by
            ``_map_os_error_to_exception`` for an ``OSError``.
    """
    if context is None:
        context = ErrorContext(
            operation_id=f"write_file_{workspace_id}_{file_path}",
            operation_type=OperationType.WRITE,
            resource_type="WorkspaceFile",
            resource_id=f"{workspace_id}/{file_path}",
            request_data={"content_length": len(content)}
        )

    # The workspace must exist before anything else is checked.
    workspace_path = await self.get_workspace_path(workspace_id, context)

    if not self._is_safe_file_path(file_path):
        raise ValidationError(
            "file_path",
            file_path,
            "File path contains invalid characters or attempts directory traversal",
            context
        )

    # Extension check; a path without any suffix is let through.
    permitted_suffixes = {".md", ".txt", ".py", ".js", ".json", ".yaml", ".yml", ".rst", ".csv"}
    suffix = Path(file_path).suffix.lower()
    if suffix and suffix not in permitted_suffixes:
        raise ValidationError(
            "file_path",
            file_path,
            f"File extension {suffix} is not allowed",
            context
        )

    # Size check is done on the UTF-8 encoded length (100MB limit).
    size_limit = 100 * 1024 * 1024
    encoded_size = len(content.encode('utf-8'))
    if encoded_size > size_limit:
        raise ValidationError(
            "content",
            f"{len(content)} characters",
            f"File content exceeds maximum size of {size_limit} bytes",
            context
        )

    destination = workspace_path / "files" / file_path
    try:
        destination.parent.mkdir(parents=True, exist_ok=True)
        await self._atomic_write_file(destination, content, context)
        logger.info(f"Wrote file {file_path} in workspace {workspace_id}")
        return destination
    except OSError as e:
        logger.error(f"Failed to write file {file_path} in workspace {workspace_id}: {e}")
        raise self._map_os_error_to_exception(e, f"write file {file_path}", context)
async def read_file(
    self,
    workspace_id: str,
    file_path: str,
    context: Optional[ErrorContext] = None
) -> str:
    """Return the UTF-8 text of a file stored in a workspace.

    The file is looked up at ``<workspace>/files/<file_path>``.

    Args:
        workspace_id: Identifier of an existing workspace.
        file_path: Path of the file relative to the workspace's ``files``
            directory.
        context: Error context attached to raised exceptions. A default
            READ context is built when omitted.

    Returns:
        The decoded file content.

    Raises:
        ValidationError: If ``file_path`` is unsafe, the path is not a
            regular file, or the file is not valid UTF-8.
        ResourceNotFoundError: If the file does not exist.
        Exception: Whatever ``get_workspace_path`` raises for an unknown
            workspace, or the domain exception produced by
            ``_map_os_error_to_exception`` for an ``OSError``.
    """
    if context is None:
        context = ErrorContext(
            operation_id=f"read_file_{workspace_id}_{file_path}",
            operation_type=OperationType.READ,
            resource_type="WorkspaceFile",
            resource_id=f"{workspace_id}/{file_path}"
        )

    workspace_path = await self.get_workspace_path(workspace_id, context)

    if not self._is_safe_file_path(file_path):
        raise ValidationError(
            "file_path",
            file_path,
            "File path contains invalid characters or attempts directory traversal",
            context
        )

    source = workspace_path / "files" / file_path

    if not source.exists():
        raise ResourceNotFoundError("File", f"{workspace_id}/{file_path}", context)

    if not source.is_file():
        raise ValidationError(
            "file_path",
            file_path,
            "Path exists but is not a regular file",
            context
        )

    try:
        # The file is always decoded as UTF-8; no other encoding is tried.
        text = source.read_text(encoding='utf-8')
        logger.debug(f"Read file {file_path} from workspace {workspace_id}")
        return text
    except UnicodeDecodeError as e:
        logger.error(f"Failed to decode file {file_path} as UTF-8: {e}")
        raise ValidationError(
            "file_content",
            "binary data",
            "File does not contain valid UTF-8 text",
            context
        )
    except OSError as e:
        logger.error(f"Failed to read file {file_path} from workspace {workspace_id}: {e}")
        raise self._map_os_error_to_exception(e, f"read file {file_path}", context)
async def delete_workspace(
    self,
    workspace_id: str,
    context: Optional[ErrorContext] = None
) -> bool:
    """Remove a workspace directory together with everything inside it.

    Args:
        workspace_id: Identifier of the workspace to remove.
        context: Error context attached to raised exceptions. A default
            DELETE context is built when omitted.

    Returns:
        ``True`` once the directory tree has been removed.

    Raises:
        Exception: Whatever ``get_workspace_path`` raises for an unknown
            workspace, or the domain exception produced by
            ``_map_os_error_to_exception`` for an ``OSError``.
    """
    if context is None:
        context = ErrorContext(
            operation_id=f"delete_workspace_{workspace_id}",
            operation_type=OperationType.DELETE,
            resource_type="Workspace",
            resource_id=workspace_id
        )

    # Resolving the path also verifies that the workspace exists.
    doomed = await self.get_workspace_path(workspace_id, context)

    try:
        shutil.rmtree(doomed)  # recursive removal of the whole tree
        logger.info(f"Deleted workspace: {workspace_id}")
        return True
    except OSError as e:
        logger.error(f"Failed to delete workspace {workspace_id}: {e}")
        raise self._map_os_error_to_exception(e, f"delete workspace {workspace_id}", context)
async def list_files(
    self,
    workspace_id: str,
    pattern: Optional[str] = None,
    context: Optional[ErrorContext] = None
) -> List[str]:
    """Return the sorted relative paths of the files in a workspace.

    The ``files`` directory of the workspace is searched recursively.

    Args:
        workspace_id: Identifier of an existing workspace.
        pattern: Optional wildcard pattern; when given, only paths accepted
            by ``_matches_pattern`` are returned.
        context: Error context attached to raised exceptions. A default
            READ context is built when omitted.

    Returns:
        Paths relative to the ``files`` directory, in ascending order; an
        empty list when that directory does not exist.

    Raises:
        Exception: Whatever ``get_workspace_path`` raises for an unknown
            workspace, or the domain exception produced by
            ``_map_os_error_to_exception`` for an ``OSError``.
    """
    if context is None:
        context = ErrorContext(
            operation_id=f"list_files_{workspace_id}",
            operation_type=OperationType.READ,
            resource_type="WorkspaceFile",
            metadata={"workspace_id": workspace_id, "pattern": pattern}
        )

    workspace_path = await self.get_workspace_path(workspace_id, context)
    files_dir = workspace_path / "files"

    if not files_dir.exists():
        return []

    try:
        collected = []
        for entry in files_dir.rglob("*"):
            # Directories are traversed by rglob but never reported.
            if not entry.is_file():
                continue
            relative = str(entry.relative_to(files_dir))
            if pattern is not None and not self._matches_pattern(relative, pattern):
                continue
            collected.append(relative)
        collected.sort()
        return collected
    except OSError as e:
        logger.error(f"Failed to list files in workspace {workspace_id}: {e}")
        raise self._map_os_error_to_exception(e, f"list files in workspace {workspace_id}", context)
async def cleanup_old_workspaces(self, days_threshold: int = 30) -> int:
    """Delete workspaces whose recorded creation time is too old.

    Every directory under ``self.base_path`` that has a
    ``.workspace_meta.json`` with a ``created_at`` value is inspected; it is
    deleted when that timestamp lies more than ``days_threshold`` days
    before the current UTC time. This is a best-effort operation: a failure
    on one workspace is logged and the scan continues.

    Args:
        days_threshold: Minimum age, in days, for a workspace to be deleted.

    Returns:
        Number of workspaces deleted by this call (also when the scan is
        aborted by an unexpected error part-way through).
    """
    logger.info(f"Starting cleanup of workspaces older than {days_threshold} days")

    # Defined before the try so the outer handler can report what was
    # actually deleted before the failure.
    deleted_count = 0
    try:
        cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_threshold)

        if not self.base_path.exists():
            return 0

        for workspace_dir in self.base_path.iterdir():
            if not workspace_dir.is_dir():
                continue

            try:
                # Check workspace metadata for creation date
                metadata_file = workspace_dir / ".workspace_meta.json"
                if not metadata_file.exists():
                    continue

                metadata = await self._read_json_file(metadata_file)
                created_at_str = metadata.get("created_at")
                if not created_at_str:
                    continue

                # fromisoformat() does not accept a trailing "Z" on every
                # supported Python version, so normalise it to an offset.
                created_at = datetime.fromisoformat(created_at_str.replace("Z", "+00:00"))

                # Metadata may hold a naive timestamp (no UTC offset), e.g.
                # one produced by datetime.utcnow().isoformat(). Comparing
                # naive with aware datetimes raises TypeError, which would
                # make such a workspace impossible to clean up; interpret
                # naive values as UTC instead.
                if created_at.tzinfo is None:
                    created_at = created_at.replace(tzinfo=timezone.utc)

                if created_at < cutoff_date:
                    await self.delete_workspace(workspace_dir.name)
                    deleted_count += 1
                    logger.info(f"Cleaned up old workspace: {workspace_dir.name}")
            except Exception as e:
                # Deliberate best-effort: skip this workspace, keep scanning.
                logger.warning(f"Failed to process workspace {workspace_dir.name} during cleanup: {e}")
                continue

        logger.info(f"Cleanup completed: deleted {deleted_count} old workspaces")
        return deleted_count
    except Exception as e:
        logger.error(f"Error during workspace cleanup: {e}")
        return deleted_count
# Helper methods
def _is_valid_workspace_id(self, workspace_id: str) -> bool:
"""Validate workspace ID format."""
if not workspace_id or len(workspace_id) > 100:
return False
# Allow alphanumeric, dash, underscore
import re
return re.match(r'^[a-zA-Z0-9_-]+$', workspace_id) is not None
def _is_safe_file_path(self, file_path: str) -> bool:
"""Check if file path is safe (no directory traversal)."""
if not file_path:
return False
# Normalize path
normalized = os.path.normpath(file_path)
# Check for directory traversal attempts
if normalized.startswith("..") or "/.." in normalized or "\\.." in normalized:
return False
# Check for absolute paths
if os.path.isabs(normalized):
return False
# Check for unsafe characters
unsafe_chars = {"<", ">", ":", "\"", "|", "?", "*", "\0"}
if any(char in file_path for char in unsafe_chars):
return False
return True
def _matches_pattern(self, file_path: str, pattern: str) -> bool:
"""Check if file path matches the given pattern."""
import fnmatch
return fnmatch.fnmatch(file_path.lower(), pattern.lower())
async def _atomic_write_file(self, target_path: Path, content: str, context: ErrorContext):
    """Write ``content`` to ``target_path`` via a temporary file and rename.

    The text is first written (UTF-8), flushed and fsync'ed to a temporary
    file inside a ``.tmp`` subdirectory next to the target, then moved over
    the target in a single replace operation.

    Args:
        target_path: Final location of the file. Its parent directory is
            used as-is by this method and is not created here.
        content: Text to write.
        context: Not used by this method.

    Raises:
        Exception: Any error raised while writing or replacing is
            re-raised after the temporary file has been removed.
    """
    staging_dir = target_path.parent / ".tmp"
    staging_dir.mkdir(exist_ok=True)

    # The staging file lives in a ".tmp" directory beside the target.
    fd, staged_name = tempfile.mkstemp(
        dir=staging_dir,
        prefix=f".tmp_{target_path.name}_",
        suffix=".tmp"
    )

    try:
        with os.fdopen(fd, 'w', encoding='utf-8') as handle:
            handle.write(content)
            handle.flush()
            # Push the data to disk before the rename makes it visible.
            os.fsync(handle.fileno())

        # Move the finished file over the target in one step.
        os.replace(staged_name, target_path)
    except Exception:
        # Best-effort removal of the staging file; the original error wins.
        try:
            os.unlink(staged_name)
        except OSError:
            pass
        raise
    finally:
        # Remove the staging directory when nothing else is left in it.
        try:
            staging_dir.rmdir()
        except OSError:
            pass  # still in use by another file, or already gone
async def _write_json_file(self, file_path: Path, data: dict, context: Optional[ErrorContext] = None):
    """Serialise ``data`` as JSON and store it through ``_atomic_write_file``.

    Args:
        file_path: Destination of the JSON document.
        data: Dictionary to serialise; written with a two-space indent.
        context: Passed through unchanged to ``_atomic_write_file``.
    """
    import json

    await self._atomic_write_file(file_path, json.dumps(data, indent=2), context)
async def _read_json_file(self, file_path: Path) -> dict:
"""Read JSON data from file."""
import json
content = file_path.read_text(encoding='utf-8')
return json.loads(content)
def _map_os_error_to_exception(self, os_error: OSError, operation: str, context: ErrorContext):
    """Translate an ``OSError`` into the matching domain exception.

    The exception is *returned*, not raised, so callers write
    ``raise self._map_os_error_to_exception(...)``.

    Args:
        os_error: The operating-system error to translate.
        operation: Short description of the failed operation, used as the
            resource identifier or in the error message.
        context: Error context attached to the resulting exception.

    Returns:
        ``ResourceNotFoundError`` for ENOENT, ``ValidationError`` for
        EACCES, ``DuplicateResourceError`` for EEXIST, and ``DatabaseError``
        for ENOSPC and every other error code.
    """
    # Symbolic errno names instead of literal numbers: self-describing and
    # correct on platforms whose numeric values differ.
    import errno

    from infrastructure.exceptions import (
        ResourceNotFoundError, ValidationError, DatabaseError
    )

    code = os_error.errno
    if code == errno.ENOENT:  # No such file or directory
        return ResourceNotFoundError("File", operation, context)
    elif code == errno.EACCES:  # Permission denied
        return ValidationError("permissions", operation, "Permission denied", context)
    elif code == errno.ENOSPC:  # No space left on device
        return DatabaseError(f"Insufficient disk space for {operation}", os_error, context)
    elif code == errno.EEXIST:  # File exists
        return DuplicateResourceError("File", operation, context)
    else:
        return DatabaseError(f"Filesystem error during {operation}", os_error, context)