"""
Directory to Space Importer.

Imports directory content into an Information Space, handling various
directory structures and conflict detection.
"""

import fnmatch
import hashlib
import json
import logging
import time
import uuid
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path, PurePosixPath
from typing import Any, Callable, Dict, List, Optional, Set, Tuple

from ..models import InformationSpace, SpaceDocument, SpaceStatus
from ..events import EventBus, SpaceEventType, SpaceEvent

logger = logging.getLogger(__name__)

# Signature of the optional persistence hook: (space_path, content) -> document_id.
DocumentCreator = Callable[[str, str], str]

# Conflict strategies that `_handle_conflict` knows how to report.
_CONFLICT_STRATEGIES = ("skip", "overwrite", "rename")


@dataclass
class ImportConfig:
    """
    Configuration for directory import.

    Attributes:
        file_patterns: Glob patterns for files to import
            (default: ``*.md`` and ``*.markdown``)
        recursive: Whether to import recursively
        ignore_patterns: Patterns to ignore. Each pattern is matched against
            every component of the file's path relative to the source
            directory, both as a glob and as a literal prefix.
        preserve_structure: Whether to preserve directory structure in space_path
        conflict_strategy: How to handle conflicts ('skip', 'overwrite', 'rename')
        import_metadata: Whether to read .markitect-* metadata files
    """

    file_patterns: List[str] = field(default_factory=lambda: ["*.md", "*.markdown"])
    recursive: bool = True
    ignore_patterns: List[str] = field(
        default_factory=lambda: [".*", "__pycache__", "node_modules"]
    )
    preserve_structure: bool = True
    conflict_strategy: str = "skip"  # skip, overwrite, rename
    import_metadata: bool = True


@dataclass
class ImportedDocument:
    """
    Record of an imported document.

    Attributes:
        file_path: Source file path
        space_path: Path in space
        document_id: Assigned document ID
        content_hash: Hash of imported content
        size: Size in bytes of the decoded content re-encoded as UTF-8
        is_new: Whether this is a new document
    """

    file_path: Path
    space_path: str
    document_id: str
    content_hash: str
    size: int
    is_new: bool = True


@dataclass
class ImportConflict:
    """
    Record of an import conflict.

    Attributes:
        file_path: Source file path
        space_path: Space path at which the conflict was detected
        reason: Conflict reason
        resolution: How conflict was resolved
    """

    file_path: Path
    space_path: str
    reason: str
    resolution: str


@dataclass
class ImportResult:
    """
    Result of an import operation.

    Attributes:
        source_directory: Imported directory
        space_id: Target space ID (if existing)
        imported_documents: Successfully imported documents
        conflicts: Conflicts encountered
        errors: Error messages keyed by file path ("_import" for a failure
            of the import as a whole)
        space_metadata: Imported space metadata if found
        duration_ms: Import duration in milliseconds
    """

    source_directory: Path
    space_id: Optional[str] = None
    imported_documents: List[ImportedDocument] = field(default_factory=list)
    conflicts: List[ImportConflict] = field(default_factory=list)
    errors: Dict[str, str] = field(default_factory=dict)
    space_metadata: Optional[Dict[str, Any]] = None
    duration_ms: int = 0

    @property
    def success(self) -> bool:
        """Check if import was successful (no errors were recorded)."""
        return not self.errors

    @property
    def document_count(self) -> int:
        """Total number of imported documents."""
        return len(self.imported_documents)


class DirectorySpaceImporter:
    """
    Imports directory content into Information Space.

    Features:
    - Multiple file pattern support
    - Recursive directory scanning
    - Conflict detection and resolution
    - Metadata file handling
    - Event emission for progress tracking
    """

    def __init__(
        self,
        config: Optional[ImportConfig] = None,
        event_bus: Optional[EventBus] = None,
    ):
        """
        Initialize the importer.

        Args:
            config: Import configuration (defaults to ``ImportConfig()``)
            event_bus: Event bus for notifications; events are dropped when
                no bus is supplied
        """
        self.config = config or ImportConfig()
        self.event_bus = event_bus

    def scan_directory(self, source_directory: Path) -> List[Path]:
        """
        Scan directory for importable files.

        Args:
            source_directory: Directory to scan

        Returns:
            Sorted list of unique file paths to import

        Raises:
            ValueError: If ``source_directory`` does not exist or is not a
                directory
        """
        if not source_directory.exists():
            raise ValueError(f"Directory does not exist: {source_directory}")
        if not source_directory.is_dir():
            raise ValueError(f"Not a directory: {source_directory}")

        globber = (
            source_directory.rglob if self.config.recursive else source_directory.glob
        )

        # A set keeps a file matched by several overlapping patterns from
        # being returned (and therefore imported) more than once.
        found: Set[Path] = set()
        for pattern in self.config.file_patterns:
            for path in globber(pattern):
                if path not in found and self._should_include(path, source_directory):
                    found.add(path)

        return sorted(found)

    def import_directory(
        self,
        source_directory: Path,
        existing_documents: Optional[Dict[str, SpaceDocument]] = None,
        document_creator: Optional[DocumentCreator] = None,
    ) -> ImportResult:
        """
        Import directory content.

        Per-file failures are recorded in ``result.errors`` under the file
        path and do not stop the import; a failure of the import as a whole
        is recorded under the ``"_import"`` key.

        Args:
            source_directory: Directory to import
            existing_documents: Map of space_path to existing documents
            document_creator: Function(space_path, content) -> document_id

        Returns:
            ImportResult with details of the import
        """
        started = time.perf_counter()
        result = ImportResult(source_directory=source_directory)
        existing_documents = existing_documents or {}

        # Load space metadata first so the start event can carry the real
        # space ID instead of always falling back to "pending".
        result.space_metadata = self._load_space_metadata(source_directory)
        if result.space_metadata:
            result.space_id = result.space_metadata.get("id")

        self._emit_event(
            SpaceEventType.SYNC_STARTED,
            result.space_id or "pending",
            {"direction": "import", "source": str(source_directory)},
        )

        try:
            # Scan for files
            files = self.scan_directory(source_directory)

            # Import each file; one bad file must not abort the others
            for file_path in files:
                try:
                    imported = self._import_file(
                        file_path,
                        source_directory,
                        existing_documents,
                        document_creator,
                        result,
                    )
                    if imported:
                        result.imported_documents.append(imported)
                except Exception as e:
                    logger.error(f"Failed to import {file_path}: {e}")
                    result.errors[str(file_path)] = str(e)

            result.duration_ms = self._elapsed_ms(started)

            self._emit_event(
                SpaceEventType.SYNC_COMPLETED,
                result.space_id or "imported",
                {
                    "direction": "import",
                    "document_count": result.document_count,
                    "conflicts": len(result.conflicts),
                    "errors": len(result.errors),
                },
            )

        except Exception as e:
            logger.error(f"Import failed: {e}")
            result.errors["_import"] = str(e)
            # Report how long the failed attempt took as well.
            result.duration_ms = self._elapsed_ms(started)

        return result

    @staticmethod
    def _elapsed_ms(started: float) -> int:
        """Return whole milliseconds elapsed since a ``time.perf_counter()`` reading."""
        return int((time.perf_counter() - started) * 1000)

    def _should_include(self, path: Path, base_dir: Path) -> bool:
        """
        Check if a path should be included in import.

        Directories are never included. A file is excluded when any component
        of its path relative to ``base_dir`` matches an ignore pattern, where
        "matches" means any of:

        - the component matches the pattern as a glob (case-sensitive);
        - the component starts with the pattern stripped of trailing ``*``;
        - the pattern starts with ``.`` and the component is hidden
          (starts with ``.``).

        Args:
            path: Candidate path produced by the directory scan
            base_dir: Directory the scan started from

        Returns:
            True if the path is a file that no ignore pattern excludes
        """
        # Skip directories
        if path.is_dir():
            return False

        relative_parts = path.relative_to(base_dir).parts

        for pattern in self.config.ignore_patterns:
            if not pattern:
                # An empty pattern would prefix-match every component and
                # exclude the whole tree; treat it as "no pattern".
                continue

            prefix = pattern.rstrip("*")
            hides_dotfiles = pattern.startswith(".")

            for part in relative_parts:
                if fnmatch.fnmatchcase(part, pattern):
                    return False
                if part.startswith(prefix):
                    return False
                if hides_dotfiles and part.startswith("."):
                    return False

        return True

    def _import_file(
        self,
        file_path: Path,
        source_directory: Path,
        existing_documents: Dict[str, SpaceDocument],
        document_creator: Optional[DocumentCreator],
        result: ImportResult,
    ) -> Optional[ImportedDocument]:
        """
        Import a single file.

        Args:
            file_path: File to import
            source_directory: Root of the import, used to derive the space path
            existing_documents: Map of space_path to existing documents
            document_creator: Function(space_path, content) -> document_id;
                when omitted a random document ID is generated
            result: Import result; any conflict detected is appended to
                ``result.conflicts``

        Returns:
            The imported document record, or None when the file was skipped
            because of a conflict

        Raises:
            ValueError: If the file cannot be read as UTF-8 text
        """
        # Determine space path
        space_path = self._file_to_space_path(file_path, source_directory)

        # Read content
        try:
            content = file_path.read_text(encoding="utf-8")
        except Exception as e:
            raise ValueError(f"Failed to read file: {e}") from e

        content_hash = self._compute_hash(content)
        size = len(content.encode("utf-8"))

        # Check for existing document
        existing_doc = existing_documents.get(space_path)
        is_new = existing_doc is None

        if existing_doc is not None:
            conflict = self._handle_conflict(
                file_path, space_path, content_hash, existing_doc
            )
            if conflict is not None:
                result.conflicts.append(conflict)
                if conflict.resolution == "skip":
                    return None
                if conflict.resolution == "rename":
                    # Import under a fresh path so the existing document is
                    # left untouched; avoid paths already present in the
                    # space or already produced by this import run.
                    taken = set(existing_documents)
                    taken.update(
                        doc.space_path for doc in result.imported_documents
                    )
                    space_path = self._unique_space_path(space_path, taken)
                    is_new = True

        # Create or update document
        if document_creator:
            document_id = document_creator(space_path, content)
        else:
            # Generate a simple document ID
            document_id = self._generate_document_id(space_path)

        return ImportedDocument(
            file_path=file_path,
            space_path=space_path,
            document_id=document_id,
            content_hash=content_hash,
            size=size,
            is_new=is_new,
        )

    @staticmethod
    def _unique_space_path(space_path: str, taken: Set[str]) -> str:
        """
        Derive a space path that is not in ``taken``.

        A numeric suffix is inserted before the file extension, e.g.
        ``/docs/a.md`` becomes ``/docs/a-1.md``, then ``/docs/a-2.md``, ...

        Args:
            space_path: Conflicting space path
            taken: Space paths that must not be reused

        Returns:
            The first suffixed variant of ``space_path`` not found in ``taken``
        """
        original = PurePosixPath(space_path)
        counter = 1
        while True:
            candidate = str(
                original.with_name(f"{original.stem}-{counter}{original.suffix}")
            )
            if candidate not in taken:
                return candidate
            counter += 1

    def _file_to_space_path(self, file_path: Path, source_directory: Path) -> str:
        """
        Convert file path to space path.

        With ``preserve_structure`` the space path is the file's path relative
        to ``source_directory`` using forward slashes; otherwise it is just
        the file name. Either way it is prefixed with ``/``.
        """
        if self.config.preserve_structure:
            relative = file_path.relative_to(source_directory)
            return "/" + str(relative).replace("\\", "/")
        return "/" + file_path.name

    def _handle_conflict(
        self,
        file_path: Path,
        space_path: str,
        content_hash: str,
        existing_doc: SpaceDocument,
    ) -> Optional[ImportConflict]:
        """
        Handle a conflict with existing document.

        Args:
            file_path: Source file path
            space_path: Space path already occupied by ``existing_doc``
            content_hash: Hash of the content about to be imported
            existing_doc: Document currently stored at ``space_path``

        Returns:
            An ImportConflict whose ``resolution`` is the configured conflict
            strategy, or None when the content hashes are equal or the
            configured strategy is not recognised
        """
        # Identical content is not a conflict.
        # NOTE(review): assumes the document's content_hash uses the same
        # format as `_compute_hash` — confirm against SpaceDocument.
        existing_hash = getattr(existing_doc, "content_hash", None)
        if existing_hash == content_hash:
            return None

        strategy = self.config.conflict_strategy
        if strategy not in _CONFLICT_STRATEGIES:
            # Behaviour is unchanged (the file is imported without a conflict
            # record), but a misconfiguration should not pass silently.
            logger.warning(
                f"Unknown conflict strategy {strategy!r}; "
                f"importing {space_path} without recording a conflict"
            )
            return None

        return ImportConflict(
            file_path=file_path,
            space_path=space_path,
            reason="document_exists",
            resolution=strategy,
        )

    def _generate_document_id(self, space_path: str) -> str:
        """
        Generate a document ID for a space path.

        The ID is a random UUID4 string; ``space_path`` is accepted for
        interface symmetry with ``document_creator`` but does not influence
        the result.
        """
        return str(uuid.uuid4())

    def _compute_hash(self, content: str) -> str:
        """Compute hash of content: first 16 hex digits of its UTF-8 SHA-256."""
        return hashlib.sha256(content.encode("utf-8")).hexdigest()[:16]

    def _load_json_object(self, path: Path, label: str) -> Optional[Dict[str, Any]]:
        """
        Load a JSON file whose top-level value must be an object.

        Never raises: any failure is logged as a warning and reported as None.

        Args:
            path: File to read
            label: Human-readable name of the file, used in log messages

        Returns:
            The parsed object, or None if the file is missing, unreadable,
            not valid JSON, or not a JSON object
        """
        try:
            if not path.exists():
                return None
            data = json.loads(path.read_text(encoding="utf-8"))
        except Exception as e:
            logger.warning(f"Failed to load {label}: {e}")
            return None

        # Callers use `.get(...)` on the result, so reject lists, strings, etc.
        if not isinstance(data, dict):
            logger.warning(
                f"Failed to load {label}: expected a JSON object, "
                f"got {type(data).__name__}"
            )
            return None

        return data

    def _load_space_metadata(self, directory: Path) -> Optional[Dict[str, Any]]:
        """
        Load space metadata from .markitect-space.json.

        Returns None when metadata import is disabled in the configuration or
        the file cannot be loaded as a JSON object.
        """
        if not self.config.import_metadata:
            return None

        return self._load_json_object(
            directory / ".markitect-space.json", "space metadata"
        )

    def _load_manifest(self, directory: Path) -> Optional[Dict[str, Any]]:
        """
        Load export manifest from .markitect-manifest.json.

        Returns None when the file cannot be loaded as a JSON object.
        """
        return self._load_json_object(
            directory / ".markitect-manifest.json", "manifest"
        )

    def _emit_event(
        self, event_type: SpaceEventType, space_id: str, payload: Dict[str, Any]
    ) -> None:
        """Emit an event if event bus is available."""
        if not self.event_bus:
            return

        event = SpaceEvent(
            event_type=event_type,
            space_id=space_id,
            payload=payload,
        )
        self.event_bus.emit(event)


class ManifestImporter(DirectorySpaceImporter):
    """
    Importer that loads the export manifest alongside the import.

    The .markitect-manifest.json of the source directory is loaded by
    `import_with_manifest` and can then be queried through `has_changed`.

    NOTE(review): `import_with_manifest` itself imports every scanned file;
    it does not filter on `has_changed`. Callers wanting to import only
    modified files must apply `has_changed` themselves.
    """

    def __init__(
        self,
        config: Optional[ImportConfig] = None,
        event_bus: Optional[EventBus] = None,
    ):
        """Initialize manifest-aware importer."""
        super().__init__(config, event_bus)
        self._manifest: Optional[Dict[str, Any]] = None

    def import_with_manifest(
        self,
        source_directory: Path,
        existing_documents: Optional[Dict[str, SpaceDocument]] = None,
        document_creator: Optional[DocumentCreator] = None,
    ) -> ImportResult:
        """
        Load the directory's manifest, then import the directory.

        Args:
            source_directory: Directory to import
            existing_documents: Existing documents
            document_creator: Document creator function

        Returns:
            ImportResult
        """
        # Load manifest (replaces any manifest kept from a previous call)
        self._manifest = self._load_manifest(source_directory)

        if self._manifest:
            logger.info(
                f"Using manifest from previous export at "
                f"{self._manifest.get('exported_at', 'unknown')}"
            )

        return self.import_directory(
            source_directory, existing_documents, document_creator
        )

    def _get_manifest_hash(self, space_path: str) -> Optional[str]:
        """
        Get content hash from manifest for a space path.

        Returns None when no manifest is loaded, the manifest has no usable
        "files" list, or no entry matches ``space_path``.
        """
        if not self._manifest:
            return None

        files = self._manifest.get("files")
        if not isinstance(files, list):
            return None

        for file_info in files:
            # Ignore malformed entries rather than failing on `.get`.
            if isinstance(file_info, dict) and file_info.get("space_path") == space_path:
                return file_info.get("content_hash")

        return None

    def has_changed(self, space_path: str, current_hash: str) -> bool:
        """
        Check if file has changed since last export.

        Args:
            space_path: Space path to look up in the loaded manifest
            current_hash: Hash of the file's current content

        Returns:
            True when the manifest has no hash for ``space_path`` (treated as
            a new file) or the recorded hash differs from ``current_hash``
        """
        manifest_hash = self._get_manifest_hash(space_path)
        if manifest_hash is None:
            return True  # New file
        return manifest_hash != current_hash