""" Document manager for high-performance markdown file ingestion and AST caching. This module implements the core functionality for Issue #2: Fast Document Loading & CLI Manipulation. It provides performance-optimized document processing through AST caching and database integration. Key Features: - Parse once, access many times architecture - AST cache loading < 50% of markdown parsing time - Seamless integration with Issue #1 database foundation - Comprehensive error handling and validation """ import json import time from pathlib import Path from typing import Dict, Any, Optional from .parser import parse_markdown_to_ast from .frontmatter import FrontMatterParser class DocumentManager: """ High-performance document manager for markdown file processing. Implements the "parse once, manipulate many times" architecture by creating fast-loading AST cache files alongside database metadata storage. Architecture: markdown file → AST parsing → cache file + database metadata Performance Goal: Cache loading must be < 50% of original parsing time Attributes: db_manager: Database manager for metadata storage cache_dir: Directory for AST cache files frontmatter_parser: YAML front matter processor """ def __init__(self, database_manager, cache_dir: Optional[Path] = None): """ Initialize document manager with database and cache configuration. Args: database_manager: DatabaseManager instance for metadata storage cache_dir: Directory for AST cache files (default: .ast_cache) """ self.db_manager = database_manager self.cache_dir = Path(cache_dir) if cache_dir else Path(".ast_cache") self.cache_dir.mkdir(exist_ok=True) self.frontmatter_parser = FrontMatterParser() def ingest_file(self, file_path: Path) -> Dict[str, Any]: """ Ingest a markdown file with performance-optimized AST caching. Implements the core "parse once, manipulate many times" workflow: 1. Validates file existence 2. Parses markdown content to AST 3. Creates fast-loading AST cache file 4. Stores metadata in database 5. Returns processing results with performance metrics Args: file_path: Path to markdown file to ingest Returns: Dictionary containing: - ast: Parsed AST representation - metadata: File metadata (filename, title, etc.) - ast_cache_path: Path to created cache file - parse_time: Time spent parsing markdown (seconds) - cache_time: Time spent creating cache (seconds) Raises: FileNotFoundError: If the specified file doesn't exist Performance: Initial parse creates overhead, but subsequent cache loads will be < 50% of this parse time. """ # Validate file exists if not file_path.exists(): raise FileNotFoundError(f"File not found: {file_path}") # Read file content content = self._read_file_content(file_path) # Parse front matter for metadata extraction front_matter, markdown_content = self.frontmatter_parser.parse(content) # Parse to AST with performance timing ast, parse_time = self._parse_content_to_ast(content) # Create cache file with performance timing cache_file, cache_time = self._create_performance_cache(file_path.name, ast) # Store in database (handles front matter parsing internally) self._store_in_database(file_path.name, content) # Return comprehensive result return self._build_ingestion_result( ast=ast, filename=file_path.name, front_matter=front_matter, cache_file=cache_file, parse_time=parse_time, cache_time=cache_time ) def _read_file_content(self, file_path: Path) -> str: """ Read file content with proper encoding. Args: file_path: Path to file to read Returns: File content as string """ return file_path.read_text(encoding='utf-8') def _parse_content_to_ast(self, content: str) -> tuple[list, float]: """ Parse markdown content to AST with performance timing. Args: content: Raw markdown content Returns: Tuple of (AST tokens, parse_time_seconds) """ start_time = time.time() ast = parse_markdown_to_ast(content) parse_time = time.time() - start_time return ast, parse_time def _create_performance_cache(self, filename: str, ast: list) -> tuple[Path, float]: """ Create AST cache file with performance timing. Args: filename: Source filename for cache naming ast: AST tokens to cache Returns: Tuple of (cache_file_path, cache_time_seconds) """ start_time = time.time() cache_file = self._create_ast_cache(filename, ast) cache_time = time.time() - start_time return cache_file, cache_time def _store_in_database(self, filename: str, content: str) -> None: """ Store document in database using existing API. Args: filename: Name of the file content: Full markdown content (including front matter) Note: The database manager handles front matter parsing internally. """ self.db_manager.store_markdown_file(filename, content) def _build_ingestion_result(self, ast: list, filename: str, front_matter: dict, cache_file: Path, parse_time: float, cache_time: float) -> Dict[str, Any]: """ Build comprehensive ingestion result dictionary. Args: ast: Parsed AST tokens filename: Source filename front_matter: Parsed front matter metadata cache_file: Path to created cache file parse_time: Time spent parsing (seconds) cache_time: Time spent caching (seconds) Returns: Structured result dictionary with all ingestion data """ return { 'ast': ast, 'metadata': { 'filename': filename, 'title': front_matter.get('title', ''), }, 'ast_cache_path': cache_file, 'parse_time': parse_time, 'cache_time': cache_time } def _create_ast_cache(self, filename: str, ast: list) -> Path: """ Create AST cache file in JSON format. Args: filename: Source filename for cache naming ast: AST tokens to serialize Returns: Path to created cache file """ cache_filename = f"{filename}.ast.json" cache_path = self.cache_dir / cache_filename with open(cache_path, 'w', encoding='utf-8') as f: json.dump(ast, f, indent=2, ensure_ascii=False) return cache_path