# markitect-main/markitect/document_manager.py

"""
Document manager for high-performance markdown file ingestion and AST caching.

This module implements the core functionality for Issue #2: Fast Document Loading & CLI Manipulation.
It provides performance-optimized document processing through AST caching and database integration.

Key Features:
- Parse once, access many times architecture
- AST cache loading < 50% of markdown parsing time
- Seamless integration with Issue #1 database foundation
- Comprehensive error handling and validation
"""

import json
import time
from pathlib import Path
from typing import Dict, Any, Optional

from .parser import parse_markdown_to_ast
from .frontmatter import FrontMatterParser


class DocumentManager:
    """
    High-performance document manager for markdown file processing.

    Implements the "parse once, manipulate many times" architecture by creating
    fast-loading AST cache files alongside database metadata storage.

    Architecture:
        markdown file → AST parsing → cache file + database metadata

    Performance Goal:
        Cache loading must be < 50% of original parsing time

    Attributes:
        db_manager: Database manager for metadata storage
        cache_dir: Directory for AST cache files
        frontmatter_parser: YAML front matter processor
    """

    def __init__(self, database_manager, cache_dir: Optional[Path] = None):
        """
        Initialize document manager with database and cache configuration.

        Args:
            database_manager: DatabaseManager instance for metadata storage
            cache_dir: Directory for AST cache files (default: .ast_cache)
        """
        self.db_manager = database_manager
        self.cache_dir = Path(cache_dir) if cache_dir else Path(".ast_cache")
        self.cache_dir.mkdir(exist_ok=True)
        self.frontmatter_parser = FrontMatterParser()

    def ingest_file(self, file_path: Path) -> Dict[str, Any]:
        """
        Ingest a markdown file with performance-optimized AST caching.

        Implements the core "parse once, manipulate many times" workflow:
        1. Validates file existence
        2. Parses markdown content to AST
        3. Creates fast-loading AST cache file
        4. Stores metadata in database
        5. Returns processing results with performance metrics

        Args:
            file_path: Path to markdown file to ingest

        Returns:
            Dictionary containing:
                - ast: Parsed AST representation
                - metadata: File metadata (filename, title, etc.)
                - ast_cache_path: Path to created cache file
                - parse_time: Time spent parsing markdown (seconds)
                - cache_time: Time spent creating cache (seconds)

        Raises:
            FileNotFoundError: If the specified file doesn't exist

        Performance:
            Initial parse creates overhead, but subsequent cache loads
            will be < 50% of this parse time.
        """
        # Validate file exists
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # Read file content
        content = self._read_file_content(file_path)

        # Parse front matter for metadata extraction
        front_matter, markdown_content = self.frontmatter_parser.parse(content)

        # Parse to AST with performance timing
        ast, parse_time = self._parse_content_to_ast(content)

        # Create cache file with performance timing
        cache_file, cache_time = self._create_performance_cache(file_path.name, ast)

        # Store in database (handles front matter parsing internally)
        self._store_in_database(file_path.name, content)

        # Return comprehensive result
        return self._build_ingestion_result(
            ast=ast,
            filename=file_path.name,
            front_matter=front_matter,
            cache_file=cache_file,
            parse_time=parse_time,
            cache_time=cache_time
        )

    def _read_file_content(self, file_path: Path) -> str:
        """
        Read file content with proper encoding.

        Args:
            file_path: Path to file to read

        Returns:
            File content as string
        """
        return file_path.read_text(encoding='utf-8')

    def _parse_content_to_ast(self, content: str) -> tuple[list, float]:
        """
        Parse markdown content to AST with performance timing.

        Args:
            content: Raw markdown content

        Returns:
            Tuple of (AST tokens, parse_time_seconds)
        """
        # perf_counter() is monotonic and high-resolution. The wall clock
        # (time.time) can jump on clock adjustments and is too coarse on some
        # platforms to time a short parse, which could report 0.0 seconds.
        started = time.perf_counter()
        ast = parse_markdown_to_ast(content)
        parse_time = time.perf_counter() - started
        return ast, parse_time

    def _create_performance_cache(self, filename: str, ast: list) -> tuple[Path, float]:
        """
        Create AST cache file with performance timing.

        Args:
            filename: Source filename for cache naming
            ast: AST tokens to cache

        Returns:
            Tuple of (cache_file_path, cache_time_seconds)
        """
        start_time = time.time()
        cache_file = self._create_ast_cache(filename, ast)
        cache_time = time.time() - start_time
        return cache_file, cache_time

    def _store_in_database(self, filename: str, content: str) -> None:
        """
        Store document in database using existing API.

        Args:
            filename: Name of the file
            content: Full markdown content (including front matter)

        Note:
            The database manager handles front matter parsing internally.
        """
        self.db_manager.store_markdown_file(filename, content)

    def _build_ingestion_result(self, ast: list, filename: str, front_matter: dict,
                               cache_file: Path, parse_time: float, cache_time: float) -> Dict[str, Any]:
        """
        Build comprehensive ingestion result dictionary.

        Args:
            ast: Parsed AST tokens
            filename: Source filename
            front_matter: Parsed front matter metadata
            cache_file: Path to created cache file
            parse_time: Time spent parsing (seconds)
            cache_time: Time spent caching (seconds)

        Returns:
            Structured result dictionary with all ingestion data
        """
        return {
            'ast': ast,
            'metadata': {
                'filename': filename,
                'title': front_matter.get('title', ''),
            },
            'ast_cache_path': cache_file,
            'parse_time': parse_time,
            'cache_time': cache_time
        }

    def _create_ast_cache(self, filename: str, ast: list) -> Path:
        """
        Create AST cache file in JSON format.

        Args:
            filename: Source filename for cache naming
            ast: AST tokens to serialize

        Returns:
            Path to created cache file
        """
        cache_filename = f"{filename}.ast.json"
        cache_path = self.cache_dir / cache_filename

        with open(cache_path, 'w', encoding='utf-8') as f:
            json.dump(ast, f, indent=2, ensure_ascii=False)

        return cache_path