""" Document manager for high-performance markdown file ingestion and AST caching. This module implements the core functionality for Issue #2: Fast Document Loading & CLI Manipulation. It provides performance-optimized document processing through AST caching and database integration. Key Features: - Parse once, access many times architecture - AST cache loading < 50% of markdown parsing time - Seamless integration with Issue #1 database foundation - Comprehensive error handling and validation """ import json import time from pathlib import Path from typing import Dict, Any, Optional from .parser import parse_markdown_to_ast from .frontmatter import FrontMatterParser class DocumentManager: """ High-performance document manager for markdown file processing. Implements the "parse once, manipulate many times" architecture by creating fast-loading AST cache files alongside database metadata storage. Architecture: markdown file → AST parsing → cache file + database metadata Performance Goal: Cache loading must be < 50% of original parsing time Attributes: db_manager: Database manager for metadata storage cache_dir: Directory for AST cache files frontmatter_parser: YAML front matter processor """ def __init__(self, database_manager, cache_dir: Optional[Path] = None): """ Initialize document manager with database and cache configuration. Args: database_manager: DatabaseManager instance for metadata storage cache_dir: Directory for AST cache files (default: .ast_cache) """ self.db_manager = database_manager self.cache_dir = Path(cache_dir) if cache_dir else Path(".ast_cache") self.cache_dir.mkdir(exist_ok=True) self.frontmatter_parser = FrontMatterParser() def ingest_file(self, file_path: Path) -> Dict[str, Any]: """ Ingest a markdown file with performance-optimized AST caching. Implements the core "parse once, manipulate many times" workflow: 1. Validates file existence 2. Parses markdown content to AST 3. Creates fast-loading AST cache file 4. Stores metadata in database 5. Returns processing results with performance metrics Args: file_path: Path to markdown file to ingest Returns: Dictionary containing: - ast: Parsed AST representation - metadata: File metadata (filename, title, etc.) - ast_cache_path: Path to created cache file - parse_time: Time spent parsing markdown (seconds) - cache_time: Time spent creating cache (seconds) Raises: FileNotFoundError: If the specified file doesn't exist Performance: Initial parse creates overhead, but subsequent cache loads will be < 50% of this parse time. """ # Validate file exists if not file_path.exists(): raise FileNotFoundError(f"File not found: {file_path}") # Read file content content = self._read_file_content(file_path) # Parse front matter for metadata extraction front_matter, markdown_content = self.frontmatter_parser.parse(content) # Parse to AST with performance timing ast, parse_time = self._parse_content_to_ast(content) # Create cache file with performance timing cache_file, cache_time = self._create_performance_cache(file_path.name, ast) # Store in database (handles front matter parsing internally) self._store_in_database(file_path.name, content) # Return comprehensive result return self._build_ingestion_result( ast=ast, filename=file_path.name, front_matter=front_matter, cache_file=cache_file, parse_time=parse_time, cache_time=cache_time ) def _read_file_content(self, file_path: Path) -> str: """ Read file content with proper encoding. Args: file_path: Path to file to read Returns: File content as string """ return file_path.read_text(encoding='utf-8') def _parse_content_to_ast(self, content: str) -> tuple[list, float]: """ Parse markdown content to AST with performance timing. Args: content: Raw markdown content Returns: Tuple of (AST tokens, parse_time_seconds) """ start_time = time.time() ast = parse_markdown_to_ast(content) parse_time = time.time() - start_time return ast, parse_time def _create_performance_cache(self, filename: str, ast: list) -> tuple[Path, float]: """ Create AST cache file with performance timing. Args: filename: Source filename for cache naming ast: AST tokens to cache Returns: Tuple of (cache_file_path, cache_time_seconds) """ start_time = time.time() cache_file = self._create_ast_cache(filename, ast) cache_time = time.time() - start_time return cache_file, cache_time def _store_in_database(self, filename: str, content: str) -> None: """ Store document in database using existing API. Args: filename: Name of the file content: Full markdown content (including front matter) Note: The database manager handles front matter parsing internally. """ self.db_manager.store_markdown_file(filename, content) def _build_ingestion_result(self, ast: list, filename: str, front_matter: dict, cache_file: Path, parse_time: float, cache_time: float) -> Dict[str, Any]: """ Build comprehensive ingestion result dictionary. Args: ast: Parsed AST tokens filename: Source filename front_matter: Parsed front matter metadata cache_file: Path to created cache file parse_time: Time spent parsing (seconds) cache_time: Time spent caching (seconds) Returns: Structured result dictionary with all ingestion data """ return { 'ast': ast, 'metadata': { 'filename': filename, 'title': front_matter.get('title', ''), }, 'ast_cache_path': cache_file, 'parse_time': parse_time, 'cache_time': cache_time } def _create_ast_cache(self, filename: str, ast: list) -> Path: """ Create AST cache file in JSON format. Args: filename: Source filename for cache naming ast: AST tokens to serialize Returns: Path to created cache file """ cache_filename = f"{filename}.ast.json" cache_path = self.cache_dir / cache_filename with open(cache_path, 'w', encoding='utf-8') as f: json.dump(ast, f, indent=2, ensure_ascii=False) return cache_path def list_files(self) -> list: """ List all markdown files in the system. Returns: List of dictionaries containing file metadata including filename, size, and modification date information. """ # Get files from database db_files = self.db_manager.list_markdown_files() # Enhance with file system information enhanced_files = [] for file_info in db_files: enhanced_info = { 'filename': file_info['filename'], 'id': file_info['id'], 'created_at': file_info['created_at'], 'front_matter': file_info['front_matter'] } # Try to get file system stats if file exists try: file_path = Path(file_info['filename']) if file_path.exists(): stat = file_path.stat() enhanced_info['size'] = f"{stat.st_size} bytes" enhanced_info['modified'] = stat.st_mtime else: enhanced_info['size'] = 'unknown' enhanced_info['modified'] = 'file not found' except Exception: enhanced_info['size'] = 'unknown' enhanced_info['modified'] = 'unknown' enhanced_files.append(enhanced_info) return enhanced_files def render_file(self, input_file: str, output_file: str, template: str = None, css: str = None, edit_mode: bool = False, editor_theme: str = 'github', keyboard_shortcuts: bool = True) -> Dict[str, Any]: """ Render a markdown file to HTML with client-side rendering capabilities. Creates an HTML file with embedded markdown content that is rendered client-side using JavaScript markdown parser. Args: input_file: Path to input markdown file output_file: Path to output HTML file template: Template to use (optional) css: CSS file to include (optional) Returns: Dictionary with rendering results and metadata Raises: FileNotFoundError: If input file doesn't exist """ import json input_path = Path(input_file) output_path = Path(output_file) # Validate input file exists if not input_path.exists(): raise FileNotFoundError(f"Input file not found: {input_path}") # Read markdown content markdown_content = input_path.read_text(encoding='utf-8') # Extract title from markdown (first h1 heading) title = self._extract_title_from_markdown(markdown_content) # Generate HTML content html_content = self._generate_html_template( markdown_content=markdown_content, title=title, css=css, template=template, edit_mode=edit_mode, editor_theme=editor_theme, keyboard_shortcuts=keyboard_shortcuts ) # Write HTML file output_path.parent.mkdir(parents=True, exist_ok=True) output_path.write_text(html_content, encoding='utf-8') return { 'input_file': str(input_path), 'output_file': str(output_path), 'title': title, 'template': template, 'css': css } def _extract_title_from_markdown(self, content: str) -> str: """Extract title from markdown content (first h1 heading).""" import re # Look for first h1 heading match = re.search(r'^#\s+(.+)$', content, re.MULTILINE) if match: return match.group(1).strip() return "Markdown Document" def _generate_html_template(self, markdown_content: str, title: str, css: str = None, template: str = None, edit_mode: bool = False, editor_theme: str = 'github', keyboard_shortcuts: bool = True) -> str: """Generate HTML template with embedded markdown and client-side rendering.""" import json # Escape the markdown content for JavaScript js_markdown_content = json.dumps(markdown_content) # Handle CSS styles css_content = "" if css: # Try to read CSS file content and embed it try: css_path = Path(css) if css_path.exists(): css_file_content = css_path.read_text(encoding='utf-8') css_content = f"" else: # Fallback to link if file doesn't exist css_content = f'' except Exception: # Fallback to link on any error css_content = f'' # Get template-specific CSS template_css = self._get_template_css(template) # Default CSS for basic styling default_css = f""" """ # Add editor-specific content if in edit mode editor_scripts = "" editor_config = "" editor_css = "" body_classes = "" if edit_mode: body_classes = ' class="markitect-edit-mode"' editor_css = """ """ editor_config = f""" const MARKITECT_EDIT_MODE = true; const MARKITECT_EDITOR_CONFIG = {{ theme: '{editor_theme}', keyboardShortcuts: {str(keyboard_shortcuts).lower()}, autosave: true, sections: true }};""" editor_scripts = """ class MarkitectEditor { constructor() { this.initializeEditor(); this.setupKeyboardShortcuts(); } initializeEditor() { const header = document.createElement('div'); header.className = 'markitect-floating-header'; header.innerHTML = ` Ready `; document.body.insertBefore(header, document.body.firstChild); this.makeContentEditable(); } makeContentEditable() { const content = document.getElementById('markdown-content'); if (content) { content.addEventListener('click', this.handleSectionClick.bind(this)); this.markSections(content); } } markSections(element) { const sections = element.querySelectorAll('h1, h2, h3, h4, h5, h6, p, blockquote, pre, ul, ol'); sections.forEach((section, index) => { section.classList.add('markitect-section-editable'); section.setAttribute('data-section', index); }); } handleSectionClick(event) { const section = event.target.closest('.markitect-section-editable'); if (section && !section.querySelector('textarea')) { this.editSection(section); } } editSection(section) { const originalContent = section.innerHTML; const textarea = document.createElement('textarea'); textarea.value = this.htmlToMarkdown(originalContent); textarea.className = 'edit-mode'; textarea.addEventListener('blur', () => { section.innerHTML = marked.parse(textarea.value); this.markSections(section.parentElement); }); section.innerHTML = ''; section.appendChild(textarea); textarea.focus(); } htmlToMarkdown(html) { // Simple HTML to Markdown conversion return html.replace(/<[^>]*>/g, '').trim(); } setupKeyboardShortcuts() { if (MARKITECT_EDITOR_CONFIG.keyboardShortcuts) { document.addEventListener('keydown', (event) => { if (event.ctrlKey || event.metaKey) { switch(event.key) { case 's': event.preventDefault(); this.save(); break; case 'e': event.preventDefault(); this.togglePreview(); break; } } }); } } save() { document.getElementById('save-status').textContent = 'Saved!'; setTimeout(() => { document.getElementById('save-status').textContent = 'Ready'; }, 2000); } togglePreview() { console.log('Toggle preview mode'); } } let markitectEditor;""" html_template = f"""