""" Document manager for high-performance markdown file ingestion and AST caching. This module implements the core functionality for Issue #2: Fast Document Loading & CLI Manipulation. It provides performance-optimized document processing through AST caching and database integration. Key Features: - Parse once, access many times architecture - AST cache loading < 50% of markdown parsing time - Seamless integration with Issue #1 database foundation - Comprehensive error handling and validation """ import json import time from pathlib import Path from typing import Dict, Any, Optional from .parser import parse_markdown_to_ast from .frontmatter import FrontMatterParser class DocumentManager: """ High-performance document manager for markdown file processing. Implements the "parse once, manipulate many times" architecture by creating fast-loading AST cache files alongside database metadata storage. Architecture: markdown file → AST parsing → cache file + database metadata Performance Goal: Cache loading must be < 50% of original parsing time Attributes: db_manager: Database manager for metadata storage cache_dir: Directory for AST cache files frontmatter_parser: YAML front matter processor """ def __init__(self, database_manager, cache_dir: Optional[Path] = None): """ Initialize document manager with database and cache configuration. Args: database_manager: DatabaseManager instance for metadata storage cache_dir: Directory for AST cache files (default: .ast_cache) """ self.db_manager = database_manager self.cache_dir = Path(cache_dir) if cache_dir else Path(".ast_cache") self.cache_dir.mkdir(exist_ok=True) self.frontmatter_parser = FrontMatterParser() def ingest_file(self, file_path: Path) -> Dict[str, Any]: """ Ingest a markdown file with performance-optimized AST caching. Implements the core "parse once, manipulate many times" workflow: 1. Validates file existence 2. Parses markdown content to AST 3. Creates fast-loading AST cache file 4. Stores metadata in database 5. Returns processing results with performance metrics Args: file_path: Path to markdown file to ingest Returns: Dictionary containing: - ast: Parsed AST representation - metadata: File metadata (filename, title, etc.) - ast_cache_path: Path to created cache file - parse_time: Time spent parsing markdown (seconds) - cache_time: Time spent creating cache (seconds) Raises: FileNotFoundError: If the specified file doesn't exist Performance: Initial parse creates overhead, but subsequent cache loads will be < 50% of this parse time. """ # Validate file exists if not file_path.exists(): raise FileNotFoundError(f"File not found: {file_path}") # Read file content content = self._read_file_content(file_path) # Parse front matter for metadata extraction front_matter, markdown_content = self.frontmatter_parser.parse(content) # Parse to AST with performance timing ast, parse_time = self._parse_content_to_ast(content) # Create cache file with performance timing cache_file, cache_time = self._create_performance_cache(file_path.name, ast) # Store in database (handles front matter parsing internally) self._store_in_database(file_path.name, content) # Return comprehensive result return self._build_ingestion_result( ast=ast, filename=file_path.name, front_matter=front_matter, cache_file=cache_file, parse_time=parse_time, cache_time=cache_time ) def _read_file_content(self, file_path: Path) -> str: """ Read file content with proper encoding. Args: file_path: Path to file to read Returns: File content as string """ return file_path.read_text(encoding='utf-8') def _parse_content_to_ast(self, content: str) -> tuple[list, float]: """ Parse markdown content to AST with performance timing. Args: content: Raw markdown content Returns: Tuple of (AST tokens, parse_time_seconds) """ start_time = time.time() ast = parse_markdown_to_ast(content) parse_time = time.time() - start_time return ast, parse_time def _create_performance_cache(self, filename: str, ast: list) -> tuple[Path, float]: """ Create AST cache file with performance timing. Args: filename: Source filename for cache naming ast: AST tokens to cache Returns: Tuple of (cache_file_path, cache_time_seconds) """ start_time = time.time() cache_file = self._create_ast_cache(filename, ast) cache_time = time.time() - start_time return cache_file, cache_time def _store_in_database(self, filename: str, content: str) -> None: """ Store document in database using existing API. Args: filename: Name of the file content: Full markdown content (including front matter) Note: The database manager handles front matter parsing internally. """ self.db_manager.store_markdown_file(filename, content) def _build_ingestion_result(self, ast: list, filename: str, front_matter: dict, cache_file: Path, parse_time: float, cache_time: float) -> Dict[str, Any]: """ Build comprehensive ingestion result dictionary. Args: ast: Parsed AST tokens filename: Source filename front_matter: Parsed front matter metadata cache_file: Path to created cache file parse_time: Time spent parsing (seconds) cache_time: Time spent caching (seconds) Returns: Structured result dictionary with all ingestion data """ return { 'ast': ast, 'metadata': { 'filename': filename, 'title': front_matter.get('title', ''), }, 'ast_cache_path': cache_file, 'parse_time': parse_time, 'cache_time': cache_time } def _create_ast_cache(self, filename: str, ast: list) -> Path: """ Create AST cache file in JSON format. Args: filename: Source filename for cache naming ast: AST tokens to serialize Returns: Path to created cache file """ cache_filename = f"{filename}.ast.json" cache_path = self.cache_dir / cache_filename with open(cache_path, 'w', encoding='utf-8') as f: json.dump(ast, f, indent=2, ensure_ascii=False) return cache_path def list_files(self) -> list: """ List all markdown files in the system. Returns: List of dictionaries containing file metadata including filename, size, and modification date information. """ # Get files from database db_files = self.db_manager.list_markdown_files() # Enhance with file system information enhanced_files = [] for file_info in db_files: enhanced_info = { 'filename': file_info['filename'], 'id': file_info['id'], 'created_at': file_info['created_at'], 'front_matter': file_info['front_matter'] } # Try to get file system stats if file exists try: file_path = Path(file_info['filename']) if file_path.exists(): stat = file_path.stat() enhanced_info['size'] = f"{stat.st_size} bytes" enhanced_info['modified'] = stat.st_mtime else: enhanced_info['size'] = 'unknown' enhanced_info['modified'] = 'file not found' except Exception: enhanced_info['size'] = 'unknown' enhanced_info['modified'] = 'unknown' enhanced_files.append(enhanced_info) return enhanced_files def get_file(self, file_path: str) -> Dict[str, Any]: """ Retrieve a markdown file from the database. Args: file_path: Path to the markdown file to retrieve Returns: Dictionary containing file content and metadata Raises: FileNotFoundError: If file is not found in database """ if not self.db_manager: raise ValueError("Database manager not initialized") # Get file from database file_data = self.db_manager.get_markdown_file(file_path) if file_data is None: raise FileNotFoundError(f"File '{file_path}' not found in database") return { 'content': file_data.get('content', ''), 'metadata': { 'filename': file_data.get('filename', file_path), 'front_matter': file_data.get('front_matter'), 'size': len(file_data.get('content', '')), 'modified': file_data.get('modified') } } def render_file(self, input_file: str, output_file: str, template: str = None, css: str = None, edit_mode: bool = False, editor_theme: str = 'github', keyboard_shortcuts: bool = True) -> Dict[str, Any]: """ Render a markdown file to HTML with client-side rendering capabilities. Creates an HTML file with embedded markdown content that is rendered client-side using JavaScript markdown parser. Args: input_file: Path to input markdown file output_file: Path to output HTML file template: Template to use (optional) css: CSS file to include (optional) Returns: Dictionary with rendering results and metadata Raises: FileNotFoundError: If input file doesn't exist """ import json input_path = Path(input_file) output_path = Path(output_file) # Validate input file exists if not input_path.exists(): raise FileNotFoundError(f"Input file not found: {input_path}") # Read markdown content markdown_content = input_path.read_text(encoding='utf-8') # Extract title from markdown (first h1 heading) title = self._extract_title_from_markdown(markdown_content) # Generate HTML content html_content = self._generate_html_template( markdown_content=markdown_content, title=title, css=css, template=template, edit_mode=edit_mode, editor_theme=editor_theme, keyboard_shortcuts=keyboard_shortcuts ) # Write HTML file output_path.parent.mkdir(parents=True, exist_ok=True) output_path.write_text(html_content, encoding='utf-8') return { 'input_file': str(input_path), 'output_file': str(output_path), 'title': title, 'template': template, 'css': css } def _extract_title_from_markdown(self, content: str) -> str: """Extract title from markdown content (first h1 heading).""" import re # Look for first h1 heading match = re.search(r'^#\s+(.+)$', content, re.MULTILINE) if match: return match.group(1).strip() return "Markdown Document" def _generate_html_template(self, markdown_content: str, title: str, css: str = None, template: str = None, edit_mode: bool = False, editor_theme: str = 'github', keyboard_shortcuts: bool = True) -> str: """Generate HTML template with embedded markdown and client-side rendering.""" import json from pathlib import Path # Escape the markdown content for JavaScript js_markdown_content = json.dumps(markdown_content) # Handle CSS styles css_content = "" if css: # Try to read CSS file content and embed it try: css_path = Path(css) if css_path.exists(): css_file_content = css_path.read_text(encoding='utf-8') css_content = f"" else: # Fallback to link if file doesn't exist css_content = f'' except Exception: # Fallback to link on any error css_content = f'' # Get template-specific CSS template_css = self._get_template_css(template) # Default CSS for basic styling default_css = f""" """ # Add editor-specific content if in edit mode editor_scripts = "" editor_config = "" editor_css = "" body_classes = "" if edit_mode: body_classes = ' class="markitect-edit-mode"' editor_css = """ """ editor_config = f""" const MARKITECT_EDIT_MODE = true; const MARKITECT_EDITOR_CONFIG = {{ theme: '{editor_theme}', keyboardShortcuts: {str(keyboard_shortcuts).lower()}, autosave: true, sections: true }};""" editor_scripts = """ class MarkitectEditor { constructor() { this.hasEdits = false; // Track if any edits have been made this.initializeEditor(); this.setupKeyboardShortcuts(); } initializeEditor() { // Control panel is already in HTML, just make content editable this.makeContentEditable(); // Auto-expand control panel briefly to show it's available setTimeout(() => { const panel = document.getElementById('markitect-control-panel'); if (panel) { panel.classList.add('expanded'); setTimeout(() => { panel.classList.remove('expanded'); }, 2000); // Show for 2 seconds then minimize } }, 1000); } makeContentEditable() { const content = document.getElementById('markdown-content'); if (content) { content.addEventListener('click', this.handleSectionClick.bind(this)); this.markSections(content); } } markSections(element) { // Clear existing section markers (except edited ones) const existingSections = element.querySelectorAll('.markitect-section-editable:not([data-edited])'); existingSections.forEach(section => { section.classList.remove('markitect-section-editable'); section.removeAttribute('data-section'); }); // Mark new sections (skip elements inside edited wrappers) const sections = element.querySelectorAll('h1, h2, h3, h4, h5, h6, p, blockquote, pre, ul, ol'); let sectionIndex = 0; sections.forEach((section) => { // Skip if this element is inside an edited wrapper if (section.closest('[data-edited]')) { return; } // Skip if already marked as edited wrapper if (section.hasAttribute('data-edited')) { return; } section.classList.add('markitect-section-editable'); section.setAttribute('data-section', sectionIndex++); }); } handleSectionClick(event) { const section = event.target.closest('.markitect-section-editable'); if (section && !section.querySelector('textarea')) { this.editSection(section); } } editSection(section) { const originalContent = section.innerHTML; const textarea = document.createElement('textarea'); textarea.value = this.htmlToMarkdown(originalContent); textarea.className = 'edit-mode'; // Get original element font size and style const computedStyle = window.getComputedStyle(section); const originalFontSize = computedStyle.fontSize; const originalLineHeight = computedStyle.lineHeight; // Apply matching font size to textarea textarea.style.fontSize = originalFontSize; if (originalLineHeight !== 'normal') { textarea.style.lineHeight = originalLineHeight; } // Auto-sizing function const autoResize = () => { // Temporarily disable transition for accurate measurement const transition = textarea.style.transition; textarea.style.transition = 'none'; // Reset height to measure scrollHeight textarea.style.height = 'auto'; // Calculate based on actual content with more reasonable constraints const contentHeight = textarea.scrollHeight; const padding = 24; // 12px top + 12px bottom // More reasonable sizing: min 2 lines, max 15 lines const lineCount = textarea.value.split('\\n').length; const minHeight = Math.max(60, lineCount * 24 + padding); // ~24px per line const maxHeight = 360; // Maximum height constraint const newHeight = Math.max(60, Math.min(maxHeight, Math.max(minHeight, contentHeight + 4))); textarea.style.height = newHeight + 'px'; // Re-enable transition textarea.style.transition = transition; }; // Auto-resize on input and paste textarea.addEventListener('input', autoResize); textarea.addEventListener('paste', () => setTimeout(autoResize, 10)); // Initial sizing after DOM update setTimeout(autoResize, 20); textarea.addEventListener('blur', () => { this.hasEdits = true; // Mark that edits have been made // Check if the content contains paragraph breaks that should create separate sections const content = textarea.value.trim(); const paragraphs = content.split(/\\n\\s*\\n/).filter(p => p.trim()); if (paragraphs.length > 1) { // Multiple paragraphs - create separate sections const parentNode = section.parentNode; const sectionIndex = section.getAttribute('data-section'); const nextSibling = section.nextSibling; // Remember position // Remove the original section parentNode.removeChild(section); // Create separate sections for each paragraph and insert at correct position paragraphs.forEach((paragraph, index) => { const wrapper = document.createElement('div'); wrapper.innerHTML = marked.parse(paragraph.trim()); wrapper.classList.add('markitect-section-editable'); wrapper.setAttribute('data-section', sectionIndex + '_' + index); wrapper.setAttribute('data-edited', 'true'); // Insert at the correct position (before nextSibling) parentNode.insertBefore(wrapper, nextSibling); }); } else { // Single content block - create one wrapper const wrapper = document.createElement('div'); wrapper.innerHTML = marked.parse(content); wrapper.classList.add('markitect-section-editable'); wrapper.setAttribute('data-section', section.getAttribute('data-section')); wrapper.setAttribute('data-edited', 'true'); // Replace the section with the wrapper section.parentNode.replaceChild(wrapper, section); } // Re-mark sections in the entire document, but skip edited wrappers this.markSections(document.getElementById('markdown-content')); }); section.innerHTML = ''; section.appendChild(textarea); textarea.focus(); } htmlToMarkdown(html) { // Create a temporary element to parse the HTML const temp = document.createElement('div'); temp.innerHTML = html; // Better HTML to Markdown conversion that preserves structure let markdown = ''; const processNode = (node) => { if (node.nodeType === Node.TEXT_NODE) { return node.textContent; } if (node.nodeType === Node.ELEMENT_NODE) { const tagName = node.tagName.toLowerCase(); switch (tagName) { case 'h1': return '# ' + node.textContent; case 'h2': return '## ' + node.textContent; case 'h3': return '### ' + node.textContent; case 'h4': return '#### ' + node.textContent; case 'h5': return '##### ' + node.textContent; case 'h6': return '###### ' + node.textContent; case 'p': // Handle paragraphs with potential inline formatting const childText = Array.from(node.childNodes).map(processNode).join(''); return childText; case 'strong': case 'b': return '**' + node.textContent + '**'; case 'em': case 'i': return '*' + node.textContent + '*'; case 'code': return '`' + node.textContent + '`'; case 'pre': // Handle code blocks const codeContent = node.textContent; return '```\\n' + codeContent + '\\n```'; case 'blockquote': const quoteLines = node.textContent.split('\\n'); return quoteLines.map(line => '> ' + line).join('\\n'); case 'ul': // Handle unordered lists const ulItems = Array.from(node.querySelectorAll('li')); return ulItems.map(li => '- ' + li.textContent).join('\\n'); case 'ol': // Handle ordered lists const olItems = Array.from(node.querySelectorAll('li')); return olItems.map((li, index) => (index + 1) + '. ' + li.textContent).join('\\n'); case 'br': return '\\n'; default: return node.textContent; } } return ''; }; // Process each child node and add appropriate spacing const nodes = Array.from(temp.childNodes); nodes.forEach((node, index) => { const result = processNode(node); if (result.trim()) { if (index > 0 && markdown.trim()) { markdown += '\\n\\n'; } markdown += result; } }); return markdown.trim(); } setupKeyboardShortcuts() { if (MARKITECT_EDITOR_CONFIG.keyboardShortcuts) { document.addEventListener('keydown', (event) => { if (event.ctrlKey || event.metaKey) { switch(event.key) { case 's': event.preventDefault(); this.save(); break; case 'e': event.preventDefault(); this.togglePreview(); break; } } }); } } save() { try { // Get the current markdown content from the editor const markdownContent = this.getMarkdownContent(); // Create filename with timestamp suffix for backup convention const now = new Date(); const timestamp = now.toISOString().slice(0, 19).replace(/:/g, '-').replace('T', '-'); const originalFilename = window.location.pathname.split('/').pop().replace('.html', '.md'); const backupFilename = `${originalFilename.replace('.md', '')}-edited-${timestamp}.md`; // Create and download the file const blob = new Blob([markdownContent], { type: 'text/markdown' }); const url = URL.createObjectURL(blob); const a = document.createElement('a'); a.href = url; a.download = backupFilename; document.body.appendChild(a); a.click(); document.body.removeChild(a); URL.revokeObjectURL(url); // Update status with filename convention info const statusEl = document.getElementById('save-status'); statusEl.textContent = `Downloaded: ${backupFilename}`; statusEl.title = 'File saved with timestamp to avoid overwriting original'; setTimeout(() => { statusEl.textContent = 'Ready'; statusEl.title = ''; }, 5000); } catch (error) { document.getElementById('save-status').textContent = 'Save failed!'; console.error('Save error:', error); setTimeout(() => { document.getElementById('save-status').textContent = 'Ready'; }, 3000); } } getMarkdownContent() { // If no edits have been made, return the original markdown content if (!this.hasEdits) { return markdownContent; } // Reconstruct markdown content from the current state of sections const content = document.getElementById('markdown-content'); if (!content) { return markdownContent; // fallback to original } // Simple approach: get the text content and convert back to markdown // This is a basic implementation - could be enhanced for better preservation const sections = content.querySelectorAll('.markitect-section-editable'); let reconstructed = ''; sections.forEach(section => {{ // Handle edited wrappers differently if (section.hasAttribute('data-edited')) {{ // For edited sections, convert the child elements back to markdown const childElements = section.children; for (let i = 0; i < childElements.length; i++) {{ const child = childElements[i]; const tagName = child.tagName.toLowerCase(); const text = child.textContent.trim(); if (tagName.startsWith('h')) {{ const level = parseInt(tagName.charAt(1)); reconstructed += '#'.repeat(level) + ' ' + text + '\\n\\n'; }} else if (tagName === 'p') {{ reconstructed += text + '\\n\\n'; }} else if (tagName === 'blockquote') {{ reconstructed += '> ' + text + '\\n\\n'; }} else if (tagName === 'pre') {{ reconstructed += '```\\n' + text + '\\n```\\n\\n'; }} else if (tagName === 'ul') {{ const items = child.querySelectorAll('li'); items.forEach(item => {{ reconstructed += '- ' + item.textContent.trim() + '\\n'; }}); reconstructed += '\\n'; }} else if (tagName === 'ol') {{ const items = child.querySelectorAll('li'); items.forEach((item, index) => {{ reconstructed += (index + 1) + '. ' + item.textContent.trim() + '\\n'; }}); reconstructed += '\\n'; }} else {{ reconstructed += text + '\\n\\n'; }} }} }} else {{ // Handle regular sections const tagName = section.tagName.toLowerCase(); const text = section.textContent.trim(); if (tagName.startsWith('h')) {{ const level = parseInt(tagName.charAt(1)); reconstructed += '#'.repeat(level) + ' ' + text + '\\n\\n'; }} else if (tagName === 'p') {{ reconstructed += text + '\\n\\n'; }} else if (tagName === 'blockquote') {{ reconstructed += '> ' + text + '\\n\\n'; }} else if (tagName === 'pre') {{ reconstructed += '```\\n' + text + '\\n```\\n\\n'; }} else if (tagName === 'ul') {{ const items = section.querySelectorAll('li'); items.forEach(item => {{ reconstructed += '- ' + item.textContent.trim() + '\\n'; }}); reconstructed += '\\n'; }} else if (tagName === 'ol') {{ const items = section.querySelectorAll('li'); items.forEach((item, index) => {{ reconstructed += (index + 1) + '. ' + item.textContent.trim() + '\\n'; }}); reconstructed += '\\n'; }} else {{ reconstructed += text + '\\n\\n'; }} }} }}); return reconstructed.trim(); } togglePreview() { console.log('Toggle preview mode'); } } let markitectEditor; // Control panel toggle functionality function toggleControlPanel() { const panel = document.getElementById('markitect-control-panel'); if (panel) { panel.classList.toggle('expanded'); } } // Auto-close panel when clicking outside document.addEventListener('click', function(event) { const panel = document.getElementById('markitect-control-panel'); if (panel && panel.classList.contains('expanded')) { if (!panel.contains(event.target)) { panel.classList.remove('expanded'); } } });""" # Edit mode status and error reporting section edit_mode_html = "" if edit_mode: # Get version info for header try: import markitect from pathlib import Path import subprocess # Get base version version = "0.3.0" # fallback try: from importlib.metadata import version as get_version version = get_version('markitect') except: pass # Get git commit with timestamp and local changes info git_info = "" try: repo_path = Path(__file__).parent.parent # Get commit hash and timestamp result = subprocess.run(['git', 'rev-parse', '--short', 'HEAD'], capture_output=True, text=True, cwd=repo_path) if result.returncode == 0: commit_hash = result.stdout.strip() # Get commit timestamp timestamp_result = subprocess.run(['git', 'show', '-s', '--format=%ci', 'HEAD'], capture_output=True, text=True, cwd=repo_path) commit_time = "" if timestamp_result.returncode == 0: from datetime import datetime # Parse git timestamp and format it nicely git_time = timestamp_result.stdout.strip() try: dt = datetime.fromisoformat(git_time.replace(' +', '+')) commit_time = f" ({dt.strftime('%Y-%m-%d %H:%M')})" except: pass git_info = f"+{commit_hash}{commit_time}" # Check for uncommitted changes status_result = subprocess.run(['git', 'status', '--porcelain'], capture_output=True, text=True, cwd=repo_path) if status_result.returncode == 0 and status_result.stdout.strip(): # Get timestamp of most recent uncommitted change import os import glob latest_change = 0 for line in status_result.stdout.strip().split('\n'): if line.strip(): # Extract filename (skip first 3 chars which are status indicators) filename = line[3:].strip() try: file_path = repo_path / filename if file_path.exists(): mtime = os.path.getmtime(file_path) latest_change = max(latest_change, mtime) except: pass if latest_change > 0: change_dt = datetime.fromtimestamp(latest_change) git_info += f" including local changes until {change_dt.strftime('%Y-%m-%d %H:%M')}" except: pass version_info = f"{version}{git_info}" except: version_info = "0.3.0" edit_mode_html = f"""
v{version_info}