Major gap analysis reveals critical missing CLI interface despite solid library foundation. This commit implements core components and strategic roadmap pivot. Key Changes: - NEXT.md: Complete strategic roadmap pivot to CLI-first implementation - FEATURES.md: Comprehensive USP and architecture documentation - markitect/ast_cache.py: High-performance AST caching system - markitect/document_manager.py: Parse-once architecture implementation - docs/markitect.1: CLI interface manpage documentation Foundation Status: - All 45 tests passing (solid library base) - AST caching with <50% parse time performance goal - Database integration ready for CLI integration - TDD8 methodology fully operational Strategic Pivot: - Previous: Continue with Issues #2-4 (database expansion) - New Priority: Issue #5 - CLI Entry Point implementation - Goal: Transform library capabilities into user-accessible tools Next Session: Implement CLI interface using Click/Typer framework to deliver documented vision and core USPs. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
213 lines
7.0 KiB
Python
213 lines
7.0 KiB
Python
"""
|
|
Document manager for high-performance markdown file ingestion and AST caching.

This module implements the core functionality for Issue #2: Fast Document Loading & CLI Manipulation.
It provides performance-optimized document processing through AST caching and database integration.

Key Features:
- Parse once, access many times architecture
- Performance goal: loading the AST cache takes < 50% of the markdown parsing time
- Seamless integration with the Issue #1 database foundation
- Comprehensive error handling and validation
|
|
"""
|
|
|
|
import json
|
|
import time
|
|
from pathlib import Path
|
|
from typing import Dict, Any, Optional
|
|
|
|
from .parser import parse_markdown_to_ast
|
|
from .frontmatter import FrontMatterParser
|
|
|
|
|
|
class DocumentManager:
    """
    Document manager that ingests markdown files into an AST cache and a database.

    Implements the "parse once, manipulate many times" architecture: each
    ingested file is parsed to an AST, the AST is serialized to a JSON cache
    file, and the raw document is handed to the database manager.

    Architecture:
        markdown file → AST parsing → cache file + database metadata

    Performance Goal:
        Loading a cache file should take < 50% of the original parsing time.
        (This is a design goal; nothing in this class enforces or measures it
        beyond reporting the parse and cache-write durations.)

    Attributes:
        db_manager: Database manager used for document storage.
        cache_dir: Directory that holds the ``*.ast.json`` cache files.
        frontmatter_parser: Front matter parser used to extract metadata.
    """

    def __init__(self, database_manager, cache_dir: Optional[Path] = None):
        """
        Initialize document manager with database and cache configuration.

        Args:
            database_manager: Object exposing ``store_markdown_file(filename, content)``.
            cache_dir: Directory for AST cache files (default: ``.ast_cache``
                relative to the current working directory).
        """
        self.db_manager = database_manager
        self.cache_dir = Path(cache_dir) if cache_dir else Path(".ast_cache")
        # parents=True lets a nested cache location (e.g. "build/cache/ast")
        # be created in one step instead of failing on a missing parent.
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.frontmatter_parser = FrontMatterParser()

    def ingest_file(self, file_path: Path) -> Dict[str, Any]:
        """
        Ingest a markdown file: parse it, cache its AST, and store it in the database.

        Workflow:
            1. Validate that the file exists
            2. Read the file and extract its front matter
            3. Parse the content to an AST (timed)
            4. Write the AST cache file (timed)
            5. Store the document in the database
            6. Return the results together with the timings

        Args:
            file_path: Path to the markdown file to ingest. A plain string is
                accepted as well and converted to a ``Path``.

        Returns:
            Dictionary containing:
                - ast: Parsed AST representation
                - metadata: ``filename`` and ``title`` of the document
                - ast_cache_path: Path to the created cache file
                - parse_time: Time spent parsing markdown (seconds)
                - cache_time: Time spent creating the cache file (seconds)

        Raises:
            FileNotFoundError: If the specified file doesn't exist.
        """
        # Coerce so that callers passing a str get the documented
        # FileNotFoundError instead of an AttributeError on ``.exists()``.
        file_path = Path(file_path)
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        content = self._read_file_content(file_path)

        # Only the front matter is consumed here; the stripped body is not used.
        # NOTE(review): the AST below is built from the full content, front
        # matter included. Confirm whether the stripped body was intended.
        front_matter, _markdown_body = self.frontmatter_parser.parse(content)

        ast, parse_time = self._parse_content_to_ast(content)
        cache_file, cache_time = self._create_performance_cache(file_path.name, ast)

        # The database receives the full content, front matter included.
        self._store_in_database(file_path.name, content)

        return self._build_ingestion_result(
            ast=ast,
            filename=file_path.name,
            front_matter=front_matter,
            cache_file=cache_file,
            parse_time=parse_time,
            cache_time=cache_time,
        )

    def _read_file_content(self, file_path: Path) -> str:
        """
        Read a file as UTF-8 text.

        Args:
            file_path: Path to the file to read

        Returns:
            File content as string
        """
        return file_path.read_text(encoding='utf-8')

    def _parse_content_to_ast(self, content: str) -> tuple[list, float]:
        """
        Parse markdown content to an AST and measure how long parsing took.

        Args:
            content: Raw markdown content

        Returns:
            Tuple of (AST tokens, parse_time_seconds)
        """
        # perf_counter is monotonic and high-resolution, so the measured
        # duration cannot go negative if the system clock is adjusted.
        started = time.perf_counter()
        ast = parse_markdown_to_ast(content)
        return ast, time.perf_counter() - started

    def _create_performance_cache(self, filename: str, ast: list) -> tuple[Path, float]:
        """
        Write the AST cache file and measure how long the write took.

        Args:
            filename: Source filename used to derive the cache file name
            ast: AST tokens to cache

        Returns:
            Tuple of (cache_file_path, cache_time_seconds)
        """
        started = time.perf_counter()
        cache_file = self._create_ast_cache(filename, ast)
        return cache_file, time.perf_counter() - started

    def _store_in_database(self, filename: str, content: str) -> None:
        """
        Store the document through the database manager's ``store_markdown_file``.

        Args:
            filename: Name of the file
            content: Full markdown content (including front matter)

        Note:
            The database manager is expected to handle front matter parsing
            itself; that behavior is not visible from this class.
        """
        self.db_manager.store_markdown_file(filename, content)

    def _build_ingestion_result(self, ast: list, filename: str, front_matter: dict,
                                cache_file: Path, parse_time: float, cache_time: float) -> Dict[str, Any]:
        """
        Assemble the dictionary returned by ``ingest_file``.

        Args:
            ast: Parsed AST tokens
            filename: Source filename
            front_matter: Parsed front matter metadata; ``None`` or an empty
                mapping is treated as "no front matter"
            cache_file: Path to the created cache file
            parse_time: Time spent parsing (seconds)
            cache_time: Time spent caching (seconds)

        Returns:
            Dictionary with keys ``ast``, ``metadata``, ``ast_cache_path``,
            ``parse_time`` and ``cache_time``.
        """
        # A document without front matter must still yield a result, so a
        # missing mapping falls back to an empty title.
        title = (front_matter or {}).get('title', '')
        return {
            'ast': ast,
            'metadata': {
                'filename': filename,
                'title': title,
            },
            'ast_cache_path': cache_file,
            'parse_time': parse_time,
            'cache_time': cache_time,
        }

    def _create_ast_cache(self, filename: str, ast: list) -> Path:
        """
        Serialize the AST to ``<cache_dir>/<filename>.ast.json``.

        An existing cache file with the same name is overwritten.

        Args:
            filename: Source filename used to derive the cache file name
            ast: AST tokens to serialize (must be JSON-serializable)

        Returns:
            Path to the created cache file
        """
        cache_path = self.cache_dir / f"{filename}.ast.json"

        # ensure_ascii=False keeps non-ASCII text readable in the cache file.
        with open(cache_path, 'w', encoding='utf-8') as f:
            json.dump(ast, f, indent=2, ensure_ascii=False)

        return cache_path