feat: Strategic pivot to CLI implementation with comprehensive foundation
Major gap analysis reveals critical missing CLI interface despite solid library foundation. This commit implements core components and strategic roadmap pivot. Key Changes: - NEXT.md: Complete strategic roadmap pivot to CLI-first implementation - FEATURES.md: Comprehensive USP and architecture documentation - markitect/ast_cache.py: High-performance AST caching system - markitect/document_manager.py: Parse-once architecture implementation - docs/markitect.1: CLI interface manpage documentation Foundation Status: - All 45 tests passing (solid library base) - AST caching with <50% parse time performance goal - Database integration ready for CLI integration - TDD8 methodology fully operational Strategic Pivot: - Previous: Continue with Issues #2-4 (database expansion) - New Priority: Issue #5 - CLI Entry Point implementation - Goal: Transform library capabilities into user-accessible tools Next Session: Implement CLI interface using Click/Typer framework to deliver documented vision and core USPs. 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
213
markitect/document_manager.py
Normal file
213
markitect/document_manager.py
Normal file
@@ -0,0 +1,213 @@
|
||||
"""
|
||||
Document manager for high-performance markdown file ingestion and AST caching.
|
||||
|
||||
This module implements the core functionality for Issue #2: Fast Document Loading & CLI Manipulation.
|
||||
It provides performance-optimized document processing through AST caching and database integration.
|
||||
|
||||
Key Features:
|
||||
- Parse once, access many times architecture
|
||||
- AST cache loading < 50% of markdown parsing time
|
||||
- Seamless integration with Issue #1 database foundation
|
||||
- Comprehensive error handling and validation
|
||||
"""
|
||||
|
||||
import json
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
from .parser import parse_markdown_to_ast
|
||||
from .frontmatter import FrontMatterParser
|
||||
|
||||
|
||||
class DocumentManager:
|
||||
"""
|
||||
High-performance document manager for markdown file processing.
|
||||
|
||||
Implements the "parse once, manipulate many times" architecture by creating
|
||||
fast-loading AST cache files alongside database metadata storage.
|
||||
|
||||
Architecture:
|
||||
markdown file → AST parsing → cache file + database metadata
|
||||
|
||||
Performance Goal:
|
||||
Cache loading must be < 50% of original parsing time
|
||||
|
||||
Attributes:
|
||||
db_manager: Database manager for metadata storage
|
||||
cache_dir: Directory for AST cache files
|
||||
frontmatter_parser: YAML front matter processor
|
||||
"""
|
||||
|
||||
def __init__(self, database_manager, cache_dir: Optional[Path] = None):
|
||||
"""
|
||||
Initialize document manager with database and cache configuration.
|
||||
|
||||
Args:
|
||||
database_manager: DatabaseManager instance for metadata storage
|
||||
cache_dir: Directory for AST cache files (default: .ast_cache)
|
||||
"""
|
||||
self.db_manager = database_manager
|
||||
self.cache_dir = Path(cache_dir) if cache_dir else Path(".ast_cache")
|
||||
self.cache_dir.mkdir(exist_ok=True)
|
||||
self.frontmatter_parser = FrontMatterParser()
|
||||
|
||||
def ingest_file(self, file_path: Path) -> Dict[str, Any]:
|
||||
"""
|
||||
Ingest a markdown file with performance-optimized AST caching.
|
||||
|
||||
Implements the core "parse once, manipulate many times" workflow:
|
||||
1. Validates file existence
|
||||
2. Parses markdown content to AST
|
||||
3. Creates fast-loading AST cache file
|
||||
4. Stores metadata in database
|
||||
5. Returns processing results with performance metrics
|
||||
|
||||
Args:
|
||||
file_path: Path to markdown file to ingest
|
||||
|
||||
Returns:
|
||||
Dictionary containing:
|
||||
- ast: Parsed AST representation
|
||||
- metadata: File metadata (filename, title, etc.)
|
||||
- ast_cache_path: Path to created cache file
|
||||
- parse_time: Time spent parsing markdown (seconds)
|
||||
- cache_time: Time spent creating cache (seconds)
|
||||
|
||||
Raises:
|
||||
FileNotFoundError: If the specified file doesn't exist
|
||||
|
||||
Performance:
|
||||
Initial parse creates overhead, but subsequent cache loads
|
||||
will be < 50% of this parse time.
|
||||
"""
|
||||
# Validate file exists
|
||||
if not file_path.exists():
|
||||
raise FileNotFoundError(f"File not found: {file_path}")
|
||||
|
||||
# Read file content
|
||||
content = self._read_file_content(file_path)
|
||||
|
||||
# Parse front matter for metadata extraction
|
||||
front_matter, markdown_content = self.frontmatter_parser.parse(content)
|
||||
|
||||
# Parse to AST with performance timing
|
||||
ast, parse_time = self._parse_content_to_ast(content)
|
||||
|
||||
# Create cache file with performance timing
|
||||
cache_file, cache_time = self._create_performance_cache(file_path.name, ast)
|
||||
|
||||
# Store in database (handles front matter parsing internally)
|
||||
self._store_in_database(file_path.name, content)
|
||||
|
||||
# Return comprehensive result
|
||||
return self._build_ingestion_result(
|
||||
ast=ast,
|
||||
filename=file_path.name,
|
||||
front_matter=front_matter,
|
||||
cache_file=cache_file,
|
||||
parse_time=parse_time,
|
||||
cache_time=cache_time
|
||||
)
|
||||
|
||||
def _read_file_content(self, file_path: Path) -> str:
|
||||
"""
|
||||
Read file content with proper encoding.
|
||||
|
||||
Args:
|
||||
file_path: Path to file to read
|
||||
|
||||
Returns:
|
||||
File content as string
|
||||
"""
|
||||
return file_path.read_text(encoding='utf-8')
|
||||
|
||||
def _parse_content_to_ast(self, content: str) -> tuple[list, float]:
|
||||
"""
|
||||
Parse markdown content to AST with performance timing.
|
||||
|
||||
Args:
|
||||
content: Raw markdown content
|
||||
|
||||
Returns:
|
||||
Tuple of (AST tokens, parse_time_seconds)
|
||||
"""
|
||||
start_time = time.time()
|
||||
ast = parse_markdown_to_ast(content)
|
||||
parse_time = time.time() - start_time
|
||||
return ast, parse_time
|
||||
|
||||
def _create_performance_cache(self, filename: str, ast: list) -> tuple[Path, float]:
|
||||
"""
|
||||
Create AST cache file with performance timing.
|
||||
|
||||
Args:
|
||||
filename: Source filename for cache naming
|
||||
ast: AST tokens to cache
|
||||
|
||||
Returns:
|
||||
Tuple of (cache_file_path, cache_time_seconds)
|
||||
"""
|
||||
start_time = time.time()
|
||||
cache_file = self._create_ast_cache(filename, ast)
|
||||
cache_time = time.time() - start_time
|
||||
return cache_file, cache_time
|
||||
|
||||
def _store_in_database(self, filename: str, content: str) -> None:
|
||||
"""
|
||||
Store document in database using existing API.
|
||||
|
||||
Args:
|
||||
filename: Name of the file
|
||||
content: Full markdown content (including front matter)
|
||||
|
||||
Note:
|
||||
The database manager handles front matter parsing internally.
|
||||
"""
|
||||
self.db_manager.store_markdown_file(filename, content)
|
||||
|
||||
def _build_ingestion_result(self, ast: list, filename: str, front_matter: dict,
|
||||
cache_file: Path, parse_time: float, cache_time: float) -> Dict[str, Any]:
|
||||
"""
|
||||
Build comprehensive ingestion result dictionary.
|
||||
|
||||
Args:
|
||||
ast: Parsed AST tokens
|
||||
filename: Source filename
|
||||
front_matter: Parsed front matter metadata
|
||||
cache_file: Path to created cache file
|
||||
parse_time: Time spent parsing (seconds)
|
||||
cache_time: Time spent caching (seconds)
|
||||
|
||||
Returns:
|
||||
Structured result dictionary with all ingestion data
|
||||
"""
|
||||
return {
|
||||
'ast': ast,
|
||||
'metadata': {
|
||||
'filename': filename,
|
||||
'title': front_matter.get('title', ''),
|
||||
},
|
||||
'ast_cache_path': cache_file,
|
||||
'parse_time': parse_time,
|
||||
'cache_time': cache_time
|
||||
}
|
||||
|
||||
def _create_ast_cache(self, filename: str, ast: list) -> Path:
|
||||
"""
|
||||
Create AST cache file in JSON format.
|
||||
|
||||
Args:
|
||||
filename: Source filename for cache naming
|
||||
ast: AST tokens to serialize
|
||||
|
||||
Returns:
|
||||
Path to created cache file
|
||||
"""
|
||||
cache_filename = f"{filename}.ast.json"
|
||||
cache_path = self.cache_dir / cache_filename
|
||||
|
||||
with open(cache_path, 'w', encoding='utf-8') as f:
|
||||
json.dump(ast, f, indent=2, ensure_ascii=False)
|
||||
|
||||
return cache_path
|
||||
Reference in New Issue
Block a user