""" SQLite FTS5 full text search plugin for MarkiTect. Provides advanced full text search capabilities using SQLite's built-in FTS5 virtual table extension for lightweight, high-performance search. """ import sqlite3 import json from typing import Dict, Any, List, Optional, Tuple from pathlib import Path from ...base import BasePlugin, PluginMetadata, PluginType from ...decorators import register_plugin from .indexer import SearchIndexer from .query_parser import QueryParser @register_plugin("fts_search") class FTSSearchPlugin(BasePlugin): """Full Text Search plugin using SQLite FTS5.""" def __init__(self): super().__init__() self.indexer = SearchIndexer() self.query_parser = QueryParser() @property def metadata(self) -> PluginMetadata: return PluginMetadata( name="fts_search", version="1.0.0", description="Full text search using SQLite FTS5", author="MarkiTect Team", plugin_type=PluginType.EXTENSION ) def initialize(self, db_path: str) -> None: """Initialize FTS5 search tables and indexes.""" self.db_path = db_path self.indexer.initialize_fts_tables(db_path) def rebuild_index(self, db_path: str) -> Dict[str, int]: """Rebuild the full text search index.""" return self.indexer.rebuild_index(db_path) def search(self, db_path: str, query: str, content_type: str = "all", limit: int = 20, offset: int = 0) -> List[Dict[str, Any]]: """ Perform full text search. Args: db_path: Path to SQLite database query: Search query (supports FTS5 syntax) content_type: Type of content to search ("all", "files", "schemas") limit: Maximum number of results offset: Result offset for pagination Returns: List of search results with relevance scores """ conn = sqlite3.connect(db_path) conn.row_factory = sqlite3.Row cursor = conn.cursor() results = [] try: # Parse and validate query parsed_query = self.query_parser.parse_query(query) if content_type in ["all", "files"]: results.extend(self._search_files(cursor, parsed_query, limit, offset)) if content_type in ["all", "schemas"]: results.extend(self._search_schemas(cursor, parsed_query, limit, offset)) # Sort by relevance score and apply global limit results.sort(key=lambda x: x.get('score', 0), reverse=True) if content_type == "all": results = results[:limit] except Exception as e: # Fall back to simple LIKE search if FTS5 fails results = self._fallback_search(cursor, query, content_type, limit, offset) finally: conn.close() return results def _search_files(self, cursor: sqlite3.Cursor, query: str, limit: int, offset: int) -> List[Dict[str, Any]]: """Search in markdown files using FTS5.""" cursor.execute(""" SELECT mf.id, mf.filename, mf.content, mf.front_matter, mf.created_at, fts.rank, bm25(fts_files) as score, snippet(fts_files, 1, '', '', '...', 32) as highlight FROM fts_files fts JOIN markdown_files mf ON mf.id = fts.rowid WHERE fts_files MATCH ? ORDER BY score DESC LIMIT ? OFFSET ? """, (query, limit, offset)) results = [] for row in cursor.fetchall(): # Parse front matter front_matter_raw = {} if row['front_matter']: try: front_matter_raw = json.loads(row['front_matter']) except json.JSONDecodeError: pass results.append({ 'type': 'file', 'score': abs(row['score']) if row['score'] else 1.0, 'file': { 'id': row['id'], 'filename': row['filename'], 'content': row['content'], 'front_matter_raw': front_matter_raw, 'created_at': row['created_at'] }, 'highlight': row['highlight'] }) return results def _search_schemas(self, cursor: sqlite3.Cursor, query: str, limit: int, offset: int) -> List[Dict[str, Any]]: """Search in schemas using FTS5.""" cursor.execute(""" SELECT s.id, s.filename, s.title, s.description, s.schema_content, s.created_at, s.updated_at, fts.rank, bm25(fts_schemas) as score, snippet(fts_schemas, 1, '', '', '...', 32) as highlight FROM fts_schemas fts JOIN schemas s ON s.id = fts.rowid WHERE fts_schemas MATCH ? ORDER BY score DESC LIMIT ? OFFSET ? """, (query, limit, offset)) results = [] for row in cursor.fetchall(): # Parse schema content schema_content = {} if row['schema_content']: try: schema_content = json.loads(row['schema_content']) except json.JSONDecodeError: pass results.append({ 'type': 'schema', 'score': abs(row['score']) if row['score'] else 1.0, 'schema': { 'id': row['id'], 'filename': row['filename'], 'title': row['title'], 'description': row['description'], 'schema_content': schema_content, 'created_at': row['created_at'], 'updated_at': row['updated_at'] }, 'highlight': row['highlight'] }) return results def _fallback_search(self, cursor: sqlite3.Cursor, query: str, content_type: str, limit: int, offset: int) -> List[Dict[str, Any]]: """Fallback to simple LIKE search if FTS5 fails.""" results = [] if content_type in ["all", "files"]: cursor.execute(""" SELECT id, filename, content, front_matter, created_at FROM markdown_files WHERE filename LIKE ? OR content LIKE ? ORDER BY CASE WHEN filename LIKE ? THEN 1 ELSE 2 END, created_at DESC LIMIT ? OFFSET ? """, (f"%{query}%", f"%{query}%", f"%{query}%", limit, offset)) for row in cursor.fetchall(): front_matter_raw = {} if row['front_matter']: try: front_matter_raw = json.loads(row['front_matter']) except json.JSONDecodeError: pass results.append({ 'type': 'file', 'score': 1.0, 'file': { 'id': row['id'], 'filename': row['filename'], 'content': row['content'], 'front_matter_raw': front_matter_raw, 'created_at': row['created_at'] }, 'highlight': self._extract_highlight(row['content'] or '', query) }) if content_type in ["all", "schemas"]: cursor.execute(""" SELECT id, filename, title, description, schema_content, created_at, updated_at FROM schemas WHERE filename LIKE ? OR title LIKE ? OR description LIKE ? ORDER BY created_at DESC LIMIT ? OFFSET ? """, (f"%{query}%", f"%{query}%", f"%{query}%", limit, offset)) for row in cursor.fetchall(): schema_content = {} if row['schema_content']: try: schema_content = json.loads(row['schema_content']) except json.JSONDecodeError: pass results.append({ 'type': 'schema', 'score': 1.0, 'schema': { 'id': row['id'], 'filename': row['filename'], 'title': row['title'], 'description': row['description'], 'schema_content': schema_content, 'created_at': row['created_at'], 'updated_at': row['updated_at'] }, 'highlight': self._extract_highlight(row['description'] or '', query) }) return results def _extract_highlight(self, text: str, query: str, max_length: int = 100) -> str: """Extract highlighted snippet from text.""" if not text or not query: return "" query_lower = query.lower() text_lower = text.lower() # Find the first occurrence start = text_lower.find(query_lower) if start == -1: return text[:max_length] + "..." if len(text) > max_length else text # Calculate snippet boundaries snippet_start = max(0, start - max_length // 4) snippet_end = min(len(text), start + len(query) + max_length // 2) snippet = text[snippet_start:snippet_end] # Add ellipsis if truncated if snippet_start > 0: snippet = "..." + snippet if snippet_end < len(text): snippet = snippet + "..." return snippet def get_search_stats(self, db_path: str) -> Dict[str, Any]: """Get search index statistics.""" conn = sqlite3.connect(db_path) cursor = conn.cursor() stats = {} try: # Check if FTS tables exist cursor.execute(""" SELECT name FROM sqlite_master WHERE type='table' AND name LIKE 'fts_%' """) fts_tables = [row[0] for row in cursor.fetchall()] stats['fts_enabled'] = len(fts_tables) > 0 stats['fts_tables'] = fts_tables if stats['fts_enabled']: # Get index statistics for table in fts_tables: cursor.execute(f"SELECT COUNT(*) FROM {table}") count = cursor.fetchone()[0] stats[f'{table}_documents'] = count except sqlite3.Error: stats['fts_enabled'] = False stats['error'] = "FTS tables not available" finally: conn.close() return stats