Added comprehensive full text search capabilities as a lightweight plugin. Key features: - SQLite FTS5-based search engine with no external dependencies - Automatic indexing via database triggers for real-time updates - Advanced query support: phrase search, boolean operators, proximity search - Complete CLI interface with search commands - Graceful fallback to LIKE queries when FTS5 unavailable - Plugin architecture integration for extensibility CLI Commands: - `markitect search init` - Initialize search indexes - `markitect search query` - Perform full text searches - `markitect search status` - View index statistics - `markitect search rebuild` - Rebuild indexes from scratch Search Features: - Content type filtering (files, schemas, all) - Result pagination and formatting options - Query validation and syntax assistance - Performance optimization and index maintenance Technical Implementation: - FTSSearchPlugin: Main search plugin class - SearchIndexer: FTS5 table management and indexing - QueryParser: Query optimization and FTS5 syntax conversion - Comprehensive error handling and fallback mechanisms - 25 test cases covering all functionality Documentation includes complete usage guide and examples. Resolves issue #83: Full text search 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
307 lines
11 KiB
Python
307 lines
11 KiB
Python
"""
|
|
SQLite FTS5 full text search plugin for MarkiTect.
|
|
|
|
Provides advanced full text search capabilities using SQLite's built-in
|
|
FTS5 virtual table extension for lightweight, high-performance search.
|
|
"""
|
|
|
|
import sqlite3
|
|
import json
|
|
from typing import Dict, Any, List, Optional, Tuple
|
|
from pathlib import Path
|
|
|
|
from ...base import BasePlugin, PluginMetadata, PluginType
|
|
from ...decorators import register_plugin
|
|
from .indexer import SearchIndexer
|
|
from .query_parser import QueryParser
|
|
|
|
|
|
@register_plugin("fts_search")
|
|
class FTSSearchPlugin(BasePlugin):
|
|
"""Full Text Search plugin using SQLite FTS5."""
|
|
|
|
def __init__(self):
    """Create the plugin together with its indexer and query parser."""
    super().__init__()
    # The right-hand tuple is evaluated left to right, so the indexer is
    # still constructed before the query parser.
    self.indexer, self.query_parser = SearchIndexer(), QueryParser()
|
|
|
|
@property
def metadata(self) -> PluginMetadata:
    """Return the static descriptor (name, version, author, type) of this plugin."""
    descriptor = PluginMetadata(
        name="fts_search",
        version="1.0.0",
        description="Full text search using SQLite FTS5",
        author="MarkiTect Team",
        plugin_type=PluginType.EXTENSION,
    )
    return descriptor
|
|
|
|
def initialize(self, db_path: str) -> None:
    """Record the database path and let the indexer set up its FTS tables.

    Args:
        db_path: Path to the SQLite database. It is stored on the instance
            as ``db_path`` and passed to the indexer's
            ``initialize_fts_tables``.
    """
    self.db_path = db_path
    indexer = self.indexer
    indexer.initialize_fts_tables(db_path)
|
|
|
|
def rebuild_index(self, db_path: str) -> Dict[str, int]:
    """Rebuild the full text search index by delegating to the indexer.

    Args:
        db_path: Path to the SQLite database, passed through unchanged.

    Returns:
        Whatever the indexer's ``rebuild_index`` returns.
    """
    outcome = self.indexer.rebuild_index(db_path)
    return outcome
|
|
|
|
def search(self,
           db_path: str,
           query: str,
           content_type: str = "all",
           limit: int = 20,
           offset: int = 0) -> List[Dict[str, Any]]:
    """
    Perform full text search.

    The query is first run through the query parser and executed against
    the FTS5 tables. If anything in that path raises, the raw query is
    retried with the LIKE-based fallback search.

    When ``content_type`` is "all", results from files and schemas are
    merged into one list ordered by descending score, and ``limit`` /
    ``offset`` are applied to that merged list. (Applying ``offset`` to
    each source separately, before merging, would skip the wrong rows on
    every page after the first.)

    Args:
        db_path: Path to SQLite database
        query: Search query (supports FTS5 syntax)
        content_type: Type of content to search ("all", "files", "schemas")
        limit: Maximum number of results; a negative value means no limit
        offset: Result offset for pagination

    Returns:
        List of search results with relevance scores
    """
    import logging

    merge_sources = content_type == "all"
    skip = max(offset, 0)

    if merge_sources:
        # Any row that belongs on the requested page of the merged list is
        # necessarily among the first ``skip + limit`` rows of its own
        # source, so fetch that many from each and paginate after merging.
        source_limit = limit + skip if limit >= 0 else -1
        source_offset = 0
    else:
        source_limit, source_offset = limit, offset

    conn = sqlite3.connect(db_path)
    try:
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()

        try:
            # Parse and validate query
            parsed_query = self.query_parser.parse_query(query)

            results: List[Dict[str, Any]] = []
            if content_type in ("all", "files"):
                results.extend(
                    self._search_files(cursor, parsed_query, source_limit, source_offset)
                )
            if content_type in ("all", "schemas"):
                results.extend(
                    self._search_schemas(cursor, parsed_query, source_limit, source_offset)
                )

            # Highest relevance first
            results.sort(key=lambda item: item.get('score', 0), reverse=True)

        except Exception:
            # Deliberate best-effort: any failure on the FTS5 path (parser
            # error, missing FTS tables, FTS5 not compiled in) falls back
            # to a plain LIKE search on the raw query. Leave a trace so
            # the fallback is not completely silent.
            logging.getLogger(__name__).debug(
                "FTS5 search failed; falling back to LIKE search", exc_info=True
            )
            results = self._fallback_search(
                cursor, query, content_type, source_limit, source_offset
            )
    finally:
        conn.close()

    if merge_sources:
        # Global pagination over the merged list; this also caps the
        # fallback path at ``limit`` results.
        end = None if limit < 0 else skip + limit
        results = results[skip:end]

    return results
|
|
|
|
def _search_files(self, cursor: sqlite3.Cursor, query: str, limit: int, offset: int) -> List[Dict[str, Any]]:
    """Search in markdown files using FTS5.

    Args:
        cursor: Cursor on a connection whose rows support access by column
            name (e.g. ``row_factory = sqlite3.Row``).
        query: FTS5 MATCH expression.
        limit: Maximum number of rows to return.
        offset: Number of best-ranked rows to skip.

    Returns:
        One dict per hit, best match first, with keys ``type`` ('file'),
        ``score`` (positive, larger is better), ``file`` (row data with the
        front matter decoded from JSON, or ``{}`` if absent/invalid) and
        ``highlight`` (snippet with ``<mark>`` tags).
    """
    # bm25() is *smaller* (more negative) for better matches, so the best
    # rows come first in ascending order. Sorting descending here would make
    # LIMIT keep the worst matches and drop the best ones.
    cursor.execute("""
        SELECT
            mf.id, mf.filename, mf.content, mf.front_matter, mf.created_at,
            fts.rank, bm25(fts_files) as score,
            snippet(fts_files, 1, '<mark>', '</mark>', '...', 32) as highlight
        FROM fts_files fts
        JOIN markdown_files mf ON mf.id = fts.rowid
        WHERE fts_files MATCH ?
        ORDER BY score ASC
        LIMIT ? OFFSET ?
    """, (query, limit, offset))

    results = []
    for row in cursor.fetchall():
        # Parse front matter; malformed JSON is treated as "no front matter"
        front_matter_raw = {}
        if row['front_matter']:
            try:
                front_matter_raw = json.loads(row['front_matter'])
            except json.JSONDecodeError:
                pass

        results.append({
            'type': 'file',
            # Flip the sign so callers can treat a larger score as better.
            'score': abs(row['score']) if row['score'] else 1.0,
            'file': {
                'id': row['id'],
                'filename': row['filename'],
                'content': row['content'],
                'front_matter_raw': front_matter_raw,
                'created_at': row['created_at']
            },
            'highlight': row['highlight']
        })

    return results
|
|
|
|
def _search_schemas(self, cursor: sqlite3.Cursor, query: str, limit: int, offset: int) -> List[Dict[str, Any]]:
    """Search in schemas using FTS5.

    Args:
        cursor: Cursor on a connection whose rows support access by column
            name (e.g. ``row_factory = sqlite3.Row``).
        query: FTS5 MATCH expression.
        limit: Maximum number of rows to return.
        offset: Number of best-ranked rows to skip.

    Returns:
        One dict per hit, best match first, with keys ``type`` ('schema'),
        ``score`` (positive, larger is better), ``schema`` (row data with
        the schema content decoded from JSON, or ``{}`` if absent/invalid)
        and ``highlight`` (snippet with ``<mark>`` tags).
    """
    # bm25() is *smaller* (more negative) for better matches, so the best
    # rows come first in ascending order. Sorting descending here would make
    # LIMIT keep the worst matches and drop the best ones.
    cursor.execute("""
        SELECT
            s.id, s.filename, s.title, s.description, s.schema_content,
            s.created_at, s.updated_at,
            fts.rank, bm25(fts_schemas) as score,
            snippet(fts_schemas, 1, '<mark>', '</mark>', '...', 32) as highlight
        FROM fts_schemas fts
        JOIN schemas s ON s.id = fts.rowid
        WHERE fts_schemas MATCH ?
        ORDER BY score ASC
        LIMIT ? OFFSET ?
    """, (query, limit, offset))

    results = []
    for row in cursor.fetchall():
        # Parse schema content; malformed JSON is treated as empty
        schema_content = {}
        if row['schema_content']:
            try:
                schema_content = json.loads(row['schema_content'])
            except json.JSONDecodeError:
                pass

        results.append({
            'type': 'schema',
            # Flip the sign so callers can treat a larger score as better.
            'score': abs(row['score']) if row['score'] else 1.0,
            'schema': {
                'id': row['id'],
                'filename': row['filename'],
                'title': row['title'],
                'description': row['description'],
                'schema_content': schema_content,
                'created_at': row['created_at'],
                'updated_at': row['updated_at']
            },
            'highlight': row['highlight']
        })

    return results
|
|
|
|
def _fallback_search(self, cursor: sqlite3.Cursor, query: str, content_type: str, limit: int, offset: int) -> List[Dict[str, Any]]:
    """Fallback to simple LIKE search if FTS5 fails.

    The query is matched as a literal substring: ``%`` and ``_`` typed by
    the user are escaped so they do not act as LIKE wildcards. This keeps
    the SQL match consistent with the literal ``find`` used to build the
    highlight.

    Args:
        cursor: Cursor on a connection whose rows support access by column
            name (e.g. ``row_factory = sqlite3.Row``).
        query: Raw user query, matched as a substring.
        content_type: "all", "files" or "schemas".
        limit: Maximum number of rows fetched per content type.
        offset: Number of rows skipped per content type.

    Returns:
        File hits followed by schema hits. Every hit has ``score`` 1.0
        because LIKE provides no relevance ranking.
    """
    # Backslash is the ESCAPE character; escape it first so the escapes
    # added for % and _ are not themselves doubled.
    literal = (
        query.replace("\\", "\\\\")
             .replace("%", "\\%")
             .replace("_", "\\_")
    )
    pattern = f"%{literal}%"

    results = []

    if content_type in ["all", "files"]:
        # Filename matches sort ahead of content-only matches, newest first.
        cursor.execute("""
            SELECT id, filename, content, front_matter, created_at
            FROM markdown_files
            WHERE filename LIKE ? ESCAPE '\\' OR content LIKE ? ESCAPE '\\'
            ORDER BY
                CASE WHEN filename LIKE ? ESCAPE '\\' THEN 1 ELSE 2 END,
                created_at DESC
            LIMIT ? OFFSET ?
        """, (pattern, pattern, pattern, limit, offset))

        for row in cursor.fetchall():
            # Malformed front matter JSON is treated as "no front matter"
            front_matter_raw = {}
            if row['front_matter']:
                try:
                    front_matter_raw = json.loads(row['front_matter'])
                except json.JSONDecodeError:
                    pass

            results.append({
                'type': 'file',
                'score': 1.0,
                'file': {
                    'id': row['id'],
                    'filename': row['filename'],
                    'content': row['content'],
                    'front_matter_raw': front_matter_raw,
                    'created_at': row['created_at']
                },
                'highlight': self._extract_highlight(row['content'] or '', query)
            })

    if content_type in ["all", "schemas"]:
        cursor.execute("""
            SELECT id, filename, title, description, schema_content, created_at, updated_at
            FROM schemas
            WHERE filename LIKE ? ESCAPE '\\'
               OR title LIKE ? ESCAPE '\\'
               OR description LIKE ? ESCAPE '\\'
            ORDER BY created_at DESC
            LIMIT ? OFFSET ?
        """, (pattern, pattern, pattern, limit, offset))

        for row in cursor.fetchall():
            # Malformed schema JSON is treated as empty
            schema_content = {}
            if row['schema_content']:
                try:
                    schema_content = json.loads(row['schema_content'])
                except json.JSONDecodeError:
                    pass

            results.append({
                'type': 'schema',
                'score': 1.0,
                'schema': {
                    'id': row['id'],
                    'filename': row['filename'],
                    'title': row['title'],
                    'description': row['description'],
                    'schema_content': schema_content,
                    'created_at': row['created_at'],
                    'updated_at': row['updated_at']
                },
                'highlight': self._extract_highlight(row['description'] or '', query)
            })

    return results
|
|
|
|
def _extract_highlight(self, text: str, query: str, max_length: int = 100) -> str:
    """Return a short excerpt of ``text`` around the first match of ``query``.

    Matching is a case-insensitive literal substring search. When the query
    does not occur, the leading ``max_length`` characters are returned
    instead (followed by "..." if the text was cut). When it does occur,
    the excerpt starts ``max_length // 4`` characters before the match and
    runs ``max_length // 2`` characters past its end, with "..." marking
    each side that was truncated. Empty ``text`` or ``query`` yields "".
    """
    if not (text and query):
        return ""

    total = len(text)
    hit = text.lower().find(query.lower())

    if hit < 0:
        # No literal occurrence: fall back to the head of the text.
        if total > max_length:
            return text[:max_length] + "..."
        return text

    # Window around the match, clamped to the text bounds.
    left = max(0, hit - max_length // 4)
    right = min(total, hit + len(query) + max_length // 2)

    pieces = []
    if left > 0:
        pieces.append("...")
    pieces.append(text[left:right])
    if right < total:
        pieces.append("...")
    return "".join(pieces)
|
|
|
|
def get_search_stats(self, db_path: str) -> Dict[str, Any]:
    """Get search index statistics.

    Only the FTS virtual tables themselves are reported. FTS5 also creates
    ordinary backing ("shadow") tables such as ``<name>_data`` and
    ``<name>_idx`` whose names share the ``fts_`` prefix; those are
    excluded so each index is listed and counted once.

    Args:
        db_path: Path to SQLite database

    Returns:
        Dict with ``fts_enabled`` (bool), ``fts_tables`` (sorted list of
        index names) and one ``<table>_documents`` row count per index.
        On a SQLite error, ``fts_enabled`` is False and ``error`` holds a
        short message.
    """
    conn = sqlite3.connect(db_path)
    stats: Dict[str, Any] = {}

    try:
        cursor = conn.cursor()

        # The underscore is escaped so it matches literally instead of
        # acting as LIKE's single-character wildcard.
        cursor.execute("""
            SELECT name FROM sqlite_master
            WHERE type = 'table'
              AND name LIKE 'fts\\_%' ESCAPE '\\'
              AND sql LIKE 'CREATE VIRTUAL TABLE%'
            ORDER BY name
        """)
        fts_tables = [row[0] for row in cursor.fetchall()]

        stats['fts_enabled'] = len(fts_tables) > 0
        stats['fts_tables'] = fts_tables

        for table in fts_tables:
            # Identifiers cannot be bound as parameters; quote the name
            # instead of interpolating it bare.
            quoted = '"' + table.replace('"', '""') + '"'
            cursor.execute(f"SELECT COUNT(*) FROM {quoted}")
            stats[f'{table}_documents'] = cursor.fetchone()[0]

    except sqlite3.Error:
        stats['fts_enabled'] = False
        stats['error'] = "FTS tables not available"

    finally:
        conn.close()

    return stats