Files
markitect-main/markitect/plugins/builtin/search/fts_search.py
tegwick 8179929a4a feat: implement lightweight full text search plugin using SQLite FTS5 (issue #83)
Added comprehensive full text search capabilities as a lightweight plugin.

Key features:
- SQLite FTS5-based search engine with no external dependencies
- Automatic indexing via database triggers for real-time updates
- Advanced query support: phrase search, boolean operators, proximity search
- Complete CLI interface with search commands
- Graceful fallback to LIKE queries when FTS5 unavailable
- Plugin architecture integration for extensibility

CLI Commands:
- `markitect search init` - Initialize search indexes
- `markitect search query` - Perform full text searches
- `markitect search status` - View index statistics
- `markitect search rebuild` - Rebuild indexes from scratch

Search Features:
- Content type filtering (files, schemas, all)
- Result pagination and formatting options
- Query validation and syntax assistance
- Performance optimization and index maintenance

Technical Implementation:
- FTSSearchPlugin: Main search plugin class
- SearchIndexer: FTS5 table management and indexing
- QueryParser: Query optimization and FTS5 syntax conversion
- Comprehensive error handling and fallback mechanisms
- 25 test cases covering all functionality

Documentation includes complete usage guide and examples.

Resolves issue #83: Full text search

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-03 17:03:11 +02:00

307 lines
11 KiB
Python

"""
SQLite FTS5 full text search plugin for MarkiTect.
Provides advanced full text search capabilities using SQLite's built-in
FTS5 virtual table extension for lightweight, high-performance search.
"""
import json
import logging
import sqlite3
from pathlib import Path
from typing import Dict, Any, List, Optional, Tuple

from ...base import BasePlugin, PluginMetadata, PluginType
from ...decorators import register_plugin
from .indexer import SearchIndexer
from .query_parser import QueryParser
@register_plugin("fts_search")
class FTSSearchPlugin(BasePlugin):
    """Full text search plugin backed by SQLite FTS5.

    Searches the ``fts_files`` and ``fts_schemas`` FTS5 tables (joined to
    ``markdown_files`` and ``schemas`` respectively) and falls back to plain
    ``LIKE`` matching on the base tables when the FTS query cannot be run.
    """

    # Suffixes of the shadow tables SQLite creates alongside every FTS5
    # virtual table; they live in sqlite_master as ordinary tables and must
    # not be reported as search indexes.
    _FTS5_SHADOW_SUFFIXES = ("_data", "_idx", "_content", "_docsize", "_config")

    def __init__(self):
        super().__init__()
        self.indexer = SearchIndexer()
        self.query_parser = QueryParser()

    @property
    def metadata(self) -> PluginMetadata:
        """Static descriptor identifying this plugin."""
        return PluginMetadata(
            name="fts_search",
            version="1.0.0",
            description="Full text search using SQLite FTS5",
            author="MarkiTect Team",
            plugin_type=PluginType.EXTENSION
        )

    def initialize(self, db_path: str) -> None:
        """Initialize FTS5 search tables and indexes.

        Remembers *db_path* on the instance and delegates table creation to
        the indexer.
        """
        self.db_path = db_path
        self.indexer.initialize_fts_tables(db_path)

    def rebuild_index(self, db_path: str) -> Dict[str, int]:
        """Rebuild the full text search index.

        Returns whatever the indexer's ``rebuild_index`` reports for
        *db_path*.
        """
        return self.indexer.rebuild_index(db_path)

    def search(self,
               db_path: str,
               query: str,
               content_type: str = "all",
               limit: int = 20,
               offset: int = 0) -> List[Dict[str, Any]]:
        """
        Perform full text search.

        Args:
            db_path: Path to SQLite database
            query: Search query (supports FTS5 syntax)
            content_type: Type of content to search ("all", "files", "schemas");
                any other value yields an empty result list
            limit: Maximum number of results (expected to be non-negative)
            offset: Result offset for pagination (expected to be non-negative)

        Returns:
            List of search results with relevance scores, best match first.
            If the FTS5 path raises, results come from a LIKE-based fallback
            and all carry a score of 1.0.
        """
        conn = sqlite3.connect(db_path)
        conn.row_factory = sqlite3.Row
        try:
            cursor = conn.cursor()
            try:
                results = self._fts_search(cursor, query, content_type, limit, offset)
            except Exception:
                # Deliberate best-effort: any failure on the FTS path (FTS5
                # missing, bad query syntax, parser error) degrades to LIKE.
                # Record the reason so the degradation is diagnosable.
                logging.getLogger(__name__).debug(
                    "FTS5 search failed; falling back to LIKE search",
                    exc_info=True,
                )
                results = self._fallback_search(cursor, query, content_type, limit, offset)
        finally:
            conn.close()
        return results

    def _fts_search(self, cursor: sqlite3.Cursor, query: str, content_type: str,
                    limit: int, offset: int) -> List[Dict[str, Any]]:
        """Run the FTS5 query against the requested sources and page the result."""
        parsed_query = self.query_parser.parse_query(query)
        fetch_limit, fetch_offset = self._source_window(content_type, limit, offset)

        results: List[Dict[str, Any]] = []
        if content_type in ("all", "files"):
            results.extend(self._search_files(cursor, parsed_query, fetch_limit, fetch_offset))
        if content_type in ("all", "schemas"):
            results.extend(self._search_schemas(cursor, parsed_query, fetch_limit, fetch_offset))

        # Result scores are abs(bm25), so larger means more relevant.
        results.sort(key=lambda item: item.get('score', 0), reverse=True)
        if content_type == "all":
            # The page is cut from the merged ranking, not from each source.
            results = results[offset:offset + limit]
        return results

    @staticmethod
    def _source_window(content_type: str, limit: int, offset: int) -> Tuple[int, int]:
        """Return the (LIMIT, OFFSET) to apply to each per-source query.

        When several sources are merged, each must supply its first
        ``limit + offset`` rows so the merged list can be paged correctly;
        a single source can be paged directly in SQL.
        """
        if content_type == "all":
            return limit + offset, 0
        return limit, offset

    @staticmethod
    def _load_json(raw: Any) -> Any:
        """Decode a JSON column value, returning ``{}`` when empty or invalid."""
        if not raw:
            return {}
        try:
            return json.loads(raw)
        except json.JSONDecodeError:
            return {}

    @classmethod
    def _file_result(cls, row: sqlite3.Row, score: float, highlight: str) -> Dict[str, Any]:
        """Build the result dict for one ``markdown_files`` row."""
        return {
            'type': 'file',
            'score': score,
            'file': {
                'id': row['id'],
                'filename': row['filename'],
                'content': row['content'],
                'front_matter_raw': cls._load_json(row['front_matter']),
                'created_at': row['created_at']
            },
            'highlight': highlight
        }

    @classmethod
    def _schema_result(cls, row: sqlite3.Row, score: float, highlight: str) -> Dict[str, Any]:
        """Build the result dict for one ``schemas`` row."""
        return {
            'type': 'schema',
            'score': score,
            'schema': {
                'id': row['id'],
                'filename': row['filename'],
                'title': row['title'],
                'description': row['description'],
                'schema_content': cls._load_json(row['schema_content']),
                'created_at': row['created_at'],
                'updated_at': row['updated_at']
            },
            'highlight': highlight
        }

    def _search_files(self, cursor: sqlite3.Cursor, query: str, limit: int, offset: int) -> List[Dict[str, Any]]:
        """Search in markdown files using FTS5."""
        # bm25() assigns numerically lower values to better matches, so the
        # ordering must be ascending for LIMIT to keep the best rows.
        cursor.execute("""
            SELECT
                mf.id, mf.filename, mf.content, mf.front_matter, mf.created_at,
                fts.rank, bm25(fts_files) as score,
                snippet(fts_files, 1, '<mark>', '</mark>', '...', 32) as highlight
            FROM fts_files fts
            JOIN markdown_files mf ON mf.id = fts.rowid
            WHERE fts_files MATCH ?
            ORDER BY score ASC
            LIMIT ? OFFSET ?
        """, (query, limit, offset))
        return [
            self._file_result(
                row,
                abs(row['score']) if row['score'] else 1.0,
                row['highlight'],
            )
            for row in cursor.fetchall()
        ]

    def _search_schemas(self, cursor: sqlite3.Cursor, query: str, limit: int, offset: int) -> List[Dict[str, Any]]:
        """Search in schemas using FTS5."""
        # Ascending bm25 order puts the best matches first (see _search_files).
        cursor.execute("""
            SELECT
                s.id, s.filename, s.title, s.description, s.schema_content,
                s.created_at, s.updated_at,
                fts.rank, bm25(fts_schemas) as score,
                snippet(fts_schemas, 1, '<mark>', '</mark>', '...', 32) as highlight
            FROM fts_schemas fts
            JOIN schemas s ON s.id = fts.rowid
            WHERE fts_schemas MATCH ?
            ORDER BY score ASC
            LIMIT ? OFFSET ?
        """, (query, limit, offset))
        return [
            self._schema_result(
                row,
                abs(row['score']) if row['score'] else 1.0,
                row['highlight'],
            )
            for row in cursor.fetchall()
        ]

    @staticmethod
    def _like_pattern(query: str) -> str:
        """Return a ``%query%`` LIKE pattern with *query* matched literally.

        ``%`` and ``_`` in user input would otherwise act as wildcards; they
        are escaped with a backslash, which the SQL declares via ESCAPE.
        """
        escaped = (
            query.replace("\\", "\\\\")
            .replace("%", "\\%")
            .replace("_", "\\_")
        )
        return f"%{escaped}%"

    def _fallback_search(self, cursor: sqlite3.Cursor, query: str, content_type: str, limit: int, offset: int) -> List[Dict[str, Any]]:
        """Fallback to simple LIKE search if FTS5 fails.

        Every hit gets a score of 1.0. Files are ordered with filename
        matches first, then newest first; schemas newest first. For
        ``content_type == "all"`` files precede schemas and the page is cut
        from the combined list.
        """
        pattern = self._like_pattern(query)
        fetch_limit, fetch_offset = self._source_window(content_type, limit, offset)
        results: List[Dict[str, Any]] = []

        if content_type in ("all", "files"):
            cursor.execute("""
                SELECT id, filename, content, front_matter, created_at
                FROM markdown_files
                WHERE filename LIKE ? ESCAPE '\\' OR content LIKE ? ESCAPE '\\'
                ORDER BY
                    CASE WHEN filename LIKE ? ESCAPE '\\' THEN 1 ELSE 2 END,
                    created_at DESC
                LIMIT ? OFFSET ?
            """, (pattern, pattern, pattern, fetch_limit, fetch_offset))
            results.extend(
                self._file_result(
                    row, 1.0, self._extract_highlight(row['content'] or '', query)
                )
                for row in cursor.fetchall()
            )

        if content_type in ("all", "schemas"):
            cursor.execute("""
                SELECT id, filename, title, description, schema_content, created_at, updated_at
                FROM schemas
                WHERE filename LIKE ? ESCAPE '\\'
                   OR title LIKE ? ESCAPE '\\'
                   OR description LIKE ? ESCAPE '\\'
                ORDER BY created_at DESC
                LIMIT ? OFFSET ?
            """, (pattern, pattern, pattern, fetch_limit, fetch_offset))
            results.extend(
                self._schema_result(
                    row, 1.0, self._extract_highlight(row['description'] or '', query)
                )
                for row in cursor.fetchall()
            )

        if content_type == "all":
            results = results[offset:offset + limit]
        return results

    def _extract_highlight(self, text: str, query: str, max_length: int = 100) -> str:
        """Extract a snippet of *text* around the first occurrence of *query*.

        The match is case-insensitive. When *query* does not occur, the
        leading ``max_length`` characters of *text* are returned instead.
        An ellipsis marks each side on which the text was cut. Returns ""
        when either argument is empty.
        """
        if not text or not query:
            return ""

        start = text.lower().find(query.lower())
        if start == -1:
            return (text[:max_length] + "...") if len(text) > max_length else text

        # Keep a quarter of the budget before the match and half after it.
        snippet_start = max(0, start - max_length // 4)
        snippet_end = min(len(text), start + len(query) + max_length // 2)
        snippet = text[snippet_start:snippet_end]

        if snippet_start > 0:
            snippet = "..." + snippet
        if snippet_end < len(text):
            snippet = snippet + "..."
        return snippet

    @classmethod
    def _drop_shadow_tables(cls, names: List[str]) -> List[str]:
        """Remove FTS5 shadow tables (``<index>_data`` etc.) from *names*.

        A name is treated as a shadow table only when stripping a known
        shadow suffix leaves the name of another table in the list.
        """
        known = set(names)

        def is_shadow(name: str) -> bool:
            return any(
                name.endswith(suffix) and name[:-len(suffix)] in known
                for suffix in cls._FTS5_SHADOW_SUFFIXES
            )

        return [name for name in names if not is_shadow(name)]

    def get_search_stats(self, db_path: str) -> Dict[str, Any]:
        """Get search index statistics.

        Returns a dict with ``fts_enabled``, ``fts_tables`` and one
        ``<table>_documents`` row count per index table. On any SQLite
        error ``fts_enabled`` is False and ``error`` holds a message.
        """
        conn = sqlite3.connect(db_path)
        stats: Dict[str, Any] = {}
        try:
            cursor = conn.cursor()
            # "_" is a LIKE wildcard, so it is escaped to require a literal
            # "fts_" prefix.
            cursor.execute("""
                SELECT name FROM sqlite_master
                WHERE type='table' AND name LIKE 'fts\\_%' ESCAPE '\\'
            """)
            candidates = [row[0] for row in cursor.fetchall()]
            fts_tables = self._drop_shadow_tables(candidates)

            stats['fts_enabled'] = len(fts_tables) > 0
            stats['fts_tables'] = fts_tables

            for table in fts_tables:
                # Identifiers cannot be bound as parameters; quote instead.
                quoted = '"' + table.replace('"', '""') + '"'
                cursor.execute(f"SELECT COUNT(*) FROM {quoted}")
                stats[f'{table}_documents'] = cursor.fetchone()[0]
        except sqlite3.Error:
            stats['fts_enabled'] = False
            stats['error'] = "FTS tables not available"
        finally:
            conn.close()
        return stats