feat: implement lightweight full text search plugin using SQLite FTS5 (issue #83)

Added comprehensive full text search capabilities as a lightweight plugin.

Key features:
- SQLite FTS5-based search engine with no external dependencies
- Automatic indexing via database triggers for real-time updates
- Advanced query support: phrase search, boolean operators, proximity search
- Complete CLI interface with search commands
- Graceful fallback to LIKE queries when FTS5 is unavailable
- Plugin architecture integration for extensibility

CLI Commands:
- `markitect search init` - Initialize search indexes
- `markitect search query` - Perform full text searches
- `markitect search status` - View index statistics
- `markitect search rebuild` - Rebuild indexes from scratch

Search Features:
- Content type filtering (files, schemas, all)
- Result pagination and formatting options
- Query validation and syntax assistance
- Performance optimization and index maintenance

Technical Implementation:
- FTSSearchPlugin: Main search plugin class
- SearchIndexer: FTS5 table management and indexing
- QueryParser: Query optimization and FTS5 syntax conversion
- Comprehensive error handling and fallback mechanisms
- 25 test cases covering all functionality

Documentation includes complete usage guide and examples.

Resolves issue #83: Full text search

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2025-10-03 17:03:11 +02:00
parent 2a15dde228
commit 8179929a4a
7 changed files with 1994 additions and 0 deletions

View File

@@ -0,0 +1,12 @@
"""
Full text search plugin for MarkiTect using SQLite FTS5.
Provides lightweight, high-performance full text search capabilities
as a plugin to the MarkiTect system.
"""
from .fts_search import FTSSearchPlugin
from .indexer import SearchIndexer
from .query_parser import QueryParser
__all__ = ['FTSSearchPlugin', 'SearchIndexer', 'QueryParser']

View File

@@ -0,0 +1,307 @@
"""
SQLite FTS5 full text search plugin for MarkiTect.
Provides advanced full text search capabilities using SQLite's built-in
FTS5 virtual table extension for lightweight, high-performance search.
"""
import sqlite3
import json
from typing import Dict, Any, List, Optional, Tuple
from pathlib import Path
from ...base import BasePlugin, PluginMetadata, PluginType
from ...decorators import register_plugin
from .indexer import SearchIndexer
from .query_parser import QueryParser
@register_plugin("fts_search")
class FTSSearchPlugin(BasePlugin):
    """Full Text Search plugin using SQLite FTS5.

    Searches the ``markdown_files`` and ``schemas`` tables through the
    ``fts_files`` / ``fts_schemas`` FTS5 indexes and falls back to plain
    ``LIKE`` matching when the FTS5 query cannot be executed.
    """

    def __init__(self):
        super().__init__()
        self.indexer = SearchIndexer()
        self.query_parser = QueryParser()

    @property
    def metadata(self) -> PluginMetadata:
        """Return the static metadata describing this plugin."""
        return PluginMetadata(
            name="fts_search",
            version="1.0.0",
            description="Full text search using SQLite FTS5",
            author="MarkiTect Team",
            plugin_type=PluginType.EXTENSION
        )

    def initialize(self, db_path: str) -> None:
        """Initialize FTS5 search tables and indexes."""
        self.db_path = db_path
        self.indexer.initialize_fts_tables(db_path)

    def rebuild_index(self, db_path: str) -> Dict[str, int]:
        """Rebuild the full text search index."""
        return self.indexer.rebuild_index(db_path)

    def search(self,
               db_path: str,
               query: str,
               content_type: str = "all",
               limit: int = 20,
               offset: int = 0) -> List[Dict[str, Any]]:
        """
        Perform full text search.

        Args:
            db_path: Path to SQLite database
            query: Search query (supports FTS5 syntax)
            content_type: Type of content to search ("all", "files", "schemas")
            limit: Maximum number of results
            offset: Result offset for pagination

        Returns:
            List of search results with relevance scores, best match first.
            An empty or whitespace-only query yields an empty list.
        """
        # An empty query cannot be expressed in FTS5, and in the LIKE
        # fallback it would degenerate to '%%' and match every row.
        if not query or not query.strip():
            return []
        query = query.strip()

        fetch_limit, fetch_offset = self._page_window(content_type, limit, offset)

        conn = sqlite3.connect(db_path)
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()
        results: List[Dict[str, Any]] = []
        try:
            # Parse and validate query
            parsed_query = self.query_parser.parse_query(query)
            if content_type in ("all", "files"):
                results.extend(
                    self._search_files(cursor, parsed_query, fetch_limit, fetch_offset))
            if content_type in ("all", "schemas"):
                results.extend(
                    self._search_schemas(cursor, parsed_query, fetch_limit, fetch_offset))
            # Scores are stored as abs(bm25), so larger means more relevant.
            results.sort(key=lambda item: item.get('score', 0), reverse=True)
            if content_type == "all":
                # Paginate the merged list; each table was fetched from
                # row 0 so that the global page is cut in one place.
                results = results[offset:offset + limit]
        except Exception:
            # Deliberate best effort: any failure on the FTS5 path (missing
            # tables, FTS5 not compiled in, query syntax error) falls back
            # to a simple LIKE search instead of failing the search.
            results = self._fallback_search(cursor, query, content_type, limit, offset)
        finally:
            conn.close()
        return results

    @staticmethod
    def _page_window(content_type: str, limit: int, offset: int) -> Tuple[int, int]:
        """Return the (LIMIT, OFFSET) pair to use for each per-table query.

        When both tables are searched, the first ``limit + offset`` rows of
        each are fetched so the merged list can be paginated globally.
        """
        if content_type == "all":
            return limit + offset, 0
        return limit, offset

    @staticmethod
    def _load_json(raw: Any) -> Any:
        """Decode a JSON column value; return {} when empty or invalid."""
        if not raw:
            return {}
        try:
            return json.loads(raw)
        except json.JSONDecodeError:
            return {}

    @staticmethod
    def _like_pattern(query: str) -> str:
        """Build a '%...%' LIKE pattern with wildcards in *query* escaped.

        The pattern is meant to be used together with ``ESCAPE '\\'``.
        """
        escaped = (query.replace('\\', '\\\\')
                        .replace('%', '\\%')
                        .replace('_', '\\_'))
        return f"%{escaped}%"

    def _file_result(self, row: sqlite3.Row, score: float, highlight: Any) -> Dict[str, Any]:
        """Build the result dictionary for one ``markdown_files`` row."""
        return {
            'type': 'file',
            'score': score,
            'file': {
                'id': row['id'],
                'filename': row['filename'],
                'content': row['content'],
                'front_matter_raw': self._load_json(row['front_matter']),
                'created_at': row['created_at']
            },
            'highlight': highlight
        }

    def _schema_result(self, row: sqlite3.Row, score: float, highlight: Any) -> Dict[str, Any]:
        """Build the result dictionary for one ``schemas`` row."""
        return {
            'type': 'schema',
            'score': score,
            'schema': {
                'id': row['id'],
                'filename': row['filename'],
                'title': row['title'],
                'description': row['description'],
                'schema_content': self._load_json(row['schema_content']),
                'created_at': row['created_at'],
                'updated_at': row['updated_at']
            },
            'highlight': highlight
        }

    def _search_files(self, cursor: sqlite3.Cursor, query: str, limit: int, offset: int) -> List[Dict[str, Any]]:
        """Search in markdown files using FTS5."""
        # bm25() returns lower (more negative) values for better matches,
        # so ascending order puts the most relevant rows first.
        cursor.execute("""
            SELECT
                mf.id, mf.filename, mf.content, mf.front_matter, mf.created_at,
                fts.rank, bm25(fts_files) as score,
                snippet(fts_files, 1, '<mark>', '</mark>', '...', 32) as highlight
            FROM fts_files fts
            JOIN markdown_files mf ON mf.id = fts.rowid
            WHERE fts_files MATCH ?
            ORDER BY score
            LIMIT ? OFFSET ?
        """, (query, limit, offset))
        return [
            self._file_result(
                row,
                abs(row['score']) if row['score'] else 1.0,
                row['highlight'],
            )
            for row in cursor.fetchall()
        ]

    def _search_schemas(self, cursor: sqlite3.Cursor, query: str, limit: int, offset: int) -> List[Dict[str, Any]]:
        """Search in schemas using FTS5."""
        # Ascending bm25() order: see _search_files.
        cursor.execute("""
            SELECT
                s.id, s.filename, s.title, s.description, s.schema_content,
                s.created_at, s.updated_at,
                fts.rank, bm25(fts_schemas) as score,
                snippet(fts_schemas, 1, '<mark>', '</mark>', '...', 32) as highlight
            FROM fts_schemas fts
            JOIN schemas s ON s.id = fts.rowid
            WHERE fts_schemas MATCH ?
            ORDER BY score
            LIMIT ? OFFSET ?
        """, (query, limit, offset))
        return [
            self._schema_result(
                row,
                abs(row['score']) if row['score'] else 1.0,
                row['highlight'],
            )
            for row in cursor.fetchall()
        ]

    def _fallback_search(self, cursor: sqlite3.Cursor, query: str, content_type: str, limit: int, offset: int) -> List[Dict[str, Any]]:
        """Fallback to simple LIKE search if FTS5 fails.

        The query text is matched literally (``%`` and ``_`` are escaped).
        Every result gets the constant score 1.0; files are listed before
        schemas, and the combined list is paginated when both are searched.
        """
        pattern = self._like_pattern(query)
        fetch_limit, fetch_offset = self._page_window(content_type, limit, offset)
        results: List[Dict[str, Any]] = []

        if content_type in ("all", "files"):
            cursor.execute("""
                SELECT id, filename, content, front_matter, created_at
                FROM markdown_files
                WHERE filename LIKE ? ESCAPE '\\' OR content LIKE ? ESCAPE '\\'
                ORDER BY
                    CASE WHEN filename LIKE ? ESCAPE '\\' THEN 1 ELSE 2 END,
                    created_at DESC
                LIMIT ? OFFSET ?
            """, (pattern, pattern, pattern, fetch_limit, fetch_offset))
            for row in cursor.fetchall():
                results.append(self._file_result(
                    row, 1.0, self._extract_highlight(row['content'] or '', query)))

        if content_type in ("all", "schemas"):
            cursor.execute("""
                SELECT id, filename, title, description, schema_content, created_at, updated_at
                FROM schemas
                WHERE filename LIKE ? ESCAPE '\\'
                   OR title LIKE ? ESCAPE '\\'
                   OR description LIKE ? ESCAPE '\\'
                ORDER BY created_at DESC
                LIMIT ? OFFSET ?
            """, (pattern, pattern, pattern, fetch_limit, fetch_offset))
            for row in cursor.fetchall():
                results.append(self._schema_result(
                    row, 1.0, self._extract_highlight(row['description'] or '', query)))

        if content_type == "all":
            results = results[offset:offset + limit]
        return results

    def _extract_highlight(self, text: str, query: str, max_length: int = 100) -> str:
        """Extract a snippet of *text* around the first occurrence of *query*.

        The match is case-insensitive. When *query* does not occur, the
        first ``max_length`` characters of *text* are returned instead.
        """
        if not text or not query:
            return ""
        # Find the first occurrence
        start = text.lower().find(query.lower())
        if start == -1:
            return text[:max_length] + "..." if len(text) > max_length else text
        # Keep a quarter of the budget before the match and half after it
        snippet_start = max(0, start - max_length // 4)
        snippet_end = min(len(text), start + len(query) + max_length // 2)
        snippet = text[snippet_start:snippet_end]
        # Add ellipsis if truncated
        if snippet_start > 0:
            snippet = "..." + snippet
        if snippet_end < len(text):
            snippet = snippet + "..."
        return snippet

    def get_search_stats(self, db_path: str) -> Dict[str, Any]:
        """Get search index statistics.

        Returns a dictionary with ``fts_enabled``, ``fts_tables`` and one
        ``<table>_documents`` row count per FTS5 table. On a database
        error, ``fts_enabled`` is False and ``error`` is set.
        """
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()
        stats: Dict[str, Any] = {}
        try:
            # Only genuine FTS5 virtual tables count. A bare
            # "name LIKE 'fts_%'" would also match the FTS5 shadow tables
            # (fts_files_data, ...) and the fts_status failure marker.
            cursor.execute("""
                SELECT name FROM sqlite_master
                WHERE type = 'table'
                  AND sql LIKE 'CREATE VIRTUAL TABLE%fts5%'
                ORDER BY name
            """)
            fts_tables = [row[0] for row in cursor.fetchall()
                          if row[0].startswith('fts_')]
            stats['fts_enabled'] = len(fts_tables) > 0
            stats['fts_tables'] = fts_tables
            # Get index statistics
            for table in fts_tables:
                # Quote the identifier: it comes from the database itself.
                quoted = '"' + table.replace('"', '""') + '"'
                cursor.execute(f"SELECT COUNT(*) FROM {quoted}")
                stats[f'{table}_documents'] = cursor.fetchone()[0]
        except sqlite3.Error:
            stats['fts_enabled'] = False
            stats['error'] = "FTS tables not available"
        finally:
            conn.close()
        return stats

View File

@@ -0,0 +1,225 @@
"""
Search indexing functionality using SQLite FTS5.
Handles creating and maintaining full text search indexes for MarkiTect content.
"""
import sqlite3
import json
from typing import Dict, Any, Optional
from pathlib import Path
class SearchIndexer:
    """Manages full text search indexes using SQLite FTS5.

    Two external-content FTS5 tables are maintained: ``fts_files`` over
    ``markdown_files`` and ``fts_schemas`` over ``schemas``. Triggers on
    the source tables keep the indexes synchronized.
    """

    # (FTS5 table, source table, indexed columns, key used in rebuild stats)
    _INDEX_SPECS = (
        ('fts_files', 'markdown_files',
         ('filename', 'content', 'front_matter'), 'files_indexed'),
        ('fts_schemas', 'schemas',
         ('filename', 'title', 'description'), 'schemas_indexed'),
    )

    def initialize_fts_tables(self, db_path: str) -> None:
        """Initialize FTS5 virtual tables for full text search.

        On failure (for example when FTS5 is not available) a single row
        describing the error is stored in the ``fts_status`` table instead
        of raising.
        """
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()
        try:
            for fts_table, source_table, columns, _stats_key in self._INDEX_SPECS:
                column_list = ', '.join(columns)
                cursor.execute(
                    f"CREATE VIRTUAL TABLE IF NOT EXISTS {fts_table} USING fts5("
                    f"{column_list}, content='{source_table}', content_rowid='id')"
                )
            # Create triggers to keep FTS5 indexes synchronized
            self._create_fts_triggers(cursor)
            # A failure marker left by an earlier attempt is stale now.
            cursor.execute("DROP TABLE IF EXISTS fts_status")
            conn.commit()
        except sqlite3.Error as e:
            # If FTS5 is not available, create a fallback indicator
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS fts_status (
                    fts_enabled INTEGER DEFAULT 0,
                    error_message TEXT
                )
            """)
            # The table has no key, so INSERT OR REPLACE would append a
            # new row on every failure; keep exactly one status row.
            cursor.execute("DELETE FROM fts_status")
            cursor.execute(
                "INSERT INTO fts_status (fts_enabled, error_message) VALUES (0, ?)",
                (str(e),),
            )
            conn.commit()
        finally:
            conn.close()

    def _create_fts_triggers(self, cursor: sqlite3.Cursor) -> None:
        """Create triggers to automatically maintain FTS5 indexes.

        For each source table three triggers are created:
        ``<fts>_insert``, ``<fts>_delete`` and ``<fts>_update``.
        """
        for fts_table, source_table, columns, _stats_key in self._INDEX_SPECS:
            column_list = ', '.join(columns)
            new_values = ', '.join(f'new.{column}' for column in columns)
            old_values = ', '.join(f'old.{column}' for column in columns)
            add_row = (
                f"INSERT INTO {fts_table}(rowid, {column_list}) "
                f"VALUES (new.id, {new_values});"
            )
            # External-content tables are told what to remove through the
            # special 'delete' command together with the old column values.
            remove_row = (
                f"INSERT INTO {fts_table}({fts_table}, rowid, {column_list}) "
                f"VALUES ('delete', old.id, {old_values});"
            )
            cursor.execute(
                f"CREATE TRIGGER IF NOT EXISTS {fts_table}_insert "
                f"AFTER INSERT ON {source_table} BEGIN {add_row} END"
            )
            cursor.execute(
                f"CREATE TRIGGER IF NOT EXISTS {fts_table}_delete "
                f"AFTER DELETE ON {source_table} BEGIN {remove_row} END"
            )
            cursor.execute(
                f"CREATE TRIGGER IF NOT EXISTS {fts_table}_update "
                f"AFTER UPDATE ON {source_table} BEGIN {remove_row} {add_row} END"
            )

    def rebuild_index(self, db_path: str) -> Dict[str, int]:
        """Rebuild the full text search index from scratch.

        Returns:
            ``{'files_indexed': n, 'schemas_indexed': m}``. On a database
            error the transaction is rolled back, both counts stay 0 and
            an ``'error'`` entry holds the message.
        """
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()
        stats = {'files_indexed': 0, 'schemas_indexed': 0}
        try:
            counts = {}
            for fts_table, source_table, _columns, stats_key in self._INDEX_SPECS:
                # 'rebuild' discards the whole index and re-reads the
                # content table. A plain DELETE on an external-content
                # table looks rows up in the content table, so entries
                # whose source row is gone would never be removed.
                cursor.execute(
                    f"INSERT INTO {fts_table}({fts_table}) VALUES('rebuild')")
                cursor.execute(f"SELECT COUNT(*) FROM {source_table}")
                counts[stats_key] = cursor.fetchone()[0]
            # Optimize the FTS5 indexes
            for fts_table, _source, _columns, _stats_key in self._INDEX_SPECS:
                cursor.execute(
                    f"INSERT INTO {fts_table}({fts_table}) VALUES('optimize')")
            conn.commit()
            # Report counts only once the rebuild is actually committed.
            stats.update(counts)
        except sqlite3.Error as e:
            stats['error'] = str(e)
            conn.rollback()
        finally:
            conn.close()
        return stats

    def optimize_index(self, db_path: str) -> None:
        """Optimize FTS5 indexes for better performance (best effort)."""
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()
        try:
            for fts_table, _source, _columns, _stats_key in self._INDEX_SPECS:
                cursor.execute(
                    f"INSERT INTO {fts_table}({fts_table}) VALUES('optimize')")
            conn.commit()
        except sqlite3.Error:
            # Optimization is optional maintenance; a failure (for example
            # missing FTS tables) is deliberately not reported.
            pass
        finally:
            conn.close()

    def get_index_info(self, db_path: str) -> Dict[str, Any]:
        """Get information about the current search indexes.

        Returns a dictionary with ``fts_tables``, ``fts_enabled`` and, when
        enabled, ``<table>_count`` entries plus ``integrity_check``. On a
        database error, ``error`` is set and ``fts_enabled`` is False.
        """
        conn = sqlite3.connect(db_path)
        cursor = conn.cursor()
        info: Dict[str, Any] = {}
        try:
            # Only genuine FTS5 virtual tables count. A bare
            # "name LIKE 'fts_%'" would also match the FTS5 shadow tables
            # and the fts_status failure marker.
            cursor.execute("""
                SELECT name FROM sqlite_master
                WHERE type = 'table'
                  AND sql LIKE 'CREATE VIRTUAL TABLE%fts5%'
                ORDER BY name
            """)
            fts_tables = [row[0] for row in cursor.fetchall()
                          if row[0].startswith('fts_')]
            info['fts_tables'] = fts_tables
            info['fts_enabled'] = len(fts_tables) > 0
            if info['fts_enabled']:
                known_tables = [spec[0] for spec in self._INDEX_SPECS
                                if spec[0] in fts_tables]
                # Get document counts
                for table in known_tables:
                    cursor.execute(f"SELECT COUNT(*) FROM {table}")
                    info[f'{table}_count'] = cursor.fetchone()[0]
                # Get FTS5 integrity check
                try:
                    for table in known_tables:
                        cursor.execute(
                            f"INSERT INTO {table}({table}) VALUES('integrity-check')")
                    info['integrity_check'] = 'passed'
                except sqlite3.Error as e:
                    info['integrity_check'] = f'failed: {str(e)}'
        except sqlite3.Error as e:
            info['error'] = str(e)
            info['fts_enabled'] = False
        finally:
            conn.close()
        return info

    def check_fts_availability(self, db_path: str) -> bool:
        """Check if FTS5 is available in SQLite.

        The probe runs against a private in-memory database, so the
        database at *db_path* is neither created nor modified. *db_path*
        is accepted for interface compatibility only.
        """
        conn = sqlite3.connect(':memory:')
        try:
            conn.execute("CREATE VIRTUAL TABLE fts_probe USING fts5(content)")
            return True
        except sqlite3.Error:
            return False
        finally:
            conn.close()

View File

@@ -0,0 +1,273 @@
"""
Query parsing and processing for FTS5 full text search.
Handles converting user queries into FTS5-compatible syntax and provides
query validation and enhancement features.
"""
import re
from typing import List, Dict, Any, Optional, Tuple
class QueryParser:
    """Parses and processes search queries for FTS5.

    Free-text input is converted into an FTS5 expression (terms joined
    with AND, prefix matching for plain words); input that already uses
    FTS5 syntax is passed through unchanged.
    """

    # An FTS5 bareword may contain only ASCII letters and digits, the
    # underscore, the substitute character (0x1A) and non-ASCII characters.
    # Anything else has to be written as a double-quoted string.
    _BAREWORD_RE = re.compile(r'[0-9A-Za-z_\x1a\x80-\U0010ffff]+')
    # Stand-in token used while quoted phrases are protected from splitting.
    _PLACEHOLDER_RE = re.compile(r'__PHRASE_(\d+)__')
    # Barewords FTS5 reserves as boolean operators (case sensitive).
    _BOOLEAN_KEYWORDS = ('AND', 'OR', 'NOT')

    def __init__(self):
        # FTS5 operators and syntax
        self.fts_operators = ['AND', 'OR', 'NOT', 'NEAR']
        self.fts_special_chars = ['"', '*', '^', '(', ')']

    def parse_query(self, query: str) -> str:
        """
        Parse and convert user query to FTS5-compatible syntax.

        Args:
            query: Raw user search query

        Returns:
            FTS5-compatible query string ("" for an empty query)
        """
        if not query or not query.strip():
            return ""
        # Clean and normalize the query
        query = query.strip()
        # If query is already using FTS5 syntax, return as-is
        if self._is_fts5_query(query):
            return query
        # Convert natural language query to FTS5
        return self._convert_to_fts5(query)

    def _is_fts5_query(self, query: str) -> bool:
        """Check if query already uses FTS5 syntax."""
        # FTS5 only recognizes upper-case operators; a lower-case "and" is
        # ordinary text and must still go through the conversion, which
        # turns it into a real operator.
        if any(f' {operator} ' in query for operator in self.fts_operators):
            return True
        # Quoted phrases, prefix matching and column specifications
        return any(marker in query for marker in ('"', '*', ':'))

    def _convert_to_fts5(self, query: str) -> str:
        """Convert natural language query to FTS5 syntax.

        Plain alphanumeric words of three or more characters get a prefix
        wildcard, adjacent terms are joined with AND, and "not x" / "- x"
        becomes the binary FTS5 form "... NOT x".
        """
        # Handle quoted phrases - preserve them
        phrases: List[str] = []

        def preserve_phrase(match):
            phrases.append(match.group(0))
            # Padding keeps the placeholder a token of its own even when
            # the phrase was glued to neighbouring text.
            return f" __PHRASE_{len(phrases) - 1}__ "

        def restore_phrase(token):
            found = self._PLACEHOLDER_RE.fullmatch(token)
            if found and int(found.group(1)) < len(phrases):
                return phrases[int(found.group(1))]
            return None

        query = re.sub(r'"([^"]*)"', preserve_phrase, query)
        # Split into words, preserving operators
        words = self._tokenize_query(query)

        # Each part is (text, kind) with kind in term / operator / negation.
        parts = []
        index = 0
        while index < len(words):
            word = words[index].strip()
            index += 1
            if not word:
                continue
            # Restore preserved phrases
            phrase = restore_phrase(word)
            if phrase is not None:
                parts.append((phrase, 'term'))
                continue
            # Handle negation (convert "not" to NOT)
            if word.lower() in ('not', '-') and index < len(words):
                operand = words[index].strip()
                if operand and operand.upper() not in self.fts_operators:
                    operand_text = restore_phrase(operand)
                    if operand_text is None:
                        operand_text = self._escape_term(operand)
                    parts.append((f'NOT {operand_text}', 'negation'))
                    index += 1
                    continue
            # Handle AND/OR operators
            if word.upper() in self.fts_operators:
                parts.append((word.upper(), 'operator'))
                continue
            # Handle prefix matching (add * for partial matches)
            if len(word) >= 3 and word.isalnum():
                parts.append((f'{self._escape_term(word)}*', 'term'))
            else:
                parts.append((self._escape_term(word), 'term'))

        # Join with spaces, adding AND between terms if no operator given
        result_parts = []
        previous_kind = None
        for text, kind in parts:
            if kind == 'term' and previous_kind in ('term', 'negation'):
                result_parts.append('AND')
            elif (kind == 'negation' and previous_kind == 'operator'
                  and result_parts[-1] == 'AND'):
                # NOT is a binary operator in FTS5: "a AND NOT b" is a
                # syntax error, the valid spelling is "a NOT b".
                result_parts.pop()
            result_parts.append(text)
            previous_kind = kind
        return ' '.join(result_parts)

    def _tokenize_query(self, query: str) -> List[str]:
        """Tokenize query into words and operators."""
        # Split on whitespace but preserve quoted content
        tokens = []
        current_token = ""
        in_quotes = False
        for char in query:
            if char == '"':
                in_quotes = not in_quotes
                current_token += char
            elif char.isspace() and not in_quotes:
                if current_token:
                    tokens.append(current_token)
                    current_token = ""
            else:
                current_token += char
        if current_token:
            tokens.append(current_token)
        return tokens

    def _escape_term(self, term: str) -> str:
        """Return *term* in a form that is safe inside an FTS5 expression.

        Valid barewords are returned unchanged. Anything else (punctuation
        such as ``-`` or ``+``, embedded quotes, or a reserved keyword) is
        wrapped in double quotes with embedded quotes doubled, which is
        the only escaping FTS5 understands.
        """
        if not term:
            return term
        if self._BAREWORD_RE.fullmatch(term) and term not in self._BOOLEAN_KEYWORDS:
            return term
        return '"' + term.replace('"', '""') + '"'

    def build_column_query(self, query: str, columns: List[str]) -> str:
        """Build FTS5 query targeting specific columns.

        Returns *query* unchanged when no columns are given, and "" when
        the query is empty.
        """
        if not columns:
            return query
        # Parse the main query
        parsed_query = self.parse_query(query)
        if not parsed_query:
            return ""
        # A column filter binds to the next phrase only; parentheses make
        # it cover every term of a multi-term expression.
        if any(char.isspace() for char in parsed_query):
            parsed_query = f'({parsed_query})'
        # Create column-specific queries
        return ' OR '.join(f'{column}:{parsed_query}' for column in columns)

    def build_phrase_query(self, phrase: str) -> str:
        """Build FTS5 query for exact phrase matching."""
        # Embedded double quotes are escaped SQL-style by doubling them.
        return '"' + phrase.replace('"', '""') + '"'

    def build_proximity_query(self, terms: List[str], distance: int = 10) -> str:
        """Build FTS5 NEAR query for proximity searching."""
        if len(terms) < 2:
            return ' '.join(terms)
        escaped_terms = [self._escape_term(term) for term in terms]
        return f'NEAR({" ".join(escaped_terms)}, {distance})'

    def validate_query(self, query: str) -> Tuple[bool, Optional[str]]:
        """
        Validate FTS5 query syntax.

        Returns:
            Tuple of (is_valid, error_message)
        """
        if not query or not query.strip():
            return False, "Query cannot be empty"
        # Check for balanced quotes
        if query.count('"') % 2 != 0:
            return False, "Unmatched quotes in query"
        # Check for balanced parentheses
        if query.count('(') != query.count(')'):
            return False, "Unmatched parentheses in query"
        # Check for empty operators
        upper_query = query.upper()
        for operator in self.fts_operators:
            if f' {operator} ' in upper_query:
                # Make sure operator isn't at start or end
                if upper_query.startswith(f'{operator} ') or upper_query.endswith(f' {operator}'):
                    return False, f"Operator {operator} cannot be at start or end of query"
        return True, None

    def get_query_terms(self, query: str) -> List[str]:
        """Extract individual search terms from query.

        Returns the unique lower-cased terms in order of first appearance.
        """
        # Parse query and extract terms
        parsed = self.parse_query(query)
        # Remove operators and special syntax
        terms = []
        for token in self._tokenize_query(parsed):
            token = token.strip()
            if not token:
                continue
            # Skip operators
            if token.upper() in self.fts_operators:
                continue
            # Remove NOT prefix
            if token.upper().startswith('NOT '):
                token = token[4:]
            # Remove quotes
            token = token.strip('"')
            # Remove prefix wildcard
            token = token.rstrip('*')
            # Remove column specification
            if ':' in token:
                token = token.split(':', 1)[1]
            if token and len(token) > 1:
                terms.append(token.lower())
        # Remove duplicates while keeping a stable, predictable order
        return list(dict.fromkeys(terms))

    def suggest_corrections(self, query: str, available_terms: List[str]) -> List[str]:
        """Suggest query corrections based on available terms.

        For every query term, up to three available terms containing it
        (case-insensitively) are collected; at most five unique
        suggestions are returned, in the order they were found.
        """
        suggestions = []
        for term in self.get_query_terms(query):
            # Find similar terms using simple string matching
            matches = [available for available in available_terms
                       if term in available.lower()]
            suggestions.extend(matches[:3])  # Limit suggestions
        # Return top 5 unique suggestions
        return list(dict.fromkeys(suggestions))[:5]