feat: implement lightweight full text search plugin using SQLite FTS5 (issue #83)
Added comprehensive full text search capabilities as a lightweight plugin. Key features: - SQLite FTS5-based search engine with no external dependencies - Automatic indexing via database triggers for real-time updates - Advanced query support: phrase search, boolean operators, proximity search - Complete CLI interface with search commands - Graceful fallback to LIKE queries when FTS5 unavailable - Plugin architecture integration for extensibility CLI Commands: - `markitect search init` - Initialize search indexes - `markitect search query` - Perform full text searches - `markitect search status` - View index statistics - `markitect search rebuild` - Rebuild indexes from scratch Search Features: - Content type filtering (files, schemas, all) - Result pagination and formatting options - Query validation and syntax assistance - Performance optimization and index maintenance Technical Implementation: - FTSSearchPlugin: Main search plugin class - SearchIndexer: FTS5 table management and indexing - QueryParser: Query optimization and FTS5 syntax conversion - Comprehensive error handling and fallback mechanisms - 25 test cases covering all functionality Documentation includes complete usage guide and examples. Resolves issue #83: Full text search 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
273
markitect/plugins/builtin/search/query_parser.py
Normal file
273
markitect/plugins/builtin/search/query_parser.py
Normal file
@@ -0,0 +1,273 @@
|
||||
"""
|
||||
Query parsing and processing for FTS5 full text search.
|
||||
|
||||
Handles converting user queries into FTS5-compatible syntax and provides
|
||||
query validation and enhancement features.
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import List, Dict, Any, Optional, Tuple
|
||||
|
||||
|
||||
class QueryParser:
    """Parses and processes search queries for FTS5.

    Free-text input is converted into an FTS5 expression: terms are joined
    with explicit ``AND``, alphanumeric terms of three or more characters get
    a prefix wildcard, and anything that is not a plain FTS5 bareword is
    wrapped in double quotes. Input that already looks like FTS5 syntax is
    passed through untouched.
    """

    # Characters FTS5 accepts in an unquoted bareword: ASCII letters/digits,
    # underscore, the substitute character (26) and all non-ASCII characters.
    _BAREWORD_RE = re.compile(r'[0-9A-Za-z_\x1a\u0080-\U0010ffff]+')

    # A token is either a complete double-quoted phrase or a run of
    # characters containing neither whitespace nor a double quote.
    _TOKEN_RE = re.compile(r'"[^"]*"|[^\s"]+')

    def __init__(self):
        # FTS5 operators and syntax
        self.fts_operators = ['AND', 'OR', 'NOT', 'NEAR']
        self.fts_special_chars = ['"', '*', '^', '(', ')']

    def parse_query(self, query: str) -> str:
        """
        Parse and convert user query to FTS5-compatible syntax.

        Args:
            query: Raw user search query

        Returns:
            FTS5-compatible query string; an empty string when ``query`` is
            empty or whitespace only.
        """
        if not query or not query.strip():
            return ""

        query = query.strip()

        # Queries that already use FTS5 syntax are trusted as written.
        if self._is_fts5_query(query):
            return query

        return self._convert_to_fts5(query)

    def _is_fts5_query(self, query: str) -> bool:
        """Check if query already uses FTS5 syntax.

        Operators are matched case-sensitively because FTS5 itself only
        recognises the upper-case spellings; a lower-case "and"/"or"/"not"
        is natural language and must go through conversion instead.
        """
        # An operator only counts when it sits between two other tokens.
        interior_tokens = query.split()[1:-1]
        if any(token in self.fts_operators for token in interior_tokens):
            return True

        # Quotes, prefix stars, initial-token markers and grouping/NEAR
        # parentheses are all explicit FTS5 syntax.
        if any(char in query for char in self.fts_special_chars):
            return True

        # Column specifications
        return ':' in query

    def _convert_to_fts5(self, query: str) -> str:
        """Convert natural language query to FTS5 syntax.

        Quoted phrases are kept verbatim, "not"/"-" followed by a term
        becomes the binary ``NOT`` operator, operator words are upper-cased,
        and an explicit ``AND`` is inserted between adjacent terms. Stray
        unbalanced double quotes are dropped.
        """
        tokens = self._TOKEN_RE.findall(query)
        output: List[str] = []

        def render(token: str, allow_prefix: bool) -> str:
            # Quoted phrases are already valid FTS5 strings.
            if token.startswith('"'):
                return token
            # Prefix matching for reasonably long plain words.
            if allow_prefix and len(token) >= 3 and token.isalnum():
                return f'{token}*'
            return self._escape_term(token)

        index = 0
        while index < len(tokens):
            token = tokens[index]
            following = tokens[index + 1] if index + 1 < len(tokens) else None

            # Negation: "not term" / "- term".
            if (
                token.lower() in ('not', '-')
                and following is not None
                and following.upper() not in self.fts_operators
            ):
                # FTS5 NOT is a binary operator, so "x AND NOT y" is a
                # syntax error; "x NOT y" expresses the same thing.
                if output and output[-1] == 'AND':
                    output[-1] = 'NOT'
                else:
                    output.append('NOT')
                output.append(render(following, allow_prefix=False))
                index += 2
                continue

            # Explicit operators are normalised to upper case.
            if token.upper() in self.fts_operators:
                output.append(token.upper())
                index += 1
                continue

            # Plain term or phrase: join to the previous term with AND.
            if output and output[-1] not in self.fts_operators:
                output.append('AND')
            output.append(render(token, allow_prefix=True))
            index += 1

        return ' '.join(output)

    def _tokenize_query(self, query: str) -> List[str]:
        """Tokenize query into words and operators.

        Splits on whitespace, except that whitespace between double quotes
        is kept so a quoted phrase stays in a single token (quotes included).
        """
        tokens: List[str] = []
        buffer: List[str] = []
        in_quotes = False

        for char in query:
            if char == '"':
                in_quotes = not in_quotes
                buffer.append(char)
            elif char.isspace() and not in_quotes:
                if buffer:
                    tokens.append(''.join(buffer))
                    buffer = []
            else:
                buffer.append(char)

        if buffer:
            tokens.append(''.join(buffer))

        return tokens

    def _escape_term(self, term: str) -> str:
        """Escape special characters in search terms.

        A term made only of FTS5 bareword characters (and that is not an
        operator keyword) is returned unchanged. Anything else is returned
        as an FTS5 string: wrapped in double quotes with embedded double
        quotes doubled. An empty term is returned unchanged.
        """
        if not term:
            return term
        if self._BAREWORD_RE.fullmatch(term) and term not in self.fts_operators:
            return term
        return '"' + term.replace('"', '""') + '"'

    def build_column_query(self, query: str, columns: List[str]) -> str:
        """Build FTS5 query targeting specific columns.

        Returns ``query`` unchanged when ``columns`` is empty, and an empty
        string when the parsed query is empty. Multi-token queries are
        parenthesised so the column filter applies to the whole expression
        rather than only its first term.
        """
        if not columns:
            return query

        parsed_query = self.parse_query(query)
        if not parsed_query:
            return ""

        if len(self._tokenize_query(parsed_query)) > 1:
            parsed_query = f'({parsed_query})'

        return ' OR '.join(f'{column}:{parsed_query}' for column in columns)

    def build_phrase_query(self, phrase: str) -> str:
        """Build FTS5 query for exact phrase matching.

        Embedded double quotes are doubled so the phrase stays a single
        FTS5 string.
        """
        escaped = phrase.replace('"', '""')
        return f'"{escaped}"'

    def build_proximity_query(self, terms: List[str], distance: int = 10) -> str:
        """Build FTS5 NEAR query for proximity searching.

        With fewer than two terms there is nothing to be "near", so the
        terms are returned joined by spaces.
        """
        if len(terms) < 2:
            return ' '.join(terms)

        escaped_terms = [self._escape_term(term) for term in terms]
        return f'NEAR({" ".join(escaped_terms)}, {distance})'

    def validate_query(self, query: str) -> Tuple[bool, Optional[str]]:
        """
        Validate FTS5 query syntax.

        Checks for an empty query, unbalanced double quotes, unbalanced or
        mis-ordered parentheses (ignoring any inside quoted phrases), and an
        operator word as the first or last token.

        Returns:
            Tuple of (is_valid, error_message)
        """
        if not query or not query.strip():
            return False, "Query cannot be empty"

        # Balanced quotes
        if query.count('"') % 2 != 0:
            return False, "Unmatched quotes in query"

        # Balanced parentheses, tracked by depth so ")(" is rejected.
        depth = 0
        in_quotes = False
        for char in query:
            if char == '"':
                in_quotes = not in_quotes
            elif in_quotes:
                continue
            elif char == '(':
                depth += 1
            elif char == ')':
                depth -= 1
                if depth < 0:
                    return False, "Unmatched parentheses in query"
        if depth != 0:
            return False, "Unmatched parentheses in query"

        # Operators need an operand on both sides.
        words = query.upper().split()
        for operator in self.fts_operators:
            if words[0] == operator or words[-1] == operator:
                return False, f"Operator {operator} cannot be at start or end of query"

        return True, None

    def get_query_terms(self, query: str) -> List[str]:
        """Extract individual search terms from query.

        Returns lower-cased terms longer than one character, de-duplicated
        in order of first appearance, with operators, quotes, wildcards,
        grouping parentheses and column prefixes removed.
        """
        parsed = self.parse_query(query)

        terms: List[str] = []
        for token in self._tokenize_query(parsed):
            token = token.strip()
            if not token:
                continue

            # Skip operators
            if token.upper() in self.fts_operators:
                continue

            # Remove NOT prefix
            if token.upper().startswith('NOT '):
                token = token[4:]

            # Remove grouping parentheses and initial-token markers
            token = token.strip('()^')

            # Remove column specification (a colon inside a phrase is text)
            if ':' in token and not token.startswith('"'):
                token = token.split(':', 1)[1].lstrip('(^')

            # Remove prefix wildcard and quotes
            token = token.rstrip('*').strip('"').rstrip('*')

            if token and len(token) > 1:
                terms.append(token.lower())

        # dict preserves insertion order, giving a deterministic result.
        return list(dict.fromkeys(terms))

    def suggest_corrections(self, query: str, available_terms: List[str]) -> List[str]:
        """Suggest query corrections based on available terms.

        For each query term, up to three available terms containing it
        (case-insensitively) are collected in the order given; at most five
        unique suggestions are returned, in the order found.
        """
        suggestions: List[str] = []

        for term in self.get_query_terms(query):
            needle = term.lower()
            # Substring containment also covers prefix matches.
            matches = [
                available
                for available in available_terms
                if needle in available.lower()
            ]
            suggestions.extend(matches[:3])  # Limit suggestions per term

        return list(dict.fromkeys(suggestions))[:5]
||||
Reference in New Issue
Block a user