Added comprehensive full text search capabilities as a lightweight plugin.

Key features:
- SQLite FTS5-based search engine with no external dependencies
- Automatic indexing via database triggers for real-time updates
- Advanced query support: phrase search, boolean operators, proximity search
- Complete CLI interface with search commands
- Graceful fallback to LIKE queries when FTS5 is unavailable
- Plugin architecture integration for extensibility

CLI Commands:
- `markitect search init` - Initialize search indexes
- `markitect search query` - Perform full text searches
- `markitect search status` - View index statistics
- `markitect search rebuild` - Rebuild indexes from scratch

Search Features:
- Content type filtering (files, schemas, all)
- Result pagination and formatting options
- Query validation and syntax assistance
- Performance optimization and index maintenance

Technical Implementation:
- FTSSearchPlugin: Main search plugin class
- SearchIndexer: FTS5 table management and indexing
- QueryParser: Query optimization and FTS5 syntax conversion
- Comprehensive error handling and fallback mechanisms
- 25 test cases covering all functionality

Documentation includes complete usage guide and examples.

Resolves issue #83: Full text search

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
273 lines
8.7 KiB
Python
273 lines
8.7 KiB
Python
"""
Query parsing and processing for FTS5 full text search.

Handles converting user queries into FTS5-compatible syntax and provides
query validation and enhancement features.
"""
|
|
|
|
import re
|
|
from typing import List, Dict, Any, Optional, Tuple
|
|
|
|
|
|
class QueryParser:
    """Parse, build and validate search queries for SQLite FTS5.

    Operator keywords (AND, OR, NOT, NEAR) are recognised case-insensitively
    in user input and are always emitted in upper case.
    """

    # A whole token of the form ``__PHRASE_<n>__``. It stands in for a quoted
    # phrase while ``_convert_to_fts5`` tokenizes the rest of the query.
    _PHRASE_PLACEHOLDER = re.compile(r'__PHRASE_(\d+)__')

    def __init__(self):
        # FTS5 operators and syntax
        self.fts_operators = ['AND', 'OR', 'NOT', 'NEAR']
        self.fts_special_chars = ['"', '*', '^', '(', ')']

    def parse_query(self, query: str) -> str:
        """
        Parse and convert user query to FTS5-compatible syntax.

        Args:
            query: Raw user search query

        Returns:
            FTS5-compatible query string; an empty string for an empty or
            whitespace-only query.
        """
        if not query or not query.strip():
            return ""

        query = query.strip()

        # Queries that already look like FTS5 are passed through untouched.
        if self._is_fts5_query(query):
            return query

        return self._convert_to_fts5(query)

    def _is_fts5_query(self, query: str) -> bool:
        """Check if query already uses FTS5 syntax.

        A query counts as FTS5 when it contains a space-delimited operator
        keyword (any case), a double quote, a ``*`` or a ``:``.
        """
        upper_query = query.upper()
        if any(f' {operator} ' in upper_query for operator in self.fts_operators):
            return True

        # Quoted phrases, prefix matching and column specifications.
        return any(marker in query for marker in ('"', '*', ':'))

    def _render_term(self, word: str, phrases: List[str]) -> str:
        """Return the preserved phrase for a placeholder, else the escaped word."""
        placeholder = self._PHRASE_PLACEHOLDER.fullmatch(word)
        if placeholder:
            index = int(placeholder.group(1))
            # Placeholder-looking text typed by the user has no matching
            # phrase; it is treated as an ordinary word instead of crashing.
            if index < len(phrases):
                return phrases[index]
        return self._escape_term(word)

    def _convert_to_fts5(self, query: str) -> str:
        """Convert natural language query to FTS5 syntax.

        Quoted phrases are kept verbatim, operator keywords are upper-cased,
        alphanumeric words of three or more characters get a trailing ``*``
        (prefix match), and ``not X`` / ``- X`` becomes ``NOT X``. Adjacent
        terms are joined with an explicit ``AND``.
        """
        phrases: List[str] = []

        def preserve_phrase(match):
            phrases.append(match.group(0))
            # Padding guarantees the placeholder is tokenized on its own even
            # when the phrase touches a neighbouring word.
            return f" __PHRASE_{len(phrases) - 1}__ "

        query = re.sub(r'"([^"]*)"', preserve_phrase, query)
        words = self._tokenize_query(query)

        # Each entry is (kind, text); kind is 'term', 'operator' or 'negation'.
        parts: List[Tuple[str, str]] = []
        i = 0
        while i < len(words):
            word = words[i].strip()
            if not word:
                i += 1
                continue

            # Restore preserved phrases
            placeholder = self._PHRASE_PLACEHOLDER.fullmatch(word)
            if placeholder and int(placeholder.group(1)) < len(phrases):
                parts.append(('term', phrases[int(placeholder.group(1))]))
                i += 1
                continue

            # Handle negation: "not X" or "- X" becomes "NOT X"
            if word.lower() in ('not', '-') and i + 1 < len(words):
                next_word = words[i + 1].strip()
                if next_word and next_word.upper() not in self.fts_operators:
                    operand = self._render_term(next_word, phrases)
                    parts.append(('negation', f'NOT {operand}'))
                    i += 2
                    continue

            # Handle explicit operators (any case in, upper case out)
            if word.upper() in self.fts_operators:
                parts.append(('operator', word.upper()))
                i += 1
                continue

            # Handle prefix matching (add * for partial matches)
            if len(word) >= 3 and word.isalnum():
                parts.append(('term', f'{self._escape_term(word)}*'))
            else:
                parts.append(('term', self._escape_term(word)))
            i += 1

        # FTS5's NOT is a binary operator, so "a AND NOT b" is a syntax
        # error: a negation is appended directly ("a NOT b"). An explicit AND
        # is inserted before a term that follows another term or a negation,
        # so "a NOT b AND c" groups as "(a NOT b) AND c".
        result_parts: List[str] = []
        previous_kind = None
        for kind, text in parts:
            if kind == 'term' and previous_kind in ('term', 'negation'):
                result_parts.append('AND')
            result_parts.append(text)
            previous_kind = kind

        return ' '.join(result_parts)

    def _tokenize_query(self, query: str) -> List[str]:
        """Tokenize query into words and operators.

        Splits on whitespace, except inside double quotes; the quote
        characters stay part of their token.
        """
        tokens: List[str] = []
        current_token = ""
        in_quotes = False

        for char in query:
            if char == '"':
                in_quotes = not in_quotes
                current_token += char
            elif char.isspace() and not in_quotes:
                if current_token:
                    tokens.append(current_token)
                    current_token = ""
            else:
                current_token += char

        if current_token:
            tokens.append(current_token)

        return tokens

    def _escape_term(self, term: str) -> str:
        """Escape special characters in search terms.

        Terms without a double quote are returned unchanged. A term that
        contains one is wrapped in double quotes with every embedded quote
        doubled, which is the escape form FTS5 strings use; backslashes are
        not an escape character in FTS5.
        """
        if '"' not in term:
            return term
        return '"' + term.replace('"', '""') + '"'

    def build_column_query(self, query: str, columns: List[str]) -> str:
        """Build FTS5 query targeting specific columns.

        Args:
            query: Raw user search query
            columns: Column names to restrict the search to

        Returns:
            The raw ``query`` when ``columns`` is empty, an empty string when
            the query parses to nothing, otherwise one ``column:query``
            filter per column joined with ``OR``.
        """
        if not columns:
            return query

        parsed_query = self.parse_query(query)
        if not parsed_query:
            # "column:" with nothing after it is not a usable query.
            return ""

        # Without parentheses the column filter would bind to the first term
        # only, e.g. "title:foo* AND bar*" leaves bar* unrestricted.
        if len(self._tokenize_query(parsed_query)) > 1:
            parsed_query = f'({parsed_query})'

        return ' OR '.join(f'{column}:{parsed_query}' for column in columns)

    def build_phrase_query(self, phrase: str) -> str:
        """Build FTS5 query for exact phrase matching.

        Embedded double quotes are doubled so they cannot end the string early.
        """
        escaped_phrase = phrase.replace('"', '""')
        return f'"{escaped_phrase}"'

    def build_proximity_query(self, terms: List[str], distance: int = 10) -> str:
        """Build FTS5 NEAR query for proximity searching.

        Args:
            terms: Terms that must appear near each other
            distance: Value passed as the second argument of ``NEAR(...)``

        Returns:
            ``NEAR(term1 term2 ..., distance)``, or the terms joined by spaces
            when fewer than two are given.
        """
        if len(terms) < 2:
            return ' '.join(terms)

        escaped_terms = [self._escape_term(term) for term in terms]
        return f'NEAR({" ".join(escaped_terms)}, {distance})'

    def validate_query(self, query: str) -> Tuple[bool, Optional[str]]:
        """
        Validate FTS5 query syntax.

        Checks for an empty query, unbalanced double quotes, unbalanced
        parentheses (outside quoted phrases) and an operator keyword as the
        first or last word.

        Returns:
            Tuple of (is_valid, error_message)
        """
        if not query or not query.strip():
            return False, "Query cannot be empty"

        # Check for balanced quotes
        if query.count('"') % 2 != 0:
            return False, "Unmatched quotes in query"

        # Check for balanced parentheses: order matters (")(" is invalid) and
        # parentheses inside a quoted phrase are literal text.
        depth = 0
        in_quotes = False
        for char in query:
            if char == '"':
                in_quotes = not in_quotes
            elif in_quotes:
                continue
            elif char == '(':
                depth += 1
            elif char == ')':
                depth -= 1
                if depth < 0:
                    break
        if depth != 0:
            return False, "Unmatched parentheses in query"

        # An operator needs an operand on both sides, so it can be neither
        # the first nor the last word, whether or not it also appears in the
        # middle of the query.
        words = query.upper().split()
        for operator in self.fts_operators:
            if words[0] == operator or words[-1] == operator:
                return False, f"Operator {operator} cannot be at start or end of query"

        return True, None

    def get_query_terms(self, query: str) -> List[str]:
        """Extract individual search terms from query.

        Returns:
            Lower-cased terms longer than one character, without operators,
            column prefixes, surrounding quotes or trailing ``*``;
            de-duplicated in order of first appearance.
        """
        terms: List[str] = []

        for token in self._tokenize_query(self.parse_query(query)):
            token = token.strip()

            # Skip blanks and operators
            if not token or token.upper() in self.fts_operators:
                continue

            # Remove column specification; a ':' inside a quoted phrase is
            # part of the phrase, not a column separator.
            if not token.startswith('"') and ':' in token:
                token = token.split(':', 1)[1]

            # Remove prefix wildcard, then surrounding quotes
            token = token.rstrip('*').strip('"')

            if len(token) > 1:
                terms.append(token.lower())

        # dict.fromkeys de-duplicates while keeping a stable order.
        return list(dict.fromkeys(terms))

    def suggest_corrections(self, query: str, available_terms: List[str]) -> List[str]:
        """Suggest query corrections based on available terms.

        For each query term, up to three available terms containing it
        (case-insensitively) are collected.

        Returns:
            At most five unique suggestions, in the order they were found.
        """
        suggestions: List[str] = []

        for term in self.get_query_terms(query):
            needle = term.lower()
            # Substring match also covers the "starts with" case.
            matches = [
                available for available in available_terms
                if needle in available.lower()
            ]
            suggestions.extend(matches[:3])  # Limit suggestions per term

        return list(dict.fromkeys(suggestions))[:5]