Files
markitect-main/markitect/plugins/builtin/search/query_parser.py
tegwick 8179929a4a feat: implement lightweight full text search plugin using SQLite FTS5 (issue #83)
Added comprehensive full text search capabilities as a lightweight plugin.

Key features:
- SQLite FTS5-based search engine with no external dependencies
- Automatic indexing via database triggers for real-time updates
- Advanced query support: phrase search, boolean operators, proximity search
- Complete CLI interface with search commands
- Graceful fallback to LIKE queries when FTS5 unavailable
- Plugin architecture integration for extensibility

CLI Commands:
- `markitect search init` - Initialize search indexes
- `markitect search query` - Perform full text searches
- `markitect search status` - View index statistics
- `markitect search rebuild` - Rebuild indexes from scratch

Search Features:
- Content type filtering (files, schemas, all)
- Result pagination and formatting options
- Query validation and syntax assistance
- Performance optimization and index maintenance

Technical Implementation:
- FTSSearchPlugin: Main search plugin class
- SearchIndexer: FTS5 table management and indexing
- QueryParser: Query optimization and FTS5 syntax conversion
- Comprehensive error handling and fallback mechanisms
- 25 test cases covering all functionality

Documentation includes complete usage guide and examples.

Resolves issue #83: Full text search

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-03 17:03:11 +02:00

273 lines
8.7 KiB
Python

"""
Query parsing and processing for FTS5 full text search.
Handles converting user queries into FTS5-compatible syntax and provides
query validation and enhancement features.
"""
import re
from typing import List, Dict, Any, Optional, Tuple
class QueryParser:
    """Parses and processes search queries for FTS5.

    Plain, natural-language input is converted into an FTS5 ``MATCH``
    expression (prefix matching on alphanumeric words, explicit ``AND``
    between terms, ``NOT`` for negations).  Input that already looks like
    FTS5 syntax is passed through unchanged.

    FTS5 rules this class relies on:
      * ``AND``/``OR``/``NOT`` are operators only when written in upper case.
      * ``NOT`` is a binary operator (``a NOT b``); ``a AND NOT b`` is a
        syntax error.
      * A term that is not a bareword must be wrapped in double quotes, and
        a literal double quote inside a quoted string is written ``""``.
    """

    # FTS5 bareword: ASCII letters/digits, underscore, or any non-ASCII char.
    _BAREWORD_RE = re.compile(r'[A-Za-z0-9_\u0080-\U0010ffff]+')
    # A double-quoted phrase as typed by the user.
    _PHRASE_RE = re.compile(r'"([^"]*)"')
    # Placeholder that temporarily stands in for a preserved phrase.
    _PLACEHOLDER_RE = re.compile(r'__PHRASE_(\d+)__')
    # Barewords FTS5 reserves as boolean operators (case-sensitive).
    _BOOLEAN_OPERATORS = ('AND', 'OR', 'NOT')

    def __init__(self):
        # FTS5 operators and syntax
        self.fts_operators = ['AND', 'OR', 'NOT', 'NEAR']
        self.fts_special_chars = ['"', '*', '^', '(', ')']

    def parse_query(self, query: str) -> str:
        """
        Parse and convert user query to FTS5-compatible syntax.

        Args:
            query: Raw user search query

        Returns:
            FTS5-compatible query string; empty string for blank input.
        """
        if not query or not query.strip():
            return ""

        # Clean and normalize the query
        query = query.strip()

        # If query is already using FTS5 syntax, return as-is
        if self._is_fts5_query(query):
            return query

        # Convert natural language query to FTS5
        return self._convert_to_fts5(query)

    def _is_fts5_query(self, query: str) -> bool:
        """Check if query already uses FTS5 syntax.

        Operators are matched case-sensitively: FTS5 only recognises the
        upper-case spellings, so ``cats and dogs`` is natural language and
        must go through the converter (which upper-cases the operator).
        """
        if any(f' {operator} ' in query for operator in self.fts_operators):
            return True

        # Quotes, prefix stars, initial-token markers and grouping
        # parentheses are all explicit FTS5 syntax.
        if any(char in query for char in self.fts_special_chars):
            return True

        # Column specifications
        return ':' in query

    def _convert_to_fts5(self, query: str) -> str:
        """Convert natural language query to FTS5 syntax.

        Quoted phrases are preserved verbatim, ``not x`` / ``- x`` / ``-x``
        become ``NOT x``, operator words are upper-cased, alphanumeric words
        of three or more characters get a trailing ``*`` and everything else
        is quoted as needed.  Terms without an operator between them are
        joined with an explicit ``AND``.

        A negation with nothing before it still yields a leading ``NOT``,
        which FTS5 rejects; there is no positive operand to attach it to.
        """
        phrases: List[str] = []

        def preserve_phrase(match):
            phrases.append(match.group(0))
            return f"__PHRASE_{len(phrases) - 1}__"

        def restore_phrase(match):
            # A placeholder-looking word typed by the user may reference an
            # index that was never stored; leave such text untouched.
            index = int(match.group(1))
            return phrases[index] if index < len(phrases) else match.group(0)

        def render(word: str) -> str:
            # Restore a preserved phrase, otherwise quote/escape the term.
            restored = self._PLACEHOLDER_RE.sub(restore_phrase, word)
            return restored if restored != word else self._escape_term(word)

        query = self._PHRASE_RE.sub(preserve_phrase, query)
        words = self._tokenize_query(query)

        # Each part is (kind, text) with kind in {'term', 'op', 'not'}.
        parts: List[Tuple[str, str]] = []
        i = 0
        while i < len(words):
            word = words[i].strip()
            i += 1
            if not word:
                continue

            # Restore preserved phrases
            restored = self._PLACEHOLDER_RE.sub(restore_phrase, word)
            if restored != word:
                parts.append(('term', restored))
                continue

            # Handle negation: "not x", "- x" and "-x"
            negated = None
            if word.lower() in ('not', '-'):
                if i < len(words):
                    candidate = words[i].strip()
                    if candidate and candidate.upper() not in self.fts_operators:
                        negated = candidate
                        i += 1
            elif word.startswith('-') and len(word) > 1:
                negated = word[1:]

            if negated is not None:
                # "a AND NOT b" is invalid in FTS5 (NOT is binary), so an
                # explicit AND directly before the negation is dropped.
                if parts and parts[-1] == ('op', 'AND'):
                    parts.pop()
                parts.append(('not', f'NOT {render(negated)}'))
                continue

            # Handle AND/OR (and any other listed) operators
            if word.upper() in self.fts_operators:
                parts.append(('op', word.upper()))
                continue

            # Handle prefix matching (add * for partial matches)
            if len(word) >= 3 and word.isalnum():
                parts.append(('term', f'{self._escape_term(word)}*'))
            else:
                parts.append(('term', self._escape_term(word)))

        # Join parts.  AND is only inserted in front of a plain term: a NOT
        # clause attaches directly to what precedes it, while a term that
        # follows a NOT clause needs an explicit AND because implicit AND
        # would bind tighter than NOT and change the grouping.
        result_parts: List[str] = []
        previous_kind = None
        for kind, text in parts:
            if kind == 'term' and previous_kind in ('term', 'not'):
                result_parts.append('AND')
            result_parts.append(text)
            previous_kind = kind

        return ' '.join(result_parts)

    def _tokenize_query(self, query: str) -> List[str]:
        """Tokenize query into words and operators.

        Splits on whitespace, except inside double quotes; the quote
        characters themselves are kept in the token.
        """
        tokens: List[str] = []
        current: List[str] = []
        in_quotes = False

        for char in query:
            if char == '"':
                in_quotes = not in_quotes
                current.append(char)
            elif char.isspace() and not in_quotes:
                if current:
                    tokens.append(''.join(current))
                    current = []
            else:
                current.append(char)

        if current:
            tokens.append(''.join(current))

        return tokens

    def _escape_term(self, term: str) -> str:
        """Make a single search term safe to embed in an FTS5 expression.

        Barewords are returned unchanged.  Anything else (punctuation,
        whitespace, or a reserved operator word) is wrapped in double quotes
        with embedded quotes doubled, which is the FTS5 string escape.
        """
        if self._BAREWORD_RE.fullmatch(term) and term not in self._BOOLEAN_OPERATORS:
            return term
        return '"' + term.replace('"', '""') + '"'

    def build_column_query(self, query: str, columns: List[str]) -> str:
        """Build FTS5 query targeting specific columns.

        Args:
            query: Raw user search query
            columns: Column names to restrict the search to

        Returns:
            ``query`` unchanged when ``columns`` is empty; an empty string
            when the query is blank; otherwise one column filter per column
            joined with ``OR``.
        """
        if not columns:
            return query

        # Parse the main query
        parsed_query = self.parse_query(query)
        if not parsed_query:
            return ""

        # A column filter binds only to the phrase right after it, so a
        # multi-part expression has to be parenthesised to be fully scoped.
        is_single_phrase = (
            parsed_query.startswith('"')
            and parsed_query.endswith('"')
            and parsed_query.count('"') == 2
        )
        has_whitespace = any(char.isspace() for char in parsed_query)
        if has_whitespace and not is_single_phrase:
            scoped_query = f'({parsed_query})'
        else:
            scoped_query = parsed_query

        return ' OR '.join(f'{column}:{scoped_query}' for column in columns)

    def build_phrase_query(self, phrase: str) -> str:
        """Build FTS5 query for exact phrase matching.

        Embedded double quotes are doubled so they cannot end the phrase.
        """
        return '"' + phrase.replace('"', '""') + '"'

    def build_proximity_query(self, terms: List[str], distance: int = 10) -> str:
        """Build FTS5 NEAR query for proximity searching.

        Args:
            terms: Terms or phrases that must appear close together
            distance: Value emitted as the NEAR distance argument

        Returns:
            ``NEAR(t1 t2 ..., distance)``; with fewer than two terms the
            (escaped) terms are returned without a NEAR wrapper.
        """
        escaped_terms = [self._escape_term(term) for term in terms]
        if len(escaped_terms) < 2:
            return ' '.join(escaped_terms)

        return f'NEAR({" ".join(escaped_terms)}, {distance})'

    def validate_query(self, query: str) -> Tuple[bool, Optional[str]]:
        """
        Validate FTS5 query syntax.

        Checks for blank input, unbalanced double quotes, unbalanced
        parentheses (ignoring those inside quoted phrases) and an operator
        word as the first or last token.

        Returns:
            Tuple of (is_valid, error_message)
        """
        if not query or not query.strip():
            return False, "Query cannot be empty"

        # Check for balanced quotes
        if query.count('"') % 2 != 0:
            return False, "Unmatched quotes in query"

        # Check for balanced parentheses; quoted text is literal content
        unquoted = self._PHRASE_RE.sub(' ', query)
        if unquoted.count('(') != unquoted.count(')'):
            return False, "Unmatched parentheses in query"

        # An operator needs an operand on both sides, so it can never be the
        # first or last token, whether or not it also appears mid-query.
        tokens = query.upper().split()
        for operator in self.fts_operators:
            if tokens[0] == operator or tokens[-1] == operator:
                return False, f"Operator {operator} cannot be at start or end of query"

        return True, None

    def get_query_terms(self, query: str) -> List[str]:
        """Extract individual search terms from query.

        Returns:
            Lower-cased unique terms longer than one character, in order of
            first appearance.  Operators, column prefixes, quotes, prefix
            stars, grouping parentheses and the NEAR distance are removed.
        """
        parsed = self.parse_query(query)

        terms: List[str] = []
        in_near = False
        previous_had_comma = False

        for raw in self._tokenize_query(parsed):
            token = raw.strip()
            if not token:
                continue

            # Skip operators
            if token.upper() in self.fts_operators:
                continue

            if token.startswith('NEAR('):
                in_near = True
                token = token[len('NEAR('):]

            # Inside NEAR(...), the number after the comma is the distance
            is_distance = (
                in_near and previous_had_comma and token.strip('()').isdigit()
            )
            previous_had_comma = token.endswith(',')
            if token.endswith(')'):
                in_near = False
            if is_distance:
                continue

            # Remove grouping punctuation
            token = token.strip('(),')

            # Remove column specification (a quoted phrase may contain ':')
            if ':' in token and not token.startswith('"'):
                token = token.split(':', 1)[1].strip('()')

            # Remove prefix wildcard, then quotes, then undo quote doubling
            token = token.rstrip('*').strip('"').replace('""', '"')

            if len(token) > 1:
                terms.append(token.lower())

        # Remove duplicates while keeping a deterministic order
        return list(dict.fromkeys(terms))

    def suggest_corrections(self, query: str, available_terms: List[str]) -> List[str]:
        """Suggest query corrections based on available terms.

        For each query term, up to three available terms that contain it
        (case-insensitively) are collected in the order they appear in
        ``available_terms``.

        Returns:
            At most five unique suggestions, in collection order.
        """
        candidates = [(term, term.lower()) for term in available_terms]

        suggestions: List[str] = []
        for term in self.get_query_terms(query):
            # get_query_terms already lower-cases its output
            matches = [original for original, lowered in candidates if term in lowered]
            suggestions.extend(matches[:3])  # Limit suggestions per term

        return list(dict.fromkeys(suggestions))[:5]