""" Query parsing and processing for FTS5 full text search. Handles converting user queries into FTS5-compatible syntax and provides query validation and enhancement features. """ import re from typing import List, Dict, Any, Optional, Tuple class QueryParser: """Parses and processes search queries for FTS5.""" def __init__(self): # FTS5 operators and syntax self.fts_operators = ['AND', 'OR', 'NOT', 'NEAR'] self.fts_special_chars = ['"', '*', '^', '(', ')'] def parse_query(self, query: str) -> str: """ Parse and convert user query to FTS5-compatible syntax. Args: query: Raw user search query Returns: FTS5-compatible query string """ if not query or not query.strip(): return "" # Clean and normalize the query query = query.strip() # If query is already using FTS5 syntax, return as-is if self._is_fts5_query(query): return query # Convert natural language query to FTS5 return self._convert_to_fts5(query) def _is_fts5_query(self, query: str) -> bool: """Check if query already uses FTS5 syntax.""" # Look for FTS5 operators or special syntax for operator in self.fts_operators: if f' {operator} ' in query.upper(): return True # Look for quoted phrases if '"' in query: return True # Look for prefix matching if '*' in query: return True # Look for column specifications if ':' in query: return True return False def _convert_to_fts5(self, query: str) -> str: """Convert natural language query to FTS5 syntax.""" # Handle quoted phrases - preserve them phrases = [] phrase_pattern = r'"([^"]*)"' def preserve_phrase(match): phrases.append(match.group(0)) return f"__PHRASE_{len(phrases) - 1}__" query = re.sub(phrase_pattern, preserve_phrase, query) # Split into words, preserving operators words = self._tokenize_query(query) # Process each word processed_words = [] i = 0 while i < len(words): word = words[i].strip() if not word: i += 1 continue # Restore preserved phrases if word.startswith("__PHRASE_"): phrase_index = int(word.replace("__PHRASE_", "").replace("__", "")) processed_words.append(phrases[phrase_index]) i += 1 continue # Handle negation (convert "not" to NOT) if word.lower() in ['not', '-']: if i + 1 < len(words): next_word = words[i + 1].strip() if next_word and not next_word.upper() in self.fts_operators: processed_words.append(f'NOT {self._escape_term(next_word)}') i += 2 continue # Handle AND/OR operators if word.upper() in self.fts_operators: processed_words.append(word.upper()) i += 1 continue # Handle prefix matching (add * for partial matches) if len(word) >= 3 and word.isalnum(): processed_words.append(f'{self._escape_term(word)}*') else: processed_words.append(self._escape_term(word)) i += 1 # Join with spaces, but add AND between terms if no operator specified result_parts = [] for i, part in enumerate(processed_words): if i > 0 and part.upper() not in self.fts_operators: prev_part = processed_words[i - 1] if prev_part.upper() not in self.fts_operators and not prev_part.startswith('NOT'): result_parts.append('AND') result_parts.append(part) return ' '.join(result_parts) def _tokenize_query(self, query: str) -> List[str]: """Tokenize query into words and operators.""" # Split on whitespace but preserve quoted content tokens = [] current_token = "" in_quotes = False for char in query: if char == '"': in_quotes = not in_quotes current_token += char elif char.isspace() and not in_quotes: if current_token: tokens.append(current_token) current_token = "" else: current_token += char if current_token: tokens.append(current_token) return tokens def _escape_term(self, term: str) -> str: """Escape special characters in search terms.""" # Escape FTS5 special characters for char in ['"']: term = term.replace(char, '\\' + char) return term def build_column_query(self, query: str, columns: List[str]) -> str: """Build FTS5 query targeting specific columns.""" if not columns: return query # Parse the main query parsed_query = self.parse_query(query) # Create column-specific queries column_queries = [] for column in columns: column_queries.append(f'{column}:{parsed_query}') return ' OR '.join(column_queries) def build_phrase_query(self, phrase: str) -> str: """Build FTS5 query for exact phrase matching.""" return f'"{phrase}"' def build_proximity_query(self, terms: List[str], distance: int = 10) -> str: """Build FTS5 NEAR query for proximity searching.""" if len(terms) < 2: return ' '.join(terms) escaped_terms = [self._escape_term(term) for term in terms] return f'NEAR({" ".join(escaped_terms)}, {distance})' def validate_query(self, query: str) -> Tuple[bool, Optional[str]]: """ Validate FTS5 query syntax. Returns: Tuple of (is_valid, error_message) """ if not query or not query.strip(): return False, "Query cannot be empty" # Check for balanced quotes quote_count = query.count('"') if quote_count % 2 != 0: return False, "Unmatched quotes in query" # Check for balanced parentheses open_parens = query.count('(') close_parens = query.count(')') if open_parens != close_parens: return False, "Unmatched parentheses in query" # Check for empty operators for operator in self.fts_operators: if f' {operator} ' in query.upper(): # Make sure operator isn't at start or end if query.upper().startswith(f'{operator} ') or query.upper().endswith(f' {operator}'): return False, f"Operator {operator} cannot be at start or end of query" return True, None def get_query_terms(self, query: str) -> List[str]: """Extract individual search terms from query.""" # Parse query and extract terms parsed = self.parse_query(query) # Remove operators and special syntax terms = [] tokens = self._tokenize_query(parsed) for token in tokens: token = token.strip() if not token: continue # Skip operators if token.upper() in self.fts_operators: continue # Remove NOT prefix if token.upper().startswith('NOT '): token = token[4:] # Remove quotes token = token.strip('"') # Remove prefix wildcard token = token.rstrip('*') # Remove column specification if ':' in token: token = token.split(':', 1)[1] if token and len(token) > 1: terms.append(token.lower()) return list(set(terms)) # Remove duplicates def suggest_corrections(self, query: str, available_terms: List[str]) -> List[str]: """Suggest query corrections based on available terms.""" suggestions = [] query_terms = self.get_query_terms(query) for term in query_terms: # Find similar terms using simple string matching matches = [] for available in available_terms: if available.lower().startswith(term.lower()): matches.append(available) elif term.lower() in available.lower(): matches.append(available) if matches: suggestions.extend(matches[:3]) # Limit suggestions return list(set(suggestions))[:5] # Return top 5 unique suggestions