# markitect-main/markitect/plugins/builtin/search/query_parser.py

"""
Query parsing and processing for FTS5 full text search.

Handles converting user queries into FTS5-compatible syntax and provides
query validation and enhancement features.
"""

import re
from typing import List, Dict, Any, Optional, Tuple


class QueryParser:
    """Parses and processes search queries for FTS5.

    Free-text input is turned into an FTS5 ``MATCH`` expression: plain words
    are AND-ed together, alphanumeric words of three or more characters
    become prefix matches, and anything that is not a legal FTS5 bareword is
    emitted as a double-quoted string.  Input that already uses FTS5 syntax
    is passed through untouched.
    """

    def __init__(self):
        # FTS5 operators and syntax
        self.fts_operators = ['AND', 'OR', 'NOT', 'NEAR']
        self.fts_special_chars = ['"', '*', '^', '(', ')']

    def parse_query(self, query: str) -> str:
        """
        Parse and convert user query to FTS5-compatible syntax.

        Args:
            query: Raw user search query

        Returns:
            FTS5-compatible query string, or an empty string when the input
            is empty or whitespace only.
        """
        if not query or not query.strip():
            return ""

        query = query.strip()

        # Explicit FTS5 syntax is trusted and handed back unchanged.
        if self._is_fts5_query(query):
            return query

        return self._convert_to_fts5(query)

    def _is_fts5_query(self, query: str) -> bool:
        """Check if query already uses FTS5 syntax.

        A query counts as FTS5 when it contains a space-delimited operator
        keyword, a double quote, a ``*`` or a ``:``.

        Args:
            query: Stripped user query.

        Returns:
            True if the query should be passed to FTS5 unchanged.
        """
        # FTS5 keywords are case-sensitive: lower-case "and"/"or"/"not" are
        # ordinary words, so only upper-case spellings mark explicit syntax.
        if any(f' {operator} ' in query for operator in self.fts_operators):
            return True

        # Quoted phrases, prefix markers and column filters.
        return any(marker in query for marker in ('"', '*', ':'))

    def _format_operand(self, token: str) -> str:
        """Render one token as an FTS5 operand.

        A token that is a complete double-quoted phrase is kept verbatim;
        anything else goes through ``_escape_term``.
        """
        is_phrase = (
            len(token) >= 2
            and token[0] == '"'
            and token[-1] == '"'
            and '"' not in token[1:-1]
        )
        return token if is_phrase else self._escape_term(token)

    def _convert_to_fts5(self, query: str) -> str:
        """Convert natural language query to FTS5 syntax.

        Rules applied to the whitespace-separated tokens (quoted phrases stay
        a single token):

        * ``and`` / ``or`` / ``not`` / ``near`` in any case become the
          upper-case operator.
        * ``not X``, ``- X`` and ``-X`` become ``NOT X``.
        * Alphanumeric words of three or more characters become prefix
          matches (``word*``).
        * Adjacent terms with no operator between them are joined by ``AND``.

        FTS5 only has a binary ``NOT``; a negation with no term to its left
        (for example ``not foo``) is still emitted as ``NOT foo`` and will be
        rejected by FTS5.

        Args:
            query: Stripped, non-empty user query.

        Returns:
            The converted query string.
        """
        tokens = [tok.strip() for tok in self._tokenize_query(query)]
        tokens = [tok for tok in tokens if tok]

        # Each part is (kind, text) with kind in {'op', 'term', 'not'}.
        parts: List[Tuple[str, str]] = []
        i = 0
        while i < len(tokens):
            word = tokens[i]

            # Work out whether this token introduces a negated operand.
            negated = None
            consumed = 0
            if word.lower() in ('not', '-'):
                if i + 1 < len(tokens):
                    negated = tokens[i + 1]
                    consumed = 2
            elif word.startswith('-') and len(word) > 1:
                negated = word[1:]
                consumed = 1

            if negated is not None and negated.upper() not in self.fts_operators:
                parts.append(('not', self._format_operand(negated)))
                i += consumed
                continue

            if word.upper() in self.fts_operators:
                parts.append(('op', word.upper()))
            elif len(word) >= 3 and word.isalnum():
                # Prefix matching for partial matches.
                parts.append(('term', f'{self._escape_term(word)}*'))
            else:
                parts.append(('term', self._format_operand(word)))
            i += 1

        result_parts: List[str] = []
        prev_kind = None
        for kind, text in parts:
            if kind == 'not':
                # "a AND NOT b" is a syntax error in FTS5; "a NOT b" means
                # the same thing, so a preceding explicit AND is dropped.
                if prev_kind == 'op' and result_parts[-1] == 'AND':
                    result_parts.pop()
                result_parts.append(f'NOT {text}')
            elif kind == 'term':
                # An explicit AND is required after a negated term too:
                # implicit AND binds tighter than NOT and would pull the
                # following term into the NOT operand.
                if prev_kind in ('term', 'not'):
                    result_parts.append('AND')
                result_parts.append(text)
            else:
                result_parts.append(text)
            prev_kind = kind

        return ' '.join(result_parts)

    def _tokenize_query(self, query: str) -> List[str]:
        """Tokenize query into words and operators.

        Splits on whitespace, except inside double quotes; the quote
        characters stay part of the token.

        Args:
            query: Query text to split.

        Returns:
            List of non-empty tokens in input order.
        """
        tokens: List[str] = []
        buffer: List[str] = []
        in_quotes = False

        for char in query:
            if char == '"':
                in_quotes = not in_quotes
                buffer.append(char)
            elif char.isspace() and not in_quotes:
                if buffer:
                    tokens.append(''.join(buffer))
                    buffer = []
            else:
                buffer.append(char)

        if buffer:
            tokens.append(''.join(buffer))

        return tokens

    def _escape_term(self, term: str) -> str:
        """Escape special characters in search terms.

        FTS5 accepts a term unquoted only when it is a "bareword": ASCII
        letters and digits, underscore and non-ASCII characters, and not one
        of the operator keywords.  Any other term is wrapped in double quotes
        with embedded quotes doubled, which is the FTS5 string escape (a
        backslash is not an escape character in FTS5).

        Args:
            term: Raw search term.

        Returns:
            The term in a form that is safe to embed in an FTS5 query.  An
            empty term is returned unchanged.
        """
        if not term:
            return term

        is_bareword = all(
            ch == '_' or ch.isalnum() or ord(ch) > 127 for ch in term
        )
        if is_bareword and term not in self.fts_operators:
            return term

        return '"' + term.replace('"', '""') + '"'

    def build_column_query(self, query: str, columns: List[str]) -> str:
        """Build FTS5 query targeting specific columns.

        Args:
            query: Raw user search query.
            columns: Column names to restrict the search to.

        Returns:
            ``query`` unchanged when ``columns`` is empty; an empty string
            when the query parses to nothing; otherwise one column-filtered
            copy of the parsed query per column, joined by ``OR``.
        """
        if not columns:
            return query

        parsed_query = self.parse_query(query)
        if not parsed_query:
            return ""

        # A column filter binds to the single phrase that follows it, so a
        # multi-part expression must be parenthesised for the filter to
        # cover every term.
        if len(parsed_query.split()) > 1:
            target = f'({parsed_query})'
        else:
            target = parsed_query

        column_queries = [
            f'{self._escape_term(column)}:{target}' for column in columns
        ]
        return ' OR '.join(column_queries)

    def build_phrase_query(self, phrase: str) -> str:
        """Build FTS5 query for exact phrase matching.

        Embedded double quotes are doubled so the phrase remains one FTS5
        string.
        """
        escaped = phrase.replace('"', '""')
        return f'"{escaped}"'

    def build_proximity_query(self, terms: List[str], distance: int = 10) -> str:
        """Build FTS5 NEAR query for proximity searching.

        Args:
            terms: Terms that must occur close to each other.
            distance: Maximum token distance passed to ``NEAR``.

        Returns:
            ``NEAR(t1 t2 ..., distance)`` for two or more terms; with fewer
            terms, the escaped terms joined by a space.
        """
        escaped_terms = [self._escape_term(term) for term in terms]
        if len(escaped_terms) < 2:
            return ' '.join(escaped_terms)

        return f'NEAR({" ".join(escaped_terms)}, {distance})'

    def validate_query(self, query: str) -> Tuple[bool, Optional[str]]:
        """
        Validate FTS5 query syntax.

        Checks for empty input, unbalanced double quotes, unbalanced or
        misordered parentheses (ignoring parentheses inside quoted strings)
        and a binary operator at the very start or end of the query.

        Returns:
            Tuple of (is_valid, error_message)
        """
        if not query or not query.strip():
            return False, "Query cannot be empty"

        text = query.strip()

        # Check for balanced quotes
        if text.count('"') % 2 != 0:
            return False, "Unmatched quotes in query"

        # Check for balanced parentheses; quoted text is string content and
        # does not take part in grouping.
        depth = 0
        in_quotes = False
        for char in text:
            if char == '"':
                in_quotes = not in_quotes
            elif not in_quotes:
                if char == '(':
                    depth += 1
                elif char == ')':
                    depth -= 1
                    if depth < 0:
                        return False, "Unmatched parentheses in query"
        if depth != 0:
            return False, "Unmatched parentheses in query"

        # Binary operators need an operand on both sides.  NEAR is skipped:
        # it is written as a NEAR(...) group rather than as an infix operator.
        words = text.upper().split()
        for operator in self.fts_operators:
            if operator == 'NEAR':
                continue
            if words[0] == operator or words[-1] == operator:
                return False, f"Operator {operator} cannot be at start or end of query"

        return True, None

    def get_query_terms(self, query: str) -> List[str]:
        """Extract individual search terms from query.

        The query is parsed first; operators, grouping parentheses, column
        prefixes, surrounding quotes and prefix wildcards are then removed.

        Args:
            query: Raw user search query.

        Returns:
            Unique lower-cased terms longer than one character, in order of
            first appearance.
        """
        parsed = self.parse_query(query)

        # A dict is used as an insertion-ordered set.
        terms: Dict[str, None] = {}
        for raw in self._tokenize_query(parsed):
            token = raw.strip().strip('()')
            if not token or token.upper() in self.fts_operators:
                continue

            # Remove the column specification before unquoting, so that
            # col:"a b" yields 'a b'.  A colon inside a quoted phrase is text.
            if ':' in token and not token.startswith('"'):
                token = token.split(':', 1)[1].strip('()')

            # Remove prefix wildcard, then surrounding quotes.
            token = token.rstrip('*')
            if len(token) >= 2 and token.startswith('"') and token.endswith('"'):
                token = token[1:-1].replace('""', '"')
            else:
                token = token.strip('"')
            token = token.rstrip('*')

            if len(token) > 1:
                terms[token.lower()] = None

        return list(terms)

    def suggest_corrections(self, query: str, available_terms: List[str]) -> List[str]:
        """Suggest query corrections based on available terms.

        An available term is suggested when it contains a query term
        (case-insensitive).  At most three suggestions are taken per query
        term and at most five are returned overall.

        Args:
            query: Raw user search query.
            available_terms: Candidate terms to suggest from.

        Returns:
            Up to five unique suggestions, ordered by query term and then by
            position in ``available_terms``.
        """
        lowered = [(candidate, candidate.lower()) for candidate in available_terms]

        suggestions: List[str] = []
        for term in self.get_query_terms(query):
            needle = term.lower()
            matches = dict.fromkeys(
                candidate for candidate, low in lowered if needle in low
            )
            suggestions.extend(list(matches)[:3])  # Limit suggestions

        # Order-preserving de-duplication keeps the result deterministic.
        return list(dict.fromkeys(suggestions))[:5]