feat: implement lightweight full text search plugin using SQLite FTS5 (issue #83)

Added comprehensive full text search capabilities as a lightweight plugin.

Key features:
- SQLite FTS5-based search engine with no external dependencies
- Automatic indexing via database triggers for real-time updates
- Advanced query support: phrase search, boolean operators, proximity search
- Complete CLI interface with search commands
- Graceful fallback to LIKE queries when FTS5 unavailable
- Plugin architecture integration for extensibility

CLI Commands:
- `markitect search init` - Initialize search indexes
- `markitect search query` - Perform full text searches
- `markitect search status` - View index statistics
- `markitect search rebuild` - Rebuild indexes from scratch

Search Features:
- Content type filtering (files, schemas, all)
- Result pagination and formatting options
- Query validation and syntax assistance
- Performance optimization and index maintenance

Technical Implementation:
- FTSSearchPlugin: Main search plugin class
- SearchIndexer: FTS5 table management and indexing
- QueryParser: Query optimization and FTS5 syntax conversion
- Comprehensive error handling and fallback mechanisms
- 25 test cases covering all functionality

Documentation includes complete usage guide and examples.

Resolves issue #83: Full text search

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-03 17:03:11 +02:00
parent 2a15dde228
commit 8179929a4a
7 changed files with 1994 additions and 0 deletions
--- a/markitect/plugins/builtin/search/query_parser.py
+++ b/markitect/plugins/builtin/search/query_parser.py
@@ -0,0 +1,273 @@
+"""
+Query parsing and processing for FTS5 full text search.
+
+Handles converting user queries into FTS5-compatible syntax and provides
+query validation and enhancement features.
+"""
+
+import re
+from typing import List, Dict, Any, Optional, Tuple
+
+
class QueryParser:
    """Translate user search input into SQLite FTS5 MATCH expressions.

    Plain input is converted: words are AND-ed together, lower-case
    ``and``/``or``/``not`` become FTS5 operators, and alphanumeric words of
    three or more characters get a trailing ``*`` prefix wildcard. Input that
    already contains FTS5 syntax is passed through unchanged.
    """

    # Infix operators emitted by the converter. FTS5 only recognises these
    # keywords in upper case; NEAR is function-like (``NEAR(a b, n)``) and is
    # therefore not treated as an infix operator.
    _BOOLEAN_OPERATORS = ('AND', 'OR', 'NOT')

    # An FTS5 "bareword" may be written without quotes: ASCII letters and
    # digits, underscore, the substitute character (0x1A) and any non-ASCII
    # character. Everything else has to be wrapped in double quotes.
    _BAREWORD_RE = re.compile(r'[0-9A-Za-z_\x1a\u0080-\U0010ffff]+')

    # A complete FTS5 string literal; an embedded quote is written as "".
    _QUOTED_RE = re.compile(r'"(?:[^"]|"")*"')

    # Start of a NEAR group, e.g. ``NEAR(one two, 5)``.
    _NEAR_GROUP_RE = re.compile(r'\bNEAR\s*\(')

    # Used to pull terms out of a finished expression: group 1 is the content
    # of a string literal, group 2 is a bareword.
    _LEXEME_RE = re.compile(
        r'"((?:[^"]|"")*)"|([0-9A-Za-z_\x1a\u0080-\U0010ffff]+)'
    )

    # Optional whitespace followed by the colon of a column filter.
    _COLUMN_SUFFIX_RE = re.compile(r'\s*:')

    def __init__(self):
        # FTS5 operators and syntax
        self.fts_operators = ['AND', 'OR', 'NOT', 'NEAR']
        self.fts_special_chars = ['"', '*', '^', '(', ')']

    def parse_query(self, query: str) -> str:
        """
        Parse and convert user query to FTS5-compatible syntax.

        Args:
            query: Raw user search query

        Returns:
            FTS5-compatible query string; an empty string for empty or
            whitespace-only input.
        """
        if not query or not query.strip():
            return ""

        query = query.strip()

        # Explicit FTS5 syntax is trusted and handed back untouched.
        if self._is_fts5_query(query):
            return query

        return self._convert_to_fts5(query)

    def _is_fts5_query(self, query: str) -> bool:
        """Check if query already uses FTS5 syntax.

        Operators are matched case-sensitively because FTS5 itself only
        treats the upper-case spellings as keywords; lower-case "and" is an
        ordinary word and is left to the converter.
        """
        if any(f' {operator} ' in query for operator in self.fts_operators):
            return True

        # NEAR groups carry no surrounding spaces, so look for "NEAR(".
        if self._NEAR_GROUP_RE.search(query):
            return True

        # Phrases, prefix wildcards, initial-token markers, column filters.
        return any(marker in query for marker in ('"', '*', '^', ':'))

    def _convert_to_fts5(self, query: str) -> str:
        """Convert natural language query to FTS5 syntax.

        Rules applied per whitespace-separated token (quoted phrases stay in
        one piece because the tokenizer does not split inside quotes):

        * ``not X`` / ``- X`` becomes ``NOT X``;
        * ``and`` / ``or`` / ``not`` (any case) become operators;
        * alphanumeric words of 3+ characters get a ``*`` prefix wildcard;
        * anything else is emitted as a bareword or a quoted string.

        Terms without an operator between them are joined with ``AND``.
        FTS5's NOT is a binary operator, so no ``AND`` is placed in front of
        a negation (``a AND NOT b`` is a syntax error in FTS5). A query that
        starts with a negation has no left operand and FTS5 will reject it.
        """
        stripped = (token.strip() for token in self._tokenize_query(query))
        tokens = [token for token in stripped if token]

        # Each entry is (kind, text) with kind in operator/negation/term.
        parts: List[Tuple[str, str]] = []
        index = 0
        while index < len(tokens):
            word = tokens[index]
            follower = tokens[index + 1] if index + 1 < len(tokens) else None

            if (
                word.lower() in ('not', '-')
                and follower is not None
                and follower.upper() not in self._BOOLEAN_OPERATORS
            ):
                parts.append(('negation', f'NOT {self._as_operand(follower)}'))
                index += 2
                continue

            if word.upper() in self._BOOLEAN_OPERATORS:
                parts.append(('operator', word.upper()))
            elif len(word) >= 3 and word.isalnum():
                # Alphanumeric words are always valid barewords.
                parts.append(('term', f'{word}*'))
            else:
                parts.append(('term', self._as_operand(word)))
            index += 1

        output: List[str] = []
        previous_kind = None
        for kind, text in parts:
            if kind == 'negation':
                # "a and not b" -> "a NOT b": drop the AND that precedes NOT.
                if previous_kind == 'operator' and output[-1] == 'AND':
                    output.pop()
            elif kind == 'term' and previous_kind in ('term', 'negation'):
                # An explicit AND after a negation keeps "a NOT b c" grouped
                # as (a NOT b) AND c; FTS5's implicit AND binds tighter than
                # NOT and would group it as a NOT (b AND c).
                output.append('AND')
            output.append(text)
            previous_kind = kind

        return ' '.join(output)

    def _as_operand(self, token: str) -> str:
        """Return token as an FTS5 operand, keeping quoted phrases as-is."""
        if self._QUOTED_RE.fullmatch(token):
            return token
        return self._escape_term(token)

    def _tokenize_query(self, query: str) -> List[str]:
        """Tokenize query into words and operators.

        Splits on whitespace, except inside double quotes; the quote
        characters stay part of the token.
        """
        tokens: List[str] = []
        current: List[str] = []
        in_quotes = False

        for char in query:
            if char.isspace() and not in_quotes:
                if current:
                    tokens.append(''.join(current))
                    current = []
                continue
            if char == '"':
                in_quotes = not in_quotes
            current.append(char)

        if current:
            tokens.append(''.join(current))

        return tokens

    def _escape_term(self, term: str) -> str:
        """Return term in a form FTS5 reads as one literal string.

        Valid barewords are returned unchanged. Anything else, including the
        reserved words AND/OR/NOT, is wrapped in double quotes with embedded
        quotes doubled, which is the only escape FTS5 supports.
        """
        if self._BAREWORD_RE.fullmatch(term) and term not in self._BOOLEAN_OPERATORS:
            return term
        return '"' + term.replace('"', '""') + '"'

    def build_column_query(self, query: str, columns: List[str]) -> str:
        """Build FTS5 query targeting specific columns.

        Args:
            query: Raw user search query
            columns: Column names to restrict the search to

        Returns:
            ``query`` unchanged when ``columns`` is empty, an empty string
            when the query parses to nothing, otherwise one column filter
            per column joined with ``OR``.
        """
        if not columns:
            return query

        parsed_query = self.parse_query(query)
        if not parsed_query:
            return ""

        # A bare "col:a AND b" filters only "a"; parentheses make the
        # column filter apply to the whole expression.
        if len(self._tokenize_query(parsed_query)) > 1:
            parsed_query = f'({parsed_query})'

        return ' OR '.join(f'{column}:{parsed_query}' for column in columns)

    def build_phrase_query(self, phrase: str) -> str:
        """Build FTS5 query for exact phrase matching.

        Embedded double quotes are doubled so the result stays one string.
        """
        return '"' + phrase.replace('"', '""') + '"'

    def build_proximity_query(self, terms: List[str], distance: int = 10) -> str:
        """Build FTS5 NEAR query for proximity searching.

        Args:
            terms: Terms that must occur close to each other
            distance: Value passed as the NEAR group's distance argument

        Returns:
            ``NEAR(t1 t2 ..., distance)``; with fewer than two terms the
            escaped term(s) are returned without a NEAR group.
        """
        escaped_terms = [self._escape_term(term) for term in terms]
        if len(escaped_terms) < 2:
            return ' '.join(escaped_terms)

        return f'NEAR({" ".join(escaped_terms)}, {distance})'

    def validate_query(self, query: str) -> Tuple[bool, Optional[str]]:
        """
        Validate FTS5 query syntax.

        Checks for an empty query, unbalanced double quotes, unbalanced or
        misordered parentheses (ignoring those inside quoted strings) and a
        boolean operator as the first or last word.

        Returns:
            Tuple of (is_valid, error_message)
        """
        if not query or not query.strip():
            return False, "Query cannot be empty"

        if query.count('"') % 2 != 0:
            return False, "Unmatched quotes in query"

        depth = 0
        in_quotes = False
        for char in query:
            if char == '"':
                in_quotes = not in_quotes
            elif in_quotes:
                continue
            elif char == '(':
                depth += 1
            elif char == ')':
                depth -= 1
                if depth < 0:
                    # A ")" before its "(" is unbalanced even if counts match.
                    return False, "Unmatched parentheses in query"
        if depth != 0:
            return False, "Unmatched parentheses in query"

        # Case-insensitive on purpose: the converter upper-cases and/or/not.
        words = query.split()
        edge_words = (words[0].upper(), words[-1].upper())
        for operator in self._BOOLEAN_OPERATORS:
            if operator in edge_words:
                return False, f"Operator {operator} cannot be at start or end of query"

        return True, None

    def get_query_terms(self, query: str) -> List[str]:
        """Extract individual search terms from query.

        The query is parsed first, then scanned for barewords and quoted
        strings. Operators, column names of column filters and the numeric
        distance of a NEAR group are skipped.

        Returns:
            Lower-cased terms longer than one character, without duplicates,
            in order of first appearance.
        """
        parsed = self.parse_query(query)

        # dict keys give de-duplication with a stable order.
        terms: Dict[str, None] = {}
        for match in self._LEXEME_RE.finditer(parsed):
            phrase, bareword = match.group(1), match.group(2)

            if bareword is None:
                text = phrase.replace('""', '"').strip()
            else:
                if bareword.upper() in self.fts_operators:
                    continue
                if self._COLUMN_SUFFIX_RE.match(parsed, match.end()):
                    continue
                preceding = parsed[:match.start()].rstrip()
                if bareword.isdigit() and preceding.endswith(','):
                    continue
                text = bareword

            text = text.lower()
            if len(text) > 1:
                terms[text] = None

        return list(terms)

    def suggest_corrections(self, query: str, available_terms: List[str]) -> List[str]:
        """Suggest query corrections based on available terms.

        For every query term, up to three available terms containing it are
        collected, prefix matches first.

        Returns:
            At most five unique suggestions, in the order they were found.
        """
        suggestions: Dict[str, None] = {}

        for term in self.get_query_terms(query):
            prefix_matches = []
            substring_matches = []
            for available in available_terms:
                lowered = available.lower()
                if lowered.startswith(term):
                    prefix_matches.append(available)
                elif term in lowered:
                    substring_matches.append(available)

            for candidate in (prefix_matches + substring_matches)[:3]:
                suggestions.setdefault(candidate, None)

        return list(suggestions)[:5]