feat: implement comprehensive query paradigm zoo system (issue #62)

- Created extensible BaseQueryParadigm interface with standardized QueryResult format
- Implemented QueryParadigmRegistry for paradigm discovery and management
- Added 5 working paradigms: SQL, FTS, GraphQL, JSONPath, Natural Language
- Documented 9 additional paradigms: QBE, Batch Manipulation, Visual Query Builder, REST API, NoSQL, UNIX Pipeline, XPath/XQuery, RAG, Data Transformation
- Integrated full CLI interface: list, search, show, exec, categories commands
- Added comprehensive test suite with 23 test cases covering all components
- Auto-registration system enables easy addition of new paradigms
- Organized paradigms by category (structural, textual, semantic, visual, procedural, network) and complexity (beginner, intermediate, advanced)

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-03 23:06:57 +02:00
parent 1d13cbb355
commit 5143864a86
21 changed files with 3659 additions and 0 deletions
--- a/markitect/query_paradigms/paradigms/natural_language_paradigm.py
+++ b/markitect/query_paradigms/paradigms/natural_language_paradigm.py
@@ -0,0 +1,400 @@
+"""
+Natural Language Query Paradigm - Human-friendly query interface.
+"""
+
+import time
+import re
+from typing import Dict, Any, List, Optional
+
+from ..base import BaseQueryParadigm, QueryResult
+
+
+class NaturalLanguageQueryParadigm(BaseQueryParadigm):
+    """Natural language query paradigm for intuitive, human-friendly queries."""
+
+    @property
+    def name(self) -> str:
+        """Return the display name of this paradigm.
+
+        The same literal is stamped onto every result produced by ``execute``.
+        """
+        return "Natural Language"
+
+    @property
+    def description(self) -> str:
+        """Return a one-line, human-readable summary of this paradigm."""
+        return "Human-friendly queries that translate to appropriate technical paradigms"
+
+    @property
+    def category(self) -> str:
+        """Return the category label of this paradigm (always ``"semantic"``)."""
+        return "semantic"
+
+    @property
+    def complexity(self) -> str:
+        """Return the complexity label of this paradigm (always ``"beginner"``)."""
+        return "beginner"
+
+    def execute(self, query: str, config: Dict[str, Any] = None) -> QueryResult:
+        """Execute natural language query by translating to appropriate paradigm."""
+        start_time = time.time()
+
+        try:
+            # Analyze the query and determine the best paradigm
+            best_paradigm, translated_query = self._analyze_and_translate(query)
+
+            if not best_paradigm or not translated_query:
+                raise ValueError(f"Could not understand query: '{query}'")
+
+            # Import the appropriate paradigm
+            from ..registry import registry
+            paradigm_instance = registry.get(best_paradigm)
+
+            if not paradigm_instance:
+                raise ValueError(f"Paradigm '{best_paradigm}' not available")
+
+            # Execute using the target paradigm
+            result = paradigm_instance.execute(translated_query, config)
+
+            # Update result to show it came from natural language
+            result.paradigm = "Natural Language"
+            result.metadata.update({
+                "original_query": query,
+                "translated_to": best_paradigm,
+                "translated_query": translated_query,
+                "query_intent": self._detect_intent(query)
+            })
+
+            execution_time = (time.time() - start_time) * 1000
+            result.execution_time_ms = execution_time
+
+            return result
+
+        except Exception as e:
+            execution_time = (time.time() - start_time) * 1000
+
+            return QueryResult(
+                paradigm="Natural Language",
+                query=query,
+                execution_time_ms=execution_time,
+                result_count=0,
+                results=[],
+                metadata={"query_intent": self._detect_intent(query)},
+                success=False,
+                error_message=str(e)
+            )
+
+    def get_examples(self) -> List[Dict[str, str]]:
+        """Get example natural language queries."""
+        return [
+            {
+                "name": "Find files",
+                "description": "List and discover files in the system",
+                "query": "Show me all the files"
+            },
+            {
+                "name": "Search content",
+                "description": "Search for specific content",
+                "query": "Find documents about API documentation"
+            },
+            {
+                "name": "Recent activity",
+                "description": "Find recently modified content",
+                "query": "What files were created recently?"
+            },
+            {
+                "name": "File statistics",
+                "description": "Get information about file sizes and counts",
+                "query": "How many files do I have?"
+            },
+            {
+                "name": "Content analysis",
+                "description": "Analyze document structure",
+                "query": "Show me all the headings in the documentation"
+            },
+            {
+                "name": "Schema exploration",
+                "description": "Discover schemas and their properties",
+                "query": "What schemas are available?"
+            },
+            {
+                "name": "Large files",
+                "description": "Find files by size criteria",
+                "query": "Which files are the largest?"
+            },
+            {
+                "name": "Front matter search",
+                "description": "Find files with metadata",
+                "query": "Show files that have front matter"
+            }
+        ]
+
+    def validate_query(self, query: str) -> tuple[bool, Optional[str]]:
+        """Validate natural language query."""
+        if not query or not query.strip():
+            return False, "Query cannot be empty"
+
+        # Natural language queries are generally always valid
+        # Just check for reasonable length
+        if len(query.strip()) < 3:
+            return False, "Query too short - please be more specific"
+
+        if len(query) > 500:
+            return False, "Query too long - please be more concise"
+
+        return True, None
+
+    def get_syntax_help(self) -> str:
+        """Get natural language syntax help."""
+        return """Natural Language Query Help:
+
+You can ask questions in plain English! The system will automatically
+translate your query to the most appropriate technical format.
+
+Common Patterns:
+
+File Discovery:
+  "Show me all files"
+  "List the markdown files"
+  "What files do I have?"
+
+Content Search:
+  "Find documents about X"
+  "Search for API documentation"
+  "Show files containing 'tutorial'"
+
+File Analysis:
+  "Which files are the largest?"
+  "Show recent files"
+  "Find files with front matter"
+
+Structure Analysis:
+  "Show me all headings"
+  "Find all code blocks"
+  "What links are in the files?"
+
+Statistics:
+  "How many files do I have?"
+  "What's the total size?"
+  "Show database statistics"
+
+Schema Queries:
+  "What schemas are available?"
+  "Show schema information"
+
+Tips:
+- Be specific about what you want to find
+- Use natural questions like "What..." or "Show me..."
+- Mention specific content types (files, schemas, headings, etc.)
+- Use time references like "recent" or "latest"
+
+The system supports various query types and will choose the best
+method to answer your question automatically.
+"""
+
+    def _analyze_and_translate(self, query: str) -> tuple[Optional[str], Optional[str]]:
+        """Analyze natural language query and translate to appropriate paradigm."""
+        query_lower = query.lower().strip()
+
+        # Intent detection with paradigm mapping
+        intent_patterns = [
+            # Full text search patterns
+            (r'find.*about|search.*for|documents.*contain|content.*with', 'fts', self._translate_to_fts),
+
+            # File listing patterns
+            (r'show.*files|list.*files|all.*files|files.*have', 'sql', self._translate_to_sql_files),
+
+            # Statistics patterns
+            (r'how many|count|total|statistics|stats', 'sql', self._translate_to_sql_stats),
+
+            # Size/analysis patterns
+            (r'largest|biggest|smallest|size|length', 'sql', self._translate_to_sql_size),
+
+            # Recent/time patterns
+            (r'recent|latest|new|created.*ago|modified', 'sql', self._translate_to_sql_recent),
+
+            # Schema patterns
+            (r'schema|schemas|json.*schema', 'graphql', self._translate_to_graphql_schemas),
+
+            # Structure patterns (headings, links, etc.)
+            (r'heading|headings|links|code.*block|structure', 'jsonpath', self._translate_to_jsonpath),
+
+            # Front matter patterns
+            (r'front.*matter|metadata|yaml.*header', 'sql', self._translate_to_sql_frontmatter),
+
+            # General GraphQL patterns
+            (r'show.*detailed|complete.*information|comprehensive', 'graphql', self._translate_to_graphql_detailed)
+        ]
+
+        # Try to match patterns
+        for pattern, paradigm, translator in intent_patterns:
+            if re.search(pattern, query_lower):
+                translated = translator(query)
+                if translated:
+                    return paradigm, translated
+
+        # Fallback: try FTS for any remaining search-like queries
+        if any(word in query_lower for word in ['find', 'search', 'show', 'get', 'contains']):
+            translated = self._translate_to_fts(query)
+            if translated:
+                return 'fts', translated
+
+        return None, None
+
+    def _detect_intent(self, query: str) -> str:
+        """Detect the intent of the natural language query."""
+        query_lower = query.lower()
+
+        if any(word in query_lower for word in ['find', 'search', 'about', 'contain']):
+            return "content_search"
+        elif any(word in query_lower for word in ['list', 'show', 'all', 'files']):
+            return "file_listing"
+        elif any(word in query_lower for word in ['count', 'how many', 'statistics']):
+            return "statistics"
+        elif any(word in query_lower for word in ['recent', 'latest', 'new']):
+            return "temporal_query"
+        elif any(word in query_lower for word in ['large', 'big', 'small', 'size']):
+            return "size_analysis"
+        elif any(word in query_lower for word in ['schema', 'schemas']):
+            return "schema_query"
+        elif any(word in query_lower for word in ['heading', 'structure', 'link']):
+            return "structure_analysis"
+        else:
+            return "general_query"
+
+    def _translate_to_fts(self, query: str) -> Optional[str]:
+        """Translate to full text search query."""
+        query_lower = query.lower()
+
+        # Extract search terms
+        search_terms = []
+
+        # Look for "about X" or "containing X"
+        about_match = re.search(r'about\s+(.+?)(?:\s+in|\s+from|$)', query_lower)
+        if about_match:
+            search_terms.append(about_match.group(1))
+
+        contain_match = re.search(r'contain(?:ing)?\s+["\']?(.+?)["\']?(?:\s+|$)', query_lower)
+        if contain_match:
+            search_terms.append(contain_match.group(1))
+
+        for_match = re.search(r'(?:search\s+)?for\s+(.+?)(?:\s+in|\s+from|$)', query_lower)
+        if for_match:
+            search_terms.append(for_match.group(1))
+
+        # Clean up search terms
+        if search_terms:
+            term = search_terms[0].strip(' "\'')
+            # Remove common stop words
+            stop_words = ['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by']
+            words = [w for w in term.split() if w.lower() not in stop_words and len(w) > 2]
+            if words:
+                return ' AND '.join(words)
+
+        # Fallback: extract meaningful words from the entire query
+        meaningful_words = []
+        for word in query.split():
+            word_clean = re.sub(r'[^\w]', '', word).lower()
+            if (len(word_clean) > 3 and
+                word_clean not in ['find', 'search', 'show', 'documents', 'files', 'about', 'containing']):
+                meaningful_words.append(word_clean)
+
+        if meaningful_words:
+            return ' AND '.join(meaningful_words[:3])  # Limit to 3 terms
+
+        return None
+
+    def _translate_to_sql_files(self, query: str) -> Optional[str]:
+        """Translate to SQL file listing query."""
+        query_lower = query.lower()
+
+        if any(phrase in query_lower for phrase in ['all files', 'show files', 'list files']):
+            return "SELECT id, filename, created_at FROM markdown_files ORDER BY created_at DESC LIMIT 20"
+
+        return "SELECT filename FROM markdown_files ORDER BY filename"
+
+    def _translate_to_sql_stats(self, query: str) -> Optional[str]:
+        """Translate to SQL statistics query."""
+        query_lower = query.lower()
+
+        if 'files' in query_lower:
+            return "SELECT COUNT(*) as file_count FROM markdown_files"
+        elif 'schema' in query_lower:
+            return "SELECT COUNT(*) as schema_count FROM schemas"
+        else:
+            return "SELECT (SELECT COUNT(*) FROM markdown_files) as files, (SELECT COUNT(*) FROM schemas) as schemas"
+
+    def _translate_to_sql_size(self, query: str) -> Optional[str]:
+        """Translate to SQL size/length query."""
+        query_lower = query.lower()
+
+        if any(word in query_lower for word in ['largest', 'biggest']):
+            return "SELECT filename, LENGTH(content) as size FROM markdown_files WHERE content IS NOT NULL ORDER BY size DESC LIMIT 10"
+        elif any(word in query_lower for word in ['smallest', 'small']):
+            return "SELECT filename, LENGTH(content) as size FROM markdown_files WHERE content IS NOT NULL ORDER BY size ASC LIMIT 10"
+        else:
+            return "SELECT filename, LENGTH(content) as size FROM markdown_files WHERE content IS NOT NULL ORDER BY size DESC LIMIT 10"
+
+    def _translate_to_sql_recent(self, query: str) -> Optional[str]:
+        """Translate to SQL recent files query."""
+        return "SELECT filename, created_at FROM markdown_files WHERE created_at > datetime('now', '-7 days') ORDER BY created_at DESC"
+
+    def _translate_to_sql_frontmatter(self, query: str) -> Optional[str]:
+        """Translate to SQL front matter query."""
+        return "SELECT filename, front_matter FROM markdown_files WHERE front_matter IS NOT NULL AND front_matter != '{}'"
+
+    def _translate_to_graphql_schemas(self, query: str) -> Optional[str]:
+        """Translate to GraphQL schema query."""
+        return """query {
+  schemas {
+    filename
+    title
+    description
+    schemaVersion
+    propertyCount
+  }
+}"""
+
+    def _translate_to_graphql_detailed(self, query: str) -> Optional[str]:
+        """Translate to detailed GraphQL query."""
+        query_lower = query.lower()
+
+        if 'file' in query_lower:
+            return """query {
+  markdownFiles(limit: 10) {
+    id
+    filename
+    wordCount
+    lineCount
+    frontMatter {
+      key
+      value
+    }
+    createdAt
+  }
+}"""
+        else:
+            return """query {
+  databaseStats {
+    totalFiles
+    totalSchemas
+    totalSizeBytes
+    lastUpdated
+  }
+}"""
+
+    def _translate_to_jsonpath(self, query: str) -> Optional[str]:
+        """Translate to JSONPath query."""
+        query_lower = query.lower()
+
+        if 'heading' in query_lower:
+            return "$..heading"
+        elif 'link' in query_lower:
+            return "$..link"
+        elif 'code' in query_lower:
+            return "$..code_block"
+        elif 'image' in query_lower:
+            return "$..image"
+        else:
+            return "$..heading"  # Default to headings
+
+    def can_translate_from(self, other_paradigm: str) -> bool:
+        """Report whether queries of another paradigm can be translated here.
+
+        Args:
+            other_paradigm: Name of the source paradigm; ignored.
+
+        Returns:
+            Always ``False``: this paradigm only translates from natural
+            language into other paradigms, never the reverse.
+        """
+        return False
+
+    def translate_query(self, query: str, from_paradigm: str) -> Optional[str]:
+        """Translate a query from another paradigm into natural language.
+
+        Args:
+            query: Query text in the source paradigm; ignored.
+            from_paradigm: Name of the source paradigm; ignored.
+
+        Returns:
+            Always ``None``: no reverse translation is implemented.
+        """
+        return None