Files
markitect-main/markitect/query_paradigms/paradigms/natural_language_paradigm.py
tegwick 5143864a86 feat: implement comprehensive query paradigm zoo system (issue #62)
- Created extensible BaseQueryParadigm interface with standardized QueryResult format
- Implemented QueryParadigmRegistry for paradigm discovery and management
- Added 5 working paradigms: SQL, FTS, GraphQL, JSONPath, Natural Language
- Documented 9 additional paradigms: QBE, Batch Manipulation, Visual Query Builder, REST API, NoSQL, UNIX Pipeline, XPath/XQuery, RAG, Data Transformation
- Integrated full CLI interface: list, search, show, exec, categories commands
- Added comprehensive test suite with 23 test cases covering all components
- Auto-registration system enables easy addition of new paradigms
- Organized paradigms by category (structural, textual, semantic, visual, procedural, network) and complexity (beginner, intermediate, advanced)

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-03 23:06:57 +02:00

400 lines
14 KiB
Python

"""
Natural Language Query Paradigm - Human-friendly query interface.
"""
import time
import re
from typing import Dict, Any, List, Optional
from ..base import BaseQueryParadigm, QueryResult
class NaturalLanguageQueryParadigm(BaseQueryParadigm):
    """Natural language query paradigm for intuitive, human-friendly queries.

    A plain-English query is matched against an ordered table of keyword
    patterns. The first matching entry selects a target paradigm key
    ('fts', 'sql', 'graphql' or 'jsonpath') and a translator method that
    produces the technical query, which is then executed through the
    paradigm registry.
    """

    _DISPLAY_NAME = "Natural Language"

    # Length limits applied (to the stripped query) by validate_query().
    _MIN_QUERY_LENGTH = 3
    _MAX_QUERY_LENGTH = 500

    # A captured search phrase ends at a standalone "in"/"from" or at the end
    # of the query. The word boundary keeps "install" or "frontend" from being
    # mistaken for the prepositions.
    _TERM_END = r'(?:\s+(?:in|from)\b|$)'

    # Phrases that introduce search terms, in priority order.
    _SEARCH_TERM_PATTERNS = (
        re.compile(r'\babout\s+(.+?)' + _TERM_END),
        re.compile(r'\bcontain(?:s|ed|ing)?\s+["\']?(.+?)["\']?' + _TERM_END),
        re.compile(r'\b(?:search\s+)?for\s+(.+?)' + _TERM_END),
    )

    # Words dropped from an extracted search phrase.
    _STOP_WORDS = frozenset({
        'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
        'of', 'with', 'by',
    })

    # Words ignored when falling back to "any meaningful word in the query".
    _FALLBACK_NOISE = frozenset({
        'find', 'search', 'show', 'documents', 'files', 'about', 'containing',
    })

    # Verbs that make an otherwise unmatched query worth trying as a search.
    _FALLBACK_TRIGGERS = ('find', 'search', 'show', 'get', 'contains')

    # (pattern, paradigm key, translator method name), tried top to bottom.
    # The generic file-listing entry is deliberately LAST: queries such as
    # "How many files do I have?" or "Show files that have front matter"
    # mention files but carry a more specific intent that must win.
    # Short keywords are anchored with \b so that "account" is not read as
    # "count" and "news" is not read as "new".
    _INTENT_ROUTES = (
        # Full text search
        (re.compile(r'find.*about|search.*for|\bcontain(?:s|ed|ing)?\b|content.*with'),
         'fts', '_translate_to_fts'),
        # Statistics
        (re.compile(r'\b(?:how many|counts?|total|statistics|stats)\b'),
         'sql', '_translate_to_sql_stats'),
        # Size / length
        (re.compile(r'\b(?:largest|biggest|smallest|sizes?|length)\b'),
         'sql', '_translate_to_sql_size'),
        # Recency
        (re.compile(r'\b(?:recent(?:ly)?|latest|new(?:est)?|modified)\b|created.*ago'),
         'sql', '_translate_to_sql_recent'),
        # Schemas
        (re.compile(r'schema'),
         'graphql', '_translate_to_graphql_schemas'),
        # Document structure (headings, links, code blocks)
        (re.compile(r'heading|links|code.*block|structure'),
         'jsonpath', '_translate_to_jsonpath'),
        # Front matter
        (re.compile(r'front.*matter|metadata|yaml.*header'),
         'sql', '_translate_to_sql_frontmatter'),
        # Detailed GraphQL views
        (re.compile(r'show.*detailed|complete.*information|comprehensive'),
         'graphql', '_translate_to_graphql_detailed'),
        # Plain file listing (generic, therefore tried last)
        (re.compile(r'show.*files|list.*files|all.*files|files.*have'),
         'sql', '_translate_to_sql_files'),
    )

    # (intent label, pattern), tried top to bottom; the generic listing label
    # comes last for the same reason as in _INTENT_ROUTES.
    _INTENT_LABELS = (
        ("content_search", re.compile(r'\b(?:find|search|about|contain)')),
        ("statistics", re.compile(r'\b(?:count|how many|statistics)')),
        ("temporal_query", re.compile(r'\b(?:recent|latest)|\bnew(?:er|est|ly)?\b')),
        ("size_analysis", re.compile(r'\b(?:large|big|small|size)')),
        ("schema_query", re.compile(r'schema')),
        ("structure_analysis", re.compile(r'\b(?:heading|structure|link)')),
        ("file_listing", re.compile(r'\b(?:lists?|shows?|all|files?)\b')),
    )

    # Keyword -> JSONPath expression, checked in this order.
    _JSONPATH_TARGETS = (
        ('heading', "$..heading"),
        ('link', "$..link"),
        ('code', "$..code_block"),
        ('image', "$..image"),
    )

    @property
    def name(self) -> str:
        """Display name of this paradigm."""
        return self._DISPLAY_NAME

    @property
    def description(self) -> str:
        """One-line summary of what this paradigm does."""
        return "Human-friendly queries that translate to appropriate technical paradigms"

    @property
    def category(self) -> str:
        """Category label of this paradigm."""
        return "semantic"

    @property
    def complexity(self) -> str:
        """Complexity label of this paradigm."""
        return "beginner"

    def execute(self, query: str, config: Optional[Dict[str, Any]] = None) -> QueryResult:
        """Translate a natural language query and run it on the target paradigm.

        Args:
            query: The plain-English query.
            config: Optional configuration, passed through unchanged to the
                target paradigm's ``execute``.

        Returns:
            The target paradigm's result, relabelled as "Natural Language" and
            annotated with the original query, the chosen paradigm, the
            translated query and the detected intent. Any failure (query not
            understood, paradigm unavailable, error in the target paradigm)
            is reported as a result with ``success=False`` and the error text
            in ``error_message``; no exception is raised for those cases.
        """
        # perf_counter is monotonic, so the elapsed time cannot be skewed by
        # system clock adjustments.
        started = time.perf_counter()
        intent = self._detect_intent(query)
        try:
            # Analyze the query and determine the best paradigm
            best_paradigm, translated_query = self._analyze_and_translate(query)
            if not best_paradigm or not translated_query:
                raise ValueError(f"Could not understand query: '{query}'")

            # Imported at call time rather than at module load.
            # NOTE(review): presumably this avoids a circular import with the
            # registry module - confirm before moving it to the top of the file.
            from ..registry import registry
            paradigm_instance = registry.get(best_paradigm)
            if not paradigm_instance:
                raise ValueError(f"Paradigm '{best_paradigm}' not available")

            # Execute using the target paradigm
            result = paradigm_instance.execute(translated_query, config)

            # Update result to show it came from natural language
            result.paradigm = self._DISPLAY_NAME
            if getattr(result, "metadata", None) is None:
                # A paradigm that left metadata unset must not turn a
                # successful query into a failure.
                result.metadata = {}
            result.metadata.update({
                "original_query": query,
                "translated_to": best_paradigm,
                "translated_query": translated_query,
                "query_intent": intent
            })
            result.execution_time_ms = (time.perf_counter() - started) * 1000
            return result
        except Exception as e:
            # Boundary of the paradigm: every failure is reported through the
            # result object instead of propagating to the caller.
            return QueryResult(
                paradigm=self._DISPLAY_NAME,
                query=query,
                execution_time_ms=(time.perf_counter() - started) * 1000,
                result_count=0,
                results=[],
                metadata={"query_intent": intent},
                success=False,
                error_message=str(e)
            )

    def get_examples(self) -> List[Dict[str, str]]:
        """Return example queries, each with a name, description and query text."""
        return [
            {
                "name": "Find files",
                "description": "List and discover files in the system",
                "query": "Show me all the files"
            },
            {
                "name": "Search content",
                "description": "Search for specific content",
                "query": "Find documents about API documentation"
            },
            {
                "name": "Recent activity",
                "description": "Find recently modified content",
                "query": "What files were created recently?"
            },
            {
                "name": "File statistics",
                "description": "Get information about file sizes and counts",
                "query": "How many files do I have?"
            },
            {
                "name": "Content analysis",
                "description": "Analyze document structure",
                "query": "Show me all the headings in the documentation"
            },
            {
                "name": "Schema exploration",
                "description": "Discover schemas and their properties",
                "query": "What schemas are available?"
            },
            {
                "name": "Large files",
                "description": "Find files by size criteria",
                "query": "Which files are the largest?"
            },
            {
                "name": "Front matter search",
                "description": "Find files with metadata",
                "query": "Show files that have front matter"
            }
        ]

    def validate_query(self, query: str) -> tuple[bool, Optional[str]]:
        """Check that a query is non-empty and of reasonable length.

        Both length limits are applied to the query with surrounding
        whitespace removed.

        Returns:
            ``(True, None)`` when the query is acceptable, otherwise
            ``(False, reason)``.
        """
        stripped = query.strip() if query else ""
        if not stripped:
            return False, "Query cannot be empty"
        # Natural language queries are generally always valid;
        # just check for reasonable length.
        if len(stripped) < self._MIN_QUERY_LENGTH:
            return False, "Query too short - please be more specific"
        if len(stripped) > self._MAX_QUERY_LENGTH:
            return False, "Query too long - please be more concise"
        return True, None

    def get_syntax_help(self) -> str:
        """Return the plain-text usage guide for natural language queries."""
        # The text is returned verbatim, so its lines are not indented here.
        return """Natural Language Query Help:
You can ask questions in plain English! The system will automatically
translate your query to the most appropriate technical format.
Common Patterns:
File Discovery:
"Show me all files"
"List the markdown files"
"What files do I have?"
Content Search:
"Find documents about X"
"Search for API documentation"
"Show files containing 'tutorial'"
File Analysis:
"Which files are the largest?"
"Show recent files"
"Find files with front matter"
Structure Analysis:
"Show me all headings"
"Find all code blocks"
"What links are in the files?"
Statistics:
"How many files do I have?"
"What's the total size?"
"Show database statistics"
Schema Queries:
"What schemas are available?"
"Show schema information"
Tips:
- Be specific about what you want to find
- Use natural questions like "What..." or "Show me..."
- Mention specific content types (files, schemas, headings, etc.)
- Use time references like "recent" or "latest"
The system supports various query types and will choose the best
method to answer your question automatically.
"""

    def _analyze_and_translate(self, query: str) -> tuple[Optional[str], Optional[str]]:
        """Pick a target paradigm for the query and translate the query to it.

        Returns:
            ``(paradigm_key, translated_query)`` for the first entry of
            ``_INTENT_ROUTES`` whose pattern matches and whose translator
            produces a query, or ``(None, None)`` when nothing applies.
        """
        query_lower = query.lower().strip()

        for pattern, paradigm, translator_name in self._INTENT_ROUTES:
            if pattern.search(query_lower):
                # Resolved by name so that subclasses can override translators.
                translated = getattr(self, translator_name)(query)
                if translated:
                    return paradigm, translated

        # Fallback: try FTS for any remaining search-like queries
        if any(word in query_lower for word in self._FALLBACK_TRIGGERS):
            translated = self._translate_to_fts(query)
            if translated:
                return 'fts', translated
        return None, None

    def _detect_intent(self, query: str) -> str:
        """Classify the query into a coarse intent label.

        Returns the label of the first matching entry in ``_INTENT_LABELS``,
        or "general_query" when nothing matches or the query is not a string.
        """
        if not isinstance(query, str):
            return "general_query"
        query_lower = query.lower()
        for label, pattern in self._INTENT_LABELS:
            if pattern.search(query_lower):
                return label
        return "general_query"

    def _translate_to_fts(self, query: str) -> Optional[str]:
        """Build a full text search expression ("word AND word ...").

        The words following "about", "contain(s/ing)" or "for" are preferred;
        if no such phrase yields usable words, up to three meaningful words
        from the whole query are used instead.

        Returns:
            The search expression, or None when no usable word is found.
        """
        query_lower = query.lower()

        for pattern in self._SEARCH_TERM_PATTERNS:
            match = pattern.search(query_lower)
            if not match:
                continue
            # \w+ drops quotes and trailing punctuation ("documentation?"),
            # which must not leak into the search expression.
            words = [
                word for word in re.findall(r'\w+', match.group(1))
                if word not in self._STOP_WORDS and len(word) > 2
            ]
            if words:
                return ' AND '.join(words)

        # Fallback: extract meaningful words from the entire query
        meaningful_words = []
        for word in query.split():
            word_clean = re.sub(r'[^\w]', '', word).lower()
            if len(word_clean) > 3 and word_clean not in self._FALLBACK_NOISE:
                meaningful_words.append(word_clean)
        if meaningful_words:
            return ' AND '.join(meaningful_words[:3])  # Limit to 3 terms
        return None

    def _translate_to_sql_files(self, query: str) -> Optional[str]:
        """Build the SQL file listing query.

        An explicit "all files" / "show files" / "list files" wording yields
        the 20 most recently created files; anything else yields every
        filename in alphabetical order.
        """
        query_lower = query.lower()
        if any(phrase in query_lower for phrase in ('all files', 'show files', 'list files')):
            return "SELECT id, filename, created_at FROM markdown_files ORDER BY created_at DESC LIMIT 20"
        return "SELECT filename FROM markdown_files ORDER BY filename"

    def _translate_to_sql_stats(self, query: str) -> Optional[str]:
        """Build the SQL count query for files, schemas, or both."""
        query_lower = query.lower()
        if 'files' in query_lower:
            return "SELECT COUNT(*) as file_count FROM markdown_files"
        if 'schema' in query_lower:
            return "SELECT COUNT(*) as schema_count FROM schemas"
        return "SELECT (SELECT COUNT(*) FROM markdown_files) as files, (SELECT COUNT(*) FROM schemas) as schemas"

    def _translate_to_sql_size(self, query: str) -> Optional[str]:
        """Build the SQL query ranking files by content length.

        Sorts ascending only when the query asks for small files and does not
        also ask for the largest/biggest; every other wording sorts descending.
        """
        query_lower = query.lower()
        wants_largest = 'largest' in query_lower or 'biggest' in query_lower
        if 'small' in query_lower and not wants_largest:
            return "SELECT filename, LENGTH(content) as size FROM markdown_files WHERE content IS NOT NULL ORDER BY size ASC LIMIT 10"
        return "SELECT filename, LENGTH(content) as size FROM markdown_files WHERE content IS NOT NULL ORDER BY size DESC LIMIT 10"

    def _translate_to_sql_recent(self, query: str) -> Optional[str]:
        """Build the SQL query for files created within the last 7 days."""
        return "SELECT filename, created_at FROM markdown_files WHERE created_at > datetime('now', '-7 days') ORDER BY created_at DESC"

    def _translate_to_sql_frontmatter(self, query: str) -> Optional[str]:
        """Build the SQL query for files with non-empty front matter."""
        return "SELECT filename, front_matter FROM markdown_files WHERE front_matter IS NOT NULL AND front_matter != '{}'"

    def _translate_to_graphql_schemas(self, query: str) -> Optional[str]:
        """Build the GraphQL query listing all schemas."""
        return """query {
schemas {
filename
title
description
schemaVersion
propertyCount
}
}"""

    def _translate_to_graphql_detailed(self, query: str) -> Optional[str]:
        """Build a detailed GraphQL query.

        Queries mentioning "file" get per-file details; all others get the
        database statistics.
        """
        if 'file' in query.lower():
            return """query {
markdownFiles(limit: 10) {
id
filename
wordCount
lineCount
frontMatter {
key
value
}
createdAt
}
}"""
        return """query {
databaseStats {
totalFiles
totalSchemas
totalSizeBytes
lastUpdated
}
}"""

    def _translate_to_jsonpath(self, query: str) -> Optional[str]:
        """Build the JSONPath expression for the structural element asked for."""
        query_lower = query.lower()
        for keyword, expression in self._JSONPATH_TARGETS:
            if keyword in query_lower:
                return expression
        return "$..heading"  # Default to headings

    def can_translate_from(self, other_paradigm: str) -> bool:
        """Natural language doesn't translate from other paradigms."""
        return False

    def translate_query(self, query: str, from_paradigm: str) -> Optional[str]:
        """Natural language doesn't translate from other paradigms."""
        return None