markitect-main/markitect/query_paradigms/paradigms/natural_language_paradigm.py

"""
Natural Language Query Paradigm - Human-friendly query interface.
"""

import time
import re
from typing import Dict, Any, List, Optional

from ..base import BaseQueryParadigm, QueryResult


class NaturalLanguageQueryParadigm(BaseQueryParadigm):
    """Natural language query paradigm for intuitive, human-friendly queries."""

    @property
    def name(self) -> str:
        return "Natural Language"

    @property
    def description(self) -> str:
        return "Human-friendly queries that translate to appropriate technical paradigms"

    @property
    def category(self) -> str:
        return "semantic"

    @property
    def complexity(self) -> str:
        return "beginner"

    def execute(self, query: str, config: Optional[Dict[str, Any]] = None) -> QueryResult:
        """Execute a natural language query by delegating to another paradigm.

        The query is analysed by ``_analyze_and_translate`` to pick a target
        paradigm and a translated query, the target is looked up in the
        paradigm registry, and its ``execute`` method is called with the
        translated query and the unchanged ``config``.

        Args:
            query: The natural language question.
            config: Optional configuration passed through to the target paradigm.

        Returns:
            The target paradigm's result, relabelled as "Natural Language" and
            annotated with the original query, the chosen paradigm, the
            translated query and the detected intent. If anything fails, a
            ``QueryResult`` with ``success=False`` and the error message is
            returned instead; no exception propagates to the caller.
        """
        # perf_counter() is monotonic, so the measured duration cannot go
        # negative or jump when the system clock is adjusted.
        start_time = time.perf_counter()

        try:
            # Analyze the query and determine the best paradigm
            best_paradigm, translated_query = self._analyze_and_translate(query)

            if not best_paradigm or not translated_query:
                raise ValueError(f"Could not understand query: '{query}'")

            # Imported lazily, at call time rather than at module import.
            from ..registry import registry
            paradigm_instance = registry.get(best_paradigm)

            if not paradigm_instance:
                raise ValueError(f"Paradigm '{best_paradigm}' not available")

            # Execute using the target paradigm
            result = paradigm_instance.execute(translated_query, config)

            # Update result to show it came from natural language
            result.paradigm = "Natural Language"
            # A delegate may leave metadata unset; do not turn a successful
            # query into a failure just because there is nothing to update.
            if result.metadata is None:
                result.metadata = {}
            result.metadata.update({
                "original_query": query,
                "translated_to": best_paradigm,
                "translated_query": translated_query,
                "query_intent": self._detect_intent(query)
            })

            result.execution_time_ms = (time.perf_counter() - start_time) * 1000

            return result

        except Exception as e:
            execution_time = (time.perf_counter() - start_time) * 1000

            # _detect_intent() lower-cases its input; guard it so a non-string
            # query still yields a failed result instead of raising from
            # inside this handler.
            if isinstance(query, str):
                intent = self._detect_intent(query)
            else:
                intent = "general_query"

            return QueryResult(
                paradigm="Natural Language",
                query=query,
                execution_time_ms=execution_time,
                result_count=0,
                results=[],
                metadata={"query_intent": intent},
                success=False,
                error_message=str(e)
            )

    def get_examples(self) -> List[Dict[str, str]]:
        """Get example natural language queries."""
        return [
            {
                "name": "Find files",
                "description": "List and discover files in the system",
                "query": "Show me all the files"
            },
            {
                "name": "Search content",
                "description": "Search for specific content",
                "query": "Find documents about API documentation"
            },
            {
                "name": "Recent activity",
                "description": "Find recently modified content",
                "query": "What files were created recently?"
            },
            {
                "name": "File statistics",
                "description": "Get information about file sizes and counts",
                "query": "How many files do I have?"
            },
            {
                "name": "Content analysis",
                "description": "Analyze document structure",
                "query": "Show me all the headings in the documentation"
            },
            {
                "name": "Schema exploration",
                "description": "Discover schemas and their properties",
                "query": "What schemas are available?"
            },
            {
                "name": "Large files",
                "description": "Find files by size criteria",
                "query": "Which files are the largest?"
            },
            {
                "name": "Front matter search",
                "description": "Find files with metadata",
                "query": "Show files that have front matter"
            }
        ]

    def validate_query(self, query: str) -> tuple[bool, Optional[str]]:
        """Validate natural language query."""
        if not query or not query.strip():
            return False, "Query cannot be empty"

        # Natural language queries are generally always valid
        # Just check for reasonable length
        if len(query.strip()) < 3:
            return False, "Query too short - please be more specific"

        if len(query) > 500:
            return False, "Query too long - please be more concise"

        return True, None

    def get_syntax_help(self) -> str:
        """Get natural language syntax help."""
        return """Natural Language Query Help:

You can ask questions in plain English! The system will automatically
translate your query to the most appropriate technical format.

Common Patterns:

File Discovery:
  "Show me all files"
  "List the markdown files"
  "What files do I have?"

Content Search:
  "Find documents about X"
  "Search for API documentation"
  "Show files containing 'tutorial'"

File Analysis:
  "Which files are the largest?"
  "Show recent files"
  "Find files with front matter"

Structure Analysis:
  "Show me all headings"
  "Find all code blocks"
  "What links are in the files?"

Statistics:
  "How many files do I have?"
  "What's the total size?"
  "Show database statistics"

Schema Queries:
  "What schemas are available?"
  "Show schema information"

Tips:
- Be specific about what you want to find
- Use natural questions like "What..." or "Show me..."
- Mention specific content types (files, schemas, headings, etc.)
- Use time references like "recent" or "latest"

The system supports various query types and will choose the best
method to answer your question automatically.
"""

    def _analyze_and_translate(self, query: str) -> tuple[Optional[str], Optional[str]]:
        """Analyze natural language query and translate to appropriate paradigm."""
        query_lower = query.lower().strip()

        # Intent detection with paradigm mapping
        intent_patterns = [
            # Full text search patterns
            (r'find.*about|search.*for|documents.*contain|content.*with', 'fts', self._translate_to_fts),

            # File listing patterns
            (r'show.*files|list.*files|all.*files|files.*have', 'sql', self._translate_to_sql_files),

            # Statistics patterns
            (r'how many|count|total|statistics|stats', 'sql', self._translate_to_sql_stats),

            # Size/analysis patterns
            (r'largest|biggest|smallest|size|length', 'sql', self._translate_to_sql_size),

            # Recent/time patterns
            (r'recent|latest|new|created.*ago|modified', 'sql', self._translate_to_sql_recent),

            # Schema patterns
            (r'schema|schemas|json.*schema', 'graphql', self._translate_to_graphql_schemas),

            # Structure patterns (headings, links, etc.)
            (r'heading|headings|links|code.*block|structure', 'jsonpath', self._translate_to_jsonpath),

            # Front matter patterns
            (r'front.*matter|metadata|yaml.*header', 'sql', self._translate_to_sql_frontmatter),

            # General GraphQL patterns
            (r'show.*detailed|complete.*information|comprehensive', 'graphql', self._translate_to_graphql_detailed)
        ]

        # Try to match patterns
        for pattern, paradigm, translator in intent_patterns:
            if re.search(pattern, query_lower):
                translated = translator(query)
                if translated:
                    return paradigm, translated

        # Fallback: try FTS for any remaining search-like queries
        if any(word in query_lower for word in ['find', 'search', 'show', 'get', 'contains']):
            translated = self._translate_to_fts(query)
            if translated:
                return 'fts', translated

        return None, None

    def _detect_intent(self, query: str) -> str:
        """Detect the intent of the natural language query."""
        query_lower = query.lower()

        if any(word in query_lower for word in ['find', 'search', 'about', 'contain']):
            return "content_search"
        elif any(word in query_lower for word in ['list', 'show', 'all', 'files']):
            return "file_listing"
        elif any(word in query_lower for word in ['count', 'how many', 'statistics']):
            return "statistics"
        elif any(word in query_lower for word in ['recent', 'latest', 'new']):
            return "temporal_query"
        elif any(word in query_lower for word in ['large', 'big', 'small', 'size']):
            return "size_analysis"
        elif any(word in query_lower for word in ['schema', 'schemas']):
            return "schema_query"
        elif any(word in query_lower for word in ['heading', 'structure', 'link']):
            return "structure_analysis"
        else:
            return "general_query"

    def _translate_to_fts(self, query: str) -> Optional[str]:
        """Translate to full text search query."""
        query_lower = query.lower()

        # Extract search terms
        search_terms = []

        # Look for "about X" or "containing X"
        about_match = re.search(r'about\s+(.+?)(?:\s+in|\s+from|$)', query_lower)
        if about_match:
            search_terms.append(about_match.group(1))

        contain_match = re.search(r'contain(?:ing)?\s+["\']?(.+?)["\']?(?:\s+|$)', query_lower)
        if contain_match:
            search_terms.append(contain_match.group(1))

        for_match = re.search(r'(?:search\s+)?for\s+(.+?)(?:\s+in|\s+from|$)', query_lower)
        if for_match:
            search_terms.append(for_match.group(1))

        # Clean up search terms
        if search_terms:
            term = search_terms[0].strip(' "\'')
            # Remove common stop words
            stop_words = ['the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by']
            words = [w for w in term.split() if w.lower() not in stop_words and len(w) > 2]
            if words:
                return ' AND '.join(words)

        # Fallback: extract meaningful words from the entire query
        meaningful_words = []
        for word in query.split():
            word_clean = re.sub(r'[^\w]', '', word).lower()
            if (len(word_clean) > 3 and
                word_clean not in ['find', 'search', 'show', 'documents', 'files', 'about', 'containing']):
                meaningful_words.append(word_clean)

        if meaningful_words:
            return ' AND '.join(meaningful_words[:3])  # Limit to 3 terms

        return None

    def _translate_to_sql_files(self, query: str) -> Optional[str]:
        """Translate to SQL file listing query."""
        query_lower = query.lower()

        if any(phrase in query_lower for phrase in ['all files', 'show files', 'list files']):
            return "SELECT id, filename, created_at FROM markdown_files ORDER BY created_at DESC LIMIT 20"

        return "SELECT filename FROM markdown_files ORDER BY filename"

    def _translate_to_sql_stats(self, query: str) -> Optional[str]:
        """Translate to SQL statistics query."""
        query_lower = query.lower()

        if 'files' in query_lower:
            return "SELECT COUNT(*) as file_count FROM markdown_files"
        elif 'schema' in query_lower:
            return "SELECT COUNT(*) as schema_count FROM schemas"
        else:
            return "SELECT (SELECT COUNT(*) FROM markdown_files) as files, (SELECT COUNT(*) FROM schemas) as schemas"

    def _translate_to_sql_size(self, query: str) -> Optional[str]:
        """Translate to SQL size/length query."""
        query_lower = query.lower()

        if any(word in query_lower for word in ['largest', 'biggest']):
            return "SELECT filename, LENGTH(content) as size FROM markdown_files WHERE content IS NOT NULL ORDER BY size DESC LIMIT 10"
        elif any(word in query_lower for word in ['smallest', 'small']):
            return "SELECT filename, LENGTH(content) as size FROM markdown_files WHERE content IS NOT NULL ORDER BY size ASC LIMIT 10"
        else:
            return "SELECT filename, LENGTH(content) as size FROM markdown_files WHERE content IS NOT NULL ORDER BY size DESC LIMIT 10"

    def _translate_to_sql_recent(self, query: str) -> Optional[str]:
        """Translate to SQL recent files query."""
        return "SELECT filename, created_at FROM markdown_files WHERE created_at > datetime('now', '-7 days') ORDER BY created_at DESC"

    def _translate_to_sql_frontmatter(self, query: str) -> Optional[str]:
        """Translate to SQL front matter query."""
        return "SELECT filename, front_matter FROM markdown_files WHERE front_matter IS NOT NULL AND front_matter != '{}'"

    def _translate_to_graphql_schemas(self, query: str) -> Optional[str]:
        """Translate to GraphQL schema query."""
        return """query {
  schemas {
    filename
    title
    description
    schemaVersion
    propertyCount
  }
}"""

    def _translate_to_graphql_detailed(self, query: str) -> Optional[str]:
        """Translate to detailed GraphQL query."""
        query_lower = query.lower()

        if 'file' in query_lower:
            return """query {
  markdownFiles(limit: 10) {
    id
    filename
    wordCount
    lineCount
    frontMatter {
      key
      value
    }
    createdAt
  }
}"""
        else:
            return """query {
  databaseStats {
    totalFiles
    totalSchemas
    totalSizeBytes
    lastUpdated
  }
}"""

    def _translate_to_jsonpath(self, query: str) -> Optional[str]:
        """Translate to JSONPath query."""
        query_lower = query.lower()

        if 'heading' in query_lower:
            return "$..heading"
        elif 'link' in query_lower:
            return "$..link"
        elif 'code' in query_lower:
            return "$..code_block"
        elif 'image' in query_lower:
            return "$..image"
        else:
            return "$..heading"  # Default to headings

    def can_translate_from(self, other_paradigm: str) -> bool:
        """Natural language doesn't translate from other paradigms."""
        return False

    def translate_query(self, query: str, from_paradigm: str) -> Optional[str]:
        """Natural language doesn't translate from other paradigms."""
        return None