""" Template parser for extracting and analyzing template variables. This module provides the core parsing functionality for the MarkiTect template engine, focusing on variable extraction and template syntax analysis. """ import re from typing import List, Set, Optional, Dict, Any from dataclasses import dataclass class TemplateParsingError(Exception): """Base exception for template parsing errors.""" def __init__(self, message: str, position: Optional[int] = None, context: Optional[str] = None): self.position = position self.context = context super().__init__(message) class InvalidVariableSyntaxError(TemplateParsingError): """Raised when variable syntax is invalid.""" pass @dataclass class TemplateAnalysis: """Structured template analysis results.""" total_variables: int unique_variables: int variables: List[str] root_variables: List[str] nested_variables: List[str] max_nesting_depth: int syntax_errors: List[str] class TemplateParser: """Parser for template variables and syntax analysis.""" # Regular expression to match template variables {{variable}} or {{object.property}} # Supports unicode characters in variable names VARIABLE_PATTERN = re.compile(r'\{\{\s*([a-zA-Z_\u00a0-\uffff][a-zA-Z0-9_\u00a0-\uffff]*(?:\.[a-zA-Z_\u00a0-\uffff][a-zA-Z0-9_\u00a0-\uffff]*)*)\s*\}\}', re.UNICODE) def __init__(self): """Initialize the template parser.""" self._validation_pattern = None def extract_variables(self, template_text: str) -> List[str]: """ Extract all template variables from the given text. Args: template_text: The template content to parse Returns: List of variable names found in the template (without duplicates) """ if not template_text: return [] # Find all matches using the regex pattern matches = self.VARIABLE_PATTERN.findall(template_text) # Use dict.fromkeys() for O(1) deduplication while preserving order return list(dict.fromkeys(matches)) def get_variable_set(self, template_text: str) -> Set[str]: """ Get a set of unique variables from the template. Args: template_text: The template content to parse Returns: Set of unique variable names """ return set(self.extract_variables(template_text)) @property def _cached_validation_pattern(self) -> re.Pattern: """Lazy-loaded validation pattern to avoid recompilation.""" if self._validation_pattern is None: self._validation_pattern = re.compile( r'\{\{\s*[a-zA-Z_\u00a0-\uffff][a-zA-Z0-9_\u00a0-\uffff]*(?:\.[a-zA-Z_\u00a0-\uffff][a-zA-Z0-9_\u00a0-\uffff]*)*\s*\}\}', re.UNICODE ) return self._validation_pattern def validate_variable_syntax(self, template_text: str) -> List[str]: """ Validate template variable syntax and return any errors. Args: template_text: The template content to validate Returns: List of error messages for invalid syntax """ errors = [] errors.extend(self._check_brace_matching(template_text)) errors.extend(self._check_variable_format(template_text)) return errors def _check_brace_matching(self, template_text: str) -> List[str]: """Check for unmatched braces.""" errors = [] # Look for potential template variable patterns (single or double braces) potential_vars = re.findall(r'\{+[^}]*\}*', template_text) for potential in potential_vars: if potential.count('{') != potential.count('}'): errors.append(f"Unmatched braces in: {potential}") return errors def _check_variable_format(self, template_text: str) -> List[str]: """Check variable name format compliance.""" errors = [] # Only check patterns that look like they should be template variables # Look for double-brace patterns specifically potential_vars = re.findall(r'\{\{[^}]*\}\}?', template_text) for potential in potential_vars: if not self._cached_validation_pattern.match(potential): if '{{' in potential and '}}' in potential: errors.append(f"Invalid variable syntax: {potential}") return errors def is_valid_variable_name(self, variable_name: str) -> bool: """ Check if a variable name follows valid naming conventions. Args: variable_name: The variable name to validate Returns: True if the variable name is valid, False otherwise """ if not variable_name: return False # Split on dots for nested property access parts = variable_name.split('.') for part in parts: # Each part must be a valid identifier (supporting unicode) if not re.match(r'^[a-zA-Z_\u00a0-\uffff][a-zA-Z0-9_\u00a0-\uffff]*$', part, re.UNICODE): return False return True def get_nested_depth(self, variable_name: str) -> int: """ Get the nesting depth of a variable (number of dots + 1). Args: variable_name: The variable name to analyze Returns: Depth of nesting (1 for simple variables, >1 for nested) """ return len(variable_name.split('.')) def get_root_variables(self, template_text: str) -> Set[str]: """ Get only the root-level variables (without nested properties). Args: template_text: The template content to parse Returns: Set of root variable names """ variables = self.get_variable_set(template_text) root_vars = set() for var in variables: root = var.split('.')[0] root_vars.add(root) return root_vars def analyze_template(self, template_text: str) -> TemplateAnalysis: """ Perform comprehensive analysis of a template. Args: template_text: The template content to analyze Returns: TemplateAnalysis containing structured analysis results """ variables = self.extract_variables(template_text) return TemplateAnalysis( total_variables=len(variables), unique_variables=len(set(variables)), variables=variables, root_variables=list(self.get_root_variables(template_text)), nested_variables=[var for var in variables if '.' in var], max_nesting_depth=max([self.get_nested_depth(var) for var in variables]) if variables else 0, syntax_errors=self.validate_variable_syntax(template_text) )