"""
Frontmatter parser for extracting and manipulating YAML/JSON/TOML frontmatter.
"""

import json
import re
from typing import Any, Dict, List, Optional

import toml
import yaml

from .stats import FrontmatterStats


class FrontmatterParser:
    """Parser for frontmatter in MarkdownMatters documents.

    A frontmatter block is a ``---`` fenced section at the very start of the
    document (optionally preceded by a BOM and/or blank lines). The text
    between the fences is parsed as TOML, JSON or YAML.
    """

    # One shared definition of "the frontmatter block", used by every method
    # that locates, replaces or removes it.
    #   lead : optional BOM and blank lines before the opening fence
    #   body : everything between the fences (may be empty)
    # The closing fence must sit on its own line and may end at EOF without a
    # trailing newline. Anchoring with \A keeps `---` horizontal rules in the
    # document body from being mistaken for frontmatter.
    _FRONTMATTER_RE = re.compile(
        r'\A(?P<lead>\ufeff?(?:[ \t]*\r?\n)*)'
        r'---[ \t]*\r?\n'
        r'(?P<body>.*?)'
        r'^---[ \t]*(?:\r?\n|\Z)',
        flags=re.DOTALL | re.MULTILINE,
    )

    # Whitespace-only lines at the start of a string.
    _LEADING_BLANK_LINES_RE = re.compile(r'\A(?:[ \t]*\r?\n)+')

    def extract_frontmatter(self, text: str) -> Dict[str, Any]:
        """
        Extract frontmatter from markdown text.

        Args:
            text: Full markdown document text

        Returns:
            Dictionary containing frontmatter data; empty when the document
            has no frontmatter block or the block cannot be parsed into a
            mapping.
        """
        raw = self._extract_frontmatter_content(text)
        if not raw:
            return {}
        data, _ = self._parse_frontmatter_content(raw)
        return data

    def set_frontmatter_value(self, text: str, key: str, value: Any) -> str:
        """
        Set a frontmatter value in the document.

        The frontmatter block is re-serialized as YAML, whatever format the
        original block used.

        Args:
            text: Full markdown document text
            key: Frontmatter key (supports dot notation for nested)
            value: Value to set

        Returns:
            Updated document text
        """
        frontmatter = self.extract_frontmatter(text)
        # A key without dots is simply a path of length one.
        self._set_nested_value(frontmatter, key, value)
        return self._update_frontmatter_in_text(text, frontmatter)

    def get_frontmatter_keys(self, text: str, include_nested: bool = False) -> List[str]:
        """
        Get list of frontmatter keys.

        Args:
            text: Full markdown document text
            include_nested: Include nested keys with dot notation

        Returns:
            List of frontmatter keys
        """
        frontmatter = self.extract_frontmatter(text)
        if include_nested:
            return self._get_all_keys_recursive(frontmatter)
        return list(frontmatter.keys())

    def get_nested_value(self, frontmatter: Dict[str, Any], key: str) -> Any:
        """
        Get nested value using dot notation.

        Args:
            frontmatter: Frontmatter dictionary
            key: Key with dot notation (e.g., "nested.category")

        Returns:
            Value or None if not found
        """
        current: Any = frontmatter
        for part in key.split('.'):
            if not isinstance(current, dict) or part not in current:
                return None
            current = current[part]
        return current

    def calculate_frontmatter_stats(self, text: str) -> FrontmatterStats:
        """
        Calculate statistics for frontmatter.

        Args:
            text: Full markdown document text

        Returns:
            FrontmatterStats object
        """
        frontmatter = self.extract_frontmatter(text)

        if not frontmatter or not isinstance(frontmatter, dict):
            return FrontmatterStats(
                has_frontmatter=False,
                total_fields=0,
                nested_fields=0,
                format=None,
                field_types={},
            )

        return FrontmatterStats(
            has_frontmatter=True,
            total_fields=len(frontmatter),
            nested_fields=self._count_nested_fields(frontmatter),
            format=self._detect_frontmatter_format(text),
            field_types=self._analyze_field_types(frontmatter),
        )

    def _extract_frontmatter_content(self, text: str) -> Optional[str]:
        """Extract the raw frontmatter content between delimiters.

        Returns:
            The stripped text between the fences ("" for an empty block), or
            None when the document does not start with a frontmatter block.
        """
        match = self._FRONTMATTER_RE.match(text)
        if match is None:
            return None
        return match.group('body').strip()

    def _parse_frontmatter_content(self, raw: str) -> tuple[Dict[str, Any], Optional[str]]:
        """Parse raw frontmatter text and report which parser accepted it.

        Parsers are tried in a fixed order: TOML (only when the text contains
        '=' together with '[' or '"'), then JSON (only when the text is
        wrapped in braces), then YAML.

        Args:
            raw: Text found between the frontmatter fences

        Returns:
            Tuple of (data, format) where format is "toml", "json" or "yaml".
            When no parser yields a mapping the result is ({}, None).
        """
        content = raw.strip()

        if '=' in content and ('[' in content or '"' in content):
            try:
                return toml.loads(raw), "toml"
            except toml.TomlDecodeError:
                pass

        if content.startswith('{') and content.endswith('}'):
            try:
                return json.loads(raw), "json"
            except json.JSONDecodeError:
                pass

        try:
            result = yaml.safe_load(raw)
        except yaml.YAMLError:
            return {}, None

        # YAML happily parses plain text into a string; only a mapping counts.
        if isinstance(result, dict):
            return result, "yaml"
        return {}, None

    def _detect_frontmatter_format(self, text: str) -> Optional[str]:
        """Detect the format of frontmatter (yaml, json, toml).

        The answer is the format of the parser that actually accepted the
        block, so it always agrees with extract_frontmatter(). When the block
        cannot be parsed at all, a guess based on its surface syntax is
        returned instead.

        Returns:
            "yaml", "json" or "toml", or None when there is no frontmatter.
        """
        content = self._extract_frontmatter_content(text)
        if not content:
            return None

        _, parsed_format = self._parse_frontmatter_content(content)
        if parsed_format is not None:
            return parsed_format

        # Unparseable block: fall back to simple surface heuristics.
        if content.startswith('{') and content.endswith('}'):
            return "json"
        if '=' in content and '[' in content:
            return "toml"
        return "yaml"

    def _set_nested_value(self, data: Dict[str, Any], key: str, value: Any) -> None:
        """Set nested value using dot notation.

        Missing intermediate mappings are created. An intermediate entry that
        exists but is not a mapping (e.g. a string or a list) is replaced by
        a new mapping so the assignment can proceed instead of raising.

        Args:
            data: Dictionary to modify in place
            key: Dot-separated path (a key without dots sets a top-level entry)
            value: Value to store at the end of the path
        """
        *parents, leaf = key.split('.')
        current = data
        for part in parents:
            child = current.get(part)
            if not isinstance(child, dict):
                child = {}
                current[part] = child
            current = child
        current[leaf] = value

    def _get_all_keys_recursive(self, data: Dict[str, Any], prefix: str = "") -> List[str]:
        """Get all keys recursively with dot notation.

        Each key is listed before the keys of the mapping it holds; only
        dict values are descended into.
        """
        keys = []
        for key, value in data.items():
            full_key = f"{prefix}.{key}" if prefix else key
            keys.append(full_key)
            if isinstance(value, dict):
                keys.extend(self._get_all_keys_recursive(value, full_key))
        return keys

    def _count_nested_fields(self, data: Dict[str, Any]) -> int:
        """Count nested fields recursively.

        Sums the number of keys of every mapping reachable through dict
        values; the keys of `data` itself are not counted, and mappings
        stored inside lists are not visited.
        """
        count = 0
        for value in data.values():
            if isinstance(value, dict):
                count += len(value) + self._count_nested_fields(value)
        return count

    def _analyze_field_types(self, data: Dict[str, Any]) -> Dict[str, int]:
        """Analyze field types in frontmatter.

        Returns:
            Mapping from type name ("object", "array", "boolean", "number",
            "string") to the number of values of that type. Values of any
            other type (for example None) are not counted.
        """
        type_counts: Dict[str, int] = {}

        def bump(name: str) -> None:
            type_counts[name] = type_counts.get(name, 0) + 1

        def count_types(obj: Any) -> None:
            if isinstance(obj, dict):
                bump("object")
                for nested in obj.values():
                    count_types(nested)
            elif isinstance(obj, list):
                bump("array")
                for item in obj:
                    count_types(item)
            elif isinstance(obj, bool):
                # bool must be tested before int: bool is a subclass of int.
                bump("boolean")
            elif isinstance(obj, (int, float)):
                bump("number")
            elif isinstance(obj, str):
                bump("string")

        # The top-level mapping itself is not counted as an "object", but
        # every value below it is, including values nested in dicts and lists.
        for value in data.values():
            count_types(value)

        return type_counts

    def _update_frontmatter_in_text(self, text: str, frontmatter: Dict[str, Any]) -> str:
        """Update or add frontmatter block in text.

        Args:
            text: Full markdown document text
            frontmatter: Data to serialize as the new YAML frontmatter block

        Returns:
            The document with its leading frontmatter block replaced, or with
            a new block (followed by a blank line) prepended when it has none.
        """
        frontmatter_yaml = yaml.dump(frontmatter, default_flow_style=False)
        block = f"---\n{frontmatter_yaml}---\n"

        match = self._FRONTMATTER_RE.match(text)
        if match is None:
            return f"{block}\n{text}"

        # Splice by position rather than re.sub(): the dumped YAML must not be
        # interpreted as a regex replacement template (backslashes in values),
        # and only the leading block may be touched.
        return f"{match.group('lead')}{block}{text[match.end():]}"

    def separate_frontmatter_and_content(self, text: str) -> tuple[Dict[str, Any], str]:
        """
        Separate frontmatter from content.

        Args:
            text: Full markdown document text

        Returns:
            Tuple of (frontmatter_dict, content_without_frontmatter)
        """
        frontmatter = self.extract_frontmatter(text)

        match = self._FRONTMATTER_RE.match(text)
        if match is None:
            return frontmatter, text.lstrip('\n')

        # Drop the block and the blank lines that separate it from the body.
        remainder = text[match.end():]
        return frontmatter, self._LEADING_BLANK_LINES_RE.sub('', remainder)