- Enhanced frontmatter parser to detect and parse TOML format - Added TOML format detection heuristics before YAML parsing - Created TOML test fixture with nested sections - Fixed parsing order to prevent TOML-to-string conversion - All frontmatter formats (YAML, JSON, TOML) now fully supported - Validated all acceptance criteria for Issue #41 🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
268 lines
8.4 KiB
Python
268 lines
8.4 KiB
Python
"""
Frontmatter parser for extracting and manipulating YAML/JSON/TOML frontmatter.
"""

import json
import re
from typing import Any, Dict, List, Optional, Tuple

import toml
import yaml

from .stats import FrontmatterStats


class FrontmatterParser:
    """Parser for frontmatter in MarkdownMatters documents.

    Frontmatter is the text between a pair of ``---`` delimiter lines. Its
    content is parsed as TOML, JSON or YAML (tried in that order, guarded by
    simple heuristics). Whenever a document is rewritten, the frontmatter is
    serialized back as YAML.
    """

    # First ``---`` ... ``---`` delimited block. The closing delimiter may be
    # followed by a newline or by the end of the text, so a document that
    # consists of nothing but frontmatter is still recognised.
    _FRONTMATTER_RE = re.compile(
        r'^---\s*\n(.*?)\n---\s*(?:\n|\Z)',
        flags=re.DOTALL | re.MULTILINE,
    )

    def extract_frontmatter(self, text: str) -> Dict[str, Any]:
        """
        Extract frontmatter from markdown text.

        Args:
            text: Full markdown document text

        Returns:
            Dictionary containing frontmatter data; empty when the document
            has no frontmatter block or the block cannot be parsed into a
            mapping.
        """
        raw = self._extract_frontmatter_content(text)
        if not raw:
            return {}

        data, _ = self._parse_frontmatter_content(raw)
        return data

    def set_frontmatter_value(self, text: str, key: str, value: Any) -> str:
        """
        Set a frontmatter value in the document.

        The existing frontmatter is parsed, updated, and written back as a
        YAML block (even if it was originally JSON or TOML). A block is added
        at the top of the document when none exists.

        Args:
            text: Full markdown document text
            key: Frontmatter key (supports dot notation for nested)
            value: Value to set

        Returns:
            Updated document text
        """
        frontmatter = self.extract_frontmatter(text)

        # A key without dots is simply a path of length one, so the nested
        # setter covers both the flat and the dotted case.
        self._set_nested_value(frontmatter, key, value)

        return self._update_frontmatter_in_text(text, frontmatter)

    def get_frontmatter_keys(self, text: str, include_nested: bool = False) -> List[str]:
        """
        Get list of frontmatter keys.

        Args:
            text: Full markdown document text
            include_nested: Include nested keys with dot notation

        Returns:
            List of frontmatter keys
        """
        frontmatter = self.extract_frontmatter(text)

        if include_nested:
            return self._get_all_keys_recursive(frontmatter)
        return list(frontmatter.keys())

    def get_nested_value(self, frontmatter: Dict[str, Any], key: str) -> Any:
        """
        Get nested value using dot notation.

        Args:
            frontmatter: Frontmatter dictionary
            key: Key with dot notation (e.g., "nested.category")

        Returns:
            Value or None if not found
        """
        current: Any = frontmatter

        for part in key.split('.'):
            # Stop as soon as the path leaves the mapping structure.
            if not isinstance(current, dict) or part not in current:
                return None
            current = current[part]

        return current

    def calculate_frontmatter_stats(self, text: str) -> "FrontmatterStats":
        """
        Calculate statistics for frontmatter.

        Args:
            text: Full markdown document text

        Returns:
            FrontmatterStats object. ``has_frontmatter`` is False (with zero
            counts and no format) when nothing could be parsed.
        """
        raw = self._extract_frontmatter_content(text)
        if raw:
            # Parse once and reuse both the data and the detected format.
            frontmatter, format_type = self._parse_frontmatter_content(raw)
        else:
            frontmatter, format_type = {}, None

        if not frontmatter or not isinstance(frontmatter, dict):
            return FrontmatterStats(
                has_frontmatter=False,
                total_fields=0,
                nested_fields=0,
                format=None,
                field_types={},
            )

        return FrontmatterStats(
            has_frontmatter=True,
            total_fields=len(frontmatter),
            nested_fields=self._count_nested_fields(frontmatter),
            format=format_type,
            field_types=self._analyze_field_types(frontmatter),
        )

    def _extract_frontmatter_content(self, text: str) -> Optional[str]:
        """Extract the raw frontmatter content between delimiters.

        Returns the stripped text of the first ``---`` delimited block, or
        None when the text contains no such block.
        """
        match = self._FRONTMATTER_RE.search(text)
        if match is None:
            return None
        return match.group(1).strip()

    def _parse_frontmatter_content(self, content: str) -> Tuple[Dict[str, Any], Optional[str]]:
        """Parse raw frontmatter and report which format succeeded.

        Args:
            content: Raw frontmatter text (without the ``---`` delimiters)

        Returns:
            ``(data, format)`` where format is "toml", "json" or "yaml".
            ``({}, None)`` when no parser produced a mapping.
        """
        stripped = content.strip()

        # TOML must be attempted before YAML: a line such as `title = "x"`
        # is also valid YAML, but loads as a plain string rather than a
        # mapping.
        if '=' in stripped and ('[' in stripped or '"' in stripped):
            try:
                return toml.loads(content), "toml"
            except toml.TomlDecodeError:
                pass

        if stripped.startswith('{') and stripped.endswith('}'):
            try:
                return json.loads(content), "json"
            except json.JSONDecodeError:
                pass

        # Default to YAML (most common).
        try:
            parsed = yaml.safe_load(content)
        except yaml.YAMLError:
            return {}, None

        # Scalars and lists are valid YAML but are not usable as frontmatter.
        if isinstance(parsed, dict):
            return parsed, "yaml"
        return {}, None

    def _detect_frontmatter_format(self, text: str) -> Optional[str]:
        """Detect the format of frontmatter (yaml, json, toml).

        The answer is the format that actually parses the block, so it always
        agrees with ``extract_frontmatter``. Returns None when the document
        has no frontmatter block, and "yaml" when a block exists but no
        parser accepts it.
        """
        content = self._extract_frontmatter_content(text)
        if not content:
            return None

        _, detected = self._parse_frontmatter_content(content)
        return detected or "yaml"

    def _set_nested_value(self, data: Dict[str, Any], key: str, value: Any) -> None:
        """Set nested value using dot notation.

        Missing intermediate mappings are created. An intermediate entry that
        exists but is not a mapping is replaced by a new mapping, because a
        nested key cannot be stored inside a scalar or a list.
        """
        *parents, leaf = key.split('.')
        current = data

        # Navigate to the parent of the final key.
        for part in parents:
            child = current.get(part)
            if not isinstance(child, dict):
                child = {}
                current[part] = child
            current = child

        current[leaf] = value

    def _get_all_keys_recursive(self, data: Dict[str, Any], prefix: str = "") -> List[str]:
        """Get all keys recursively with dot notation.

        Each mapping key is listed before the keys nested beneath it.
        """
        collected: List[str] = []

        for name, value in data.items():
            path = f"{prefix}.{name}" if prefix else name
            collected.append(path)

            if isinstance(value, dict):
                collected.extend(self._get_all_keys_recursive(value, path))

        return collected

    def _count_nested_fields(self, data: Dict[str, Any]) -> int:
        """Count nested fields recursively.

        Counts every key that lives inside a nested mapping, at any depth;
        the keys of ``data`` itself are not counted.
        """
        return sum(
            len(value) + self._count_nested_fields(value)
            for value in data.values()
            if isinstance(value, dict)
        )

    def _analyze_field_types(self, data: Dict[str, Any]) -> Dict[str, int]:
        """Analyze field types in frontmatter.

        Returns a mapping from type label ("object", "array", "boolean",
        "number", "string") to the number of values of that type. Values of
        any other type are not counted.
        """
        type_counts: Dict[str, int] = {}

        def bump(label: str) -> None:
            type_counts[label] = type_counts.get(label, 0) + 1

        def count_types(obj: Any) -> None:
            if isinstance(obj, dict):
                bump("object")
                for child in obj.values():
                    count_types(child)
            elif isinstance(obj, list):
                bump("array")
                for item in obj:
                    count_types(item)
            elif isinstance(obj, bool):
                # bool is checked before int because bool subclasses int.
                bump("boolean")
            elif isinstance(obj, (int, float)):
                bump("number")
            elif isinstance(obj, str):
                bump("string")

        # Every value is visited, including those nested in mappings and
        # lists; the top-level mapping itself is not counted as an object.
        for value in data.values():
            count_types(value)

        return type_counts

    def _update_frontmatter_in_text(self, text: str, frontmatter: Dict[str, Any]) -> str:
        """Update or add frontmatter block in text.

        The first existing block is replaced; later ``---`` pairs in the
        body are left alone. When the text has no block, a new one is
        inserted at the top, followed by a blank line.
        """
        frontmatter_yaml = yaml.dump(frontmatter, default_flow_style=False)
        block = f"---\n{frontmatter_yaml}---\n"

        # A function replacement keeps backslashes in the YAML literal
        # (a string replacement would treat them as regex escapes), and
        # count=1 touches only the block that extraction reads.
        updated, replaced = self._FRONTMATTER_RE.subn(lambda _match: block, text, count=1)
        if replaced:
            return updated

        return f"{block}\n{text}"