Files
markitect-main/markitect/matter_frontmatter/parser.py
tegwick 4f16166e94 feat: implement comprehensive front matter preservation and unicode handling
This commit provides complete front matter support and fixes unicode character
handling across all explode-implode variants (flat, hierarchical, semantic).

## Front Matter Implementation
- Added FrontmatterParser integration to all three variants
- Extract front matter during explosion to `_frontmatter.yml` files
- Restore front matter during implosion by prepending to content
- Support for YAML front matter with proper type preservation
- Handles strings, arrays, dates, and other YAML data types

## Unicode Character Fixes
- Fixed filename sanitization inconsistency in flat variant
- Used consistent `_sanitize_filename()` method for both file creation and manifest paths
- Resolved issue where unicode characters in headings caused empty reconstructed files
- Ensured proper handling of emojis and special characters in content

## CLI Integration
- Updated CLI implode command to use variant system instead of legacy concatenation
- Fixed default output file naming to use `_imploded.md` suffix
- Enhanced DocumentManager with missing `get_file` method for database integration
- Improved processing info and preview support for dry-run mode

## Test Coverage
- Reactivated `test_issue_149_roundtrip_validation.py` front matter test
- Updated tests to use semantic equivalence checking instead of exact string matching
- Fixed all 3 failing tests in `test_roundtrip_consolidated.py`
- All 10 roundtrip tests and 11 Issue #149 validation tests now pass

## Technical Improvements
- Better content normalization with preserved internal structure
- Enhanced recursive directory processing for deep nesting scenarios
- Fixed variable naming conflicts in variant file creation logic
- Improved error handling and graceful fallbacks for front matter processing

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-10-13 20:26:08 +02:00

286 lines
8.9 KiB
Python

"""
Frontmatter parser for extracting and manipulating YAML/JSON/TOML frontmatter.
"""
import re
import yaml
import json
import toml
from typing import Dict, Any, List, Optional
from .stats import FrontmatterStats
class FrontmatterParser:
    """Parser for frontmatter in MarkdownMatters documents.

    A frontmatter block is a run of lines enclosed by two ``---`` delimiter
    lines at the very beginning of the document. The enclosed text is parsed
    as TOML, JSON or YAML (tried in that order, guarded by cheap heuristics).
    Blocks are always written back as YAML.
    """

    # Matches the frontmatter block only at the start of the document, so that
    # horizontal rules (``---``) further down in the body are never mistaken
    # for frontmatter delimiters.
    _FRONTMATTER_RE = re.compile(
        r'\A\ufeff?(?:[ \t]*\r?\n)*'   # optional BOM / blank lines before the block
        r'---[ \t]*\r?\n'              # opening delimiter on its own line
        r'(.*?)'                       # raw frontmatter body (may be empty)
        r'^---[ \t]*(?:\r?\n|\Z)',     # closing delimiter on its own line
        flags=re.DOTALL | re.MULTILINE,
    )

    def extract_frontmatter(self, text: str) -> Dict[str, Any]:
        """
        Extract frontmatter from markdown text.

        Args:
            text: Full markdown document text

        Returns:
            Dictionary containing frontmatter data. Empty when the document
            has no frontmatter block or the block does not parse to a mapping.
        """
        frontmatter_content = self._extract_frontmatter_content(text)
        if not frontmatter_content:
            return {}
        data, _ = self._parse_frontmatter_content(frontmatter_content)
        return data

    def set_frontmatter_value(self, text: str, key: str, value: Any) -> str:
        """
        Set a frontmatter value in the document.

        Args:
            text: Full markdown document text
            key: Frontmatter key (supports dot notation for nested)
            value: Value to set

        Returns:
            Updated document text

        Raises:
            TypeError: If a dotted key has to descend through an existing
                value that is neither a mapping nor null.
        """
        frontmatter = self.extract_frontmatter(text)
        # A key without dots is simply a path of length one.
        self._set_nested_value(frontmatter, key, value)
        # Replace or add frontmatter block
        return self._update_frontmatter_in_text(text, frontmatter)

    def get_frontmatter_keys(self, text: str, include_nested: bool = False) -> List[str]:
        """
        Get list of frontmatter keys.

        Args:
            text: Full markdown document text
            include_nested: Include nested keys with dot notation

        Returns:
            List of frontmatter keys
        """
        frontmatter = self.extract_frontmatter(text)
        if include_nested:
            return self._get_all_keys_recursive(frontmatter)
        return list(frontmatter.keys())

    def get_nested_value(self, frontmatter: Dict[str, Any], key: str) -> Any:
        """
        Get nested value using dot notation.

        Args:
            frontmatter: Frontmatter dictionary
            key: Key with dot notation (e.g., "nested.category")

        Returns:
            Value or None if not found
        """
        current = frontmatter
        for part in key.split('.'):
            if not isinstance(current, dict) or part not in current:
                return None
            current = current[part]
        return current

    def calculate_frontmatter_stats(self, text: str) -> "FrontmatterStats":
        """
        Calculate statistics for frontmatter.

        Args:
            text: Full markdown document text

        Returns:
            FrontmatterStats object. ``has_frontmatter`` is False (with zeroed
            counters and ``format=None``) when no non-empty mapping could be
            parsed from the document.
        """
        content = self._extract_frontmatter_content(text)
        # Parse once so the reported format is the one that actually produced
        # the data being counted.
        if content:
            frontmatter, format_type = self._parse_frontmatter_content(content)
        else:
            frontmatter, format_type = {}, None

        if not frontmatter or not isinstance(frontmatter, dict):
            return FrontmatterStats(
                has_frontmatter=False,
                total_fields=0,
                nested_fields=0,
                format=None,
                field_types={}
            )

        return FrontmatterStats(
            has_frontmatter=True,
            total_fields=len(frontmatter),
            nested_fields=self._count_nested_fields(frontmatter),
            format=format_type,
            field_types=self._analyze_field_types(frontmatter)
        )

    def _extract_frontmatter_content(self, text: str) -> Optional[str]:
        """Extract the raw frontmatter content between delimiters.

        Returns the stripped text between the delimiters, or None when the
        document does not start with a frontmatter block.
        """
        match = self._FRONTMATTER_RE.match(text)
        if match is None:
            return None
        return match.group(1).strip()

    def _parse_frontmatter_content(
        self, content: str
    ) -> tuple[Dict[str, Any], Optional[str]]:
        """Parse raw frontmatter text.

        Returns:
            Tuple ``(data, format)`` where ``format`` is "toml", "json" or
            "yaml" naming the parser that succeeded, or ``({}, None)`` when no
            parser produced a mapping.
        """
        stripped = content.strip()
        if not stripped:
            return {}, None

        # Try TOML first if it looks like TOML
        if '=' in stripped and ('[' in stripped or '"' in stripped):
            try:
                return toml.loads(content), "toml"
            except toml.TomlDecodeError:
                pass

        # Try JSON if it looks like JSON
        if stripped.startswith('{') and stripped.endswith('}'):
            try:
                parsed = json.loads(content)
            except json.JSONDecodeError:
                pass
            else:
                if isinstance(parsed, dict):
                    return parsed, "json"

        # Default to YAML (most common)
        try:
            parsed = yaml.safe_load(content)
        except yaml.YAMLError:
            return {}, None
        # A bare scalar or list is valid YAML but is not usable frontmatter.
        if isinstance(parsed, dict):
            return parsed, "yaml"
        return {}, None

    def _detect_frontmatter_format(self, text: str) -> Optional[str]:
        """Detect the format of frontmatter (yaml, json, toml).

        The answer is the parser that actually accepts the block, so it always
        agrees with ``extract_frontmatter``. Only when the block cannot be
        parsed at all is the format guessed from its appearance.
        """
        content = self._extract_frontmatter_content(text)
        if not content:
            return None

        _, format_type = self._parse_frontmatter_content(content)
        if format_type is not None:
            return format_type

        # Unparseable block: fall back to simple surface heuristics.
        if content.startswith('{') and content.endswith('}'):
            return "json"
        if '=' in content and '[' in content:
            return "toml"
        return "yaml"

    def _set_nested_value(self, data: Dict[str, Any], key: str, value: Any) -> None:
        """Set nested value using dot notation.

        Missing intermediate mappings are created. An intermediate key whose
        value is null (e.g. YAML ``nested:``) is treated as an empty mapping.

        Raises:
            TypeError: If an intermediate key holds a non-mapping, non-null
                value, since descending would silently discard it.
        """
        *parents, leaf = key.split('.')
        current = data
        # Navigate to the parent of the final key
        for part in parents:
            child = current.get(part)
            if child is None:
                child = {}
                current[part] = child
            elif not isinstance(child, dict):
                raise TypeError(
                    f"Cannot set '{key}': '{part}' holds a "
                    f"{type(child).__name__}, not a mapping"
                )
            current = child
        # Set the final value
        current[leaf] = value

    def _get_all_keys_recursive(self, data: Dict[str, Any], prefix: str = "") -> List[str]:
        """Get all keys recursively with dot notation.

        Each mapping key is listed before the keys nested beneath it.
        """
        keys = []
        for key, value in data.items():
            full_key = f"{prefix}.{key}" if prefix else key
            keys.append(full_key)
            if isinstance(value, dict):
                keys.extend(self._get_all_keys_recursive(value, full_key))
        return keys

    def _count_nested_fields(self, data: Dict[str, Any]) -> int:
        """Count nested fields recursively.

        Counts every key that lives inside a nested mapping at any depth;
        keys of ``data`` itself are not counted.
        """
        return sum(
            len(value) + self._count_nested_fields(value)
            for value in data.values()
            if isinstance(value, dict)
        )

    def _analyze_field_types(self, data: Dict[str, Any]) -> Dict[str, int]:
        """Analyze field types in frontmatter.

        Walks every value reachable from ``data`` (descending into nested
        mappings and lists) and tallies them under "object", "array",
        "boolean", "number" and "string". Values of any other type (such as
        null) are not tallied, and ``data`` itself is not counted.
        """
        type_counts: Dict[str, int] = {}

        def tally(label: str) -> None:
            type_counts[label] = type_counts.get(label, 0) + 1

        def count_types(obj: Any) -> None:
            if isinstance(obj, dict):
                tally("object")
                for nested in obj.values():
                    count_types(nested)
            elif isinstance(obj, list):
                tally("array")
                for item in obj:
                    count_types(item)
            elif isinstance(obj, bool):
                # bool must be tested before int: bool is a subclass of int.
                tally("boolean")
            elif isinstance(obj, (int, float)):
                tally("number")
            elif isinstance(obj, str):
                tally("string")

        for value in data.values():
            count_types(value)
        return type_counts

    def _update_frontmatter_in_text(self, text: str, frontmatter: Dict[str, Any]) -> str:
        """Update or add frontmatter block in text.

        The block is always written as YAML. Everything after the existing
        block's closing delimiter line is kept untouched; when the document
        has no block, the new one is prepended followed by a blank line.
        """
        # Keep the author's key order and write non-ASCII text verbatim rather
        # than as escape sequences.
        frontmatter_yaml = yaml.dump(
            frontmatter,
            default_flow_style=False,
            sort_keys=False,
            allow_unicode=True,
        )
        new_frontmatter = f"---\n{frontmatter_yaml}---\n"

        match = self._FRONTMATTER_RE.match(text)
        if match is None:
            # Add frontmatter to beginning
            return new_frontmatter + "\n" + text
        # Splice by position instead of re.sub: the dumped YAML may contain
        # backslashes, which re.sub would interpret as replacement escapes.
        return new_frontmatter + text[match.end():]

    def separate_frontmatter_and_content(self, text: str) -> tuple[Dict[str, Any], str]:
        """
        Separate frontmatter from content.

        Args:
            text: Full markdown document text

        Returns:
            Tuple of (frontmatter_dict, content_without_frontmatter). Only the
            leading frontmatter block is removed; ``---`` rules in the body
            are left in place. Leading blank lines of the content are dropped.
        """
        frontmatter = self.extract_frontmatter(text)
        match = self._FRONTMATTER_RE.match(text)
        content = text[match.end():] if match else text
        return frontmatter, content.lstrip('\r\n')