"""
Contentmatter parser for extracting and manipulating MultiMarkdown
key-value pairs within content.
"""

import re
from typing import Dict, List, Optional

from .stats import ContentmatterStats

# Frontmatter is a leading block fenced by `---` lines. It is anchored with
# \A so that a pair of horizontal rules further down the document is never
# mistaken for frontmatter.
_FRONTMATTER_RE = re.compile(r'\A---\s*\n.*?\n---\s*\n', re.DOTALL)

# Tailmatter is a fenced ```yaml/```json block tagged "tailmatter", optionally
# preceded by a `---` rule. The rule-prefixed form is tried first so that the
# rule is treated as part of the tailmatter rather than left in the content.
_TAILMATTER_RES = (
    re.compile(
        r'\n---\s*\n\s*```(?:yaml|json)\s+tailmatter\s*\n.*?```\s*$',
        re.DOTALL | re.MULTILINE,
    ),
    re.compile(
        r'\n\s*```(?:yaml|json)\s+tailmatter\s*\n.*?```\s*$',
        re.DOTALL | re.MULTILINE,
    ),
)

# "Key: Value" on a single line. Only spaces/tabs are allowed inside the key
# and between the colon and the value, so a match can never span two lines.
_KEYVALUE_RE = re.compile(
    r'^([A-Za-z][A-Za-z0-9 \t]*[A-Za-z0-9]):[ \t]*(.+)$',
    re.MULTILINE,
)

# First Markdown heading line; new keys are inserted right after it.
_HEADING_RE = re.compile(r'^(#+\s+.*?)$', re.MULTILINE)

_URL_RE = re.compile(r'https?://')
_EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
_DATE_RES = (
    re.compile(r'\d{4}-\d{2}-\d{2}'),  # YYYY-MM-DD
    re.compile(r'\d{2}/\d{2}/\d{4}'),  # MM/DD/YYYY
    re.compile(r'\d{2}-\d{2}-\d{4}'),  # MM-DD-YYYY
)


class ContentmatterParser:
    """Parser for contentmatter (MultiMarkdown key-value pairs) in MarkdownMatters documents."""

    def extract_contentmatter(self, text: str) -> Dict[str, str]:
        """
        Extract contentmatter (MMD key-value pairs) from content only.

        Frontmatter and tailmatter are removed before scanning, so keys that
        appear only in those sections are not reported.

        Args:
            text: Full markdown document text

        Returns:
            Dictionary containing contentmatter key-value pairs
        """
        content = self._extract_content_only(text)
        return self._parse_mmd_keyvalues(content)

    def get_contentmatter_value(self, text: str, key: str) -> Optional[str]:
        """
        Get specific contentmatter value by key.

        Args:
            text: Full markdown document text
            key: Key to retrieve

        Returns:
            Value or None if not found
        """
        return self.extract_contentmatter(text).get(key)

    def set_contentmatter_value(self, text: str, key: str, value: str) -> str:
        """
        Set a contentmatter value in the document.

        Every existing "key: ..." line in the content is rewritten. If there is
        none, a new line is inserted after the first heading, or at the start
        of the content when there is no heading.

        Args:
            text: Full markdown document text
            key: Key to set
            value: Value to set

        Returns:
            Updated document text
        """
        content = self._extract_content_only(text)
        new_line = f"{key}: {value}"

        # `.` does not match a newline here, so only the key's own line is
        # replaced even when its current value is empty.
        existing = re.compile(rf'^{re.escape(key)}:.*$', re.MULTILINE)

        if existing.search(content):
            # A callable replacement inserts the text literally; a plain
            # string would be treated as a template and any backslash in
            # `value` would be expanded or rejected.
            content = existing.sub(lambda _match: new_line, content)
        else:
            heading_match = _HEADING_RE.search(content)
            if heading_match:
                insert_pos = heading_match.end()
                content = (
                    content[:insert_pos]
                    + "\n\n"
                    + new_line
                    + "\n"
                    + content[insert_pos:]
                )
            else:
                content = new_line + "\n\n" + content

        return self._reconstruct_document(text, content)

    def get_contentmatter_keys(self, text: str) -> List[str]:
        """
        Get list of contentmatter keys.

        Args:
            text: Full markdown document text

        Returns:
            List of contentmatter keys
        """
        return list(self.extract_contentmatter(text))

    def calculate_contentmatter_stats(self, text: str) -> ContentmatterStats:
        """
        Calculate statistics for contentmatter.

        Args:
            text: Full markdown document text

        Returns:
            ContentmatterStats object; all counters are zero and
            has_contentmatter is False when no pairs are found
        """
        contentmatter = self.extract_contentmatter(text)

        if not contentmatter:
            return ContentmatterStats(
                has_contentmatter=False,
                total_pairs=0,
                average_key_length=0.0,
                average_value_length=0.0,
                url_values=0,
                email_values=0,
                date_values=0,
            )

        # total_pairs is non-zero past the guard above, so the divisions
        # below are safe.
        total_pairs = len(contentmatter)
        total_key_length = sum(len(key) for key in contentmatter)
        total_value_length = sum(len(value) for value in contentmatter.values())

        return ContentmatterStats(
            has_contentmatter=True,
            total_pairs=total_pairs,
            average_key_length=total_key_length / total_pairs,
            average_value_length=total_value_length / total_pairs,
            url_values=self._count_url_values(contentmatter),
            email_values=self._count_email_values(contentmatter),
            date_values=self._count_date_values(contentmatter),
        )

    def _extract_content_only(self, text: str) -> str:
        """Extract only content, removing frontmatter and tailmatter.

        Args:
            text: Full markdown document text

        Returns:
            The content with surrounding whitespace stripped
        """
        content = _FRONTMATTER_RE.sub('', text, count=1)
        for tailmatter_re in _TAILMATTER_RES:
            content = tailmatter_re.sub('', content)
        return content.strip()

    def _parse_mmd_keyvalues(self, content: str) -> Dict[str, str]:
        """Parse MultiMarkdown key-value pairs from content.

        A pair is a line of the form "Key: Value" whose key starts with a
        letter and consists of letters, digits, spaces and tabs. When a key
        occurs more than once, the last occurrence wins.

        Args:
            content: Document content without frontmatter/tailmatter

        Returns:
            Dictionary mapping each stripped key to its stripped value
        """
        return {
            match.group(1).strip(): match.group(2).strip()
            for match in _KEYVALUE_RE.finditer(content)
        }

    def _count_url_values(self, contentmatter: Dict[str, str]) -> int:
        """Count values that contain an http:// or https:// URL."""
        return sum(1 for value in contentmatter.values() if _URL_RE.search(value))

    def _count_email_values(self, contentmatter: Dict[str, str]) -> int:
        """Count values that contain an email address."""
        return sum(1 for value in contentmatter.values() if _EMAIL_RE.search(value))

    def _count_date_values(self, contentmatter: Dict[str, str]) -> int:
        """Count values that look like dates.

        A value is counted once even if it matches several date formats.
        """
        return sum(
            1
            for value in contentmatter.values()
            if any(date_re.search(value) for date_re in _DATE_RES)
        )

    def _reconstruct_document(self, original_text: str, new_content: str) -> str:
        """Reconstruct document with updated content.

        The frontmatter and tailmatter of ``original_text`` are kept verbatim
        and ``new_content`` is placed between them. The whitespace that
        surrounded the original content is restored, because the content is
        stripped on extraction; without it the document would lose its final
        newline and the blank line that separates the body from a ``---``
        tailmatter rule.

        Args:
            original_text: Full text of the document before the change
            new_content: Replacement content (without surrounding whitespace)

        Returns:
            Full document text with the content replaced
        """
        frontmatter_match = _FRONTMATTER_RE.match(original_text)
        body_start = frontmatter_match.end() if frontmatter_match else 0

        # Search only after the frontmatter so detection agrees with
        # _extract_content_only, which strips the frontmatter first.
        tailmatter_match = None
        for tailmatter_re in _TAILMATTER_RES:
            tailmatter_match = tailmatter_re.search(original_text, body_start)
            if tailmatter_match:
                break

        if tailmatter_match:
            body_end = tailmatter_match.start()
            tailmatter = tailmatter_match.group(0)
        else:
            body_end = len(original_text)
            tailmatter = ""

        frontmatter = original_text[:body_start]
        body = original_text[body_start:body_end]

        if body.strip():
            leading = body[:len(body) - len(body.lstrip())]
            trailing = body[len(body.rstrip()):]
        else:
            # Nothing but whitespace between the sections: there is no
            # content edge to restore.
            leading = trailing = ""

        return frontmatter + leading + new_content + trailing + tailmatter