""" Content parser for extracting markdown content without matter zones. """ import re from typing import Optional from .stats import ContentStats class ContentParser: """Parser for extracting content from MarkdownMatters documents.""" def extract_content(self, text: str) -> str: """ Extract main content without frontmatter and tailmatter. Args: text: Full markdown document text Returns: Content without frontmatter and tailmatter zones """ # Remove frontmatter content = self._remove_frontmatter(text) # Remove tailmatter content = self._remove_tailmatter(content) return content.strip() def calculate_stats(self, content: str) -> ContentStats: """ Calculate statistics for content. Args: content: The content text to analyze Returns: ContentStats object with calculated statistics """ # Count lines lines = content.split('\n') line_count = len(lines) # Count words (split by whitespace) words = content.split() word_count = len(words) # Count paragraphs (non-empty text blocks separated by blank lines) paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()] paragraph_count = len(paragraphs) # Count characters character_count = len(content) return ContentStats( word_count=word_count, line_count=line_count, paragraph_count=paragraph_count, character_count=character_count ) def _remove_frontmatter(self, text: str) -> str: """Remove YAML/TOML/JSON frontmatter from text.""" # Pattern for YAML frontmatter (---...---) yaml_pattern = r'^---\s*\n.*?\n---\s*\n' # Remove YAML frontmatter if present text = re.sub(yaml_pattern, '', text, flags=re.DOTALL | re.MULTILINE) # TODO: Add support for TOML and JSON frontmatter in future cycles return text def _remove_tailmatter(self, text: str) -> str: """Remove tailmatter blocks from text.""" # Pattern for tailmatter: ```yaml tailmatter or ```json tailmatter # Usually preceded by horizontal rule (---) # Look for the pattern: --- followed by ```yaml tailmatter or ```json tailmatter tailmatter_pattern = r'\n---\s*\n\s*```(?:yaml|json)\s+tailmatter\s*\n.*?```\s*$' # Remove tailmatter if present text = re.sub(tailmatter_pattern, '', text, flags=re.DOTALL | re.MULTILINE) # Also handle cases where tailmatter is at the end without preceding --- simple_tailmatter_pattern = r'\n\s*```(?:yaml|json)\s+tailmatter\s*\n.*?```\s*$' text = re.sub(simple_tailmatter_pattern, '', text, flags=re.DOTALL | re.MULTILINE) return text