"""
Content parser for extracting markdown content without matter zones.
"""
|
|
|
|
import re
|
|
from typing import Optional
|
|
from .stats import ContentStats
|
|
|
|
|
|
class ContentParser:
    """Parser for extracting content from MarkdownMatters documents.

    "Content" is the document text with the leading frontmatter block and
    any fenced tailmatter blocks removed.
    """

    # YAML frontmatter: a ``---`` ... ``---`` block that opens the document.
    # ``\A`` (not ``^`` with MULTILINE) keeps ordinary horizontal rules further
    # down the document from being mistaken for frontmatter delimiters.
    # The lazy optional body allows an empty block, and ``\Z`` allows the
    # closing delimiter to be the last thing in the input.
    _FRONTMATTER_RE = re.compile(
        r'\A(?:[ \t]*\r?\n)*'   # tolerate blank lines before the opening ---
        r'---\s*\n'             # opening delimiter line
        r'(?:.*?\n)??'          # frontmatter body (possibly empty), lazy
        r'---\s*(?:\n|\Z)',     # closing delimiter line, newline optional at EOF
        re.DOTALL,
    )

    # Tailmatter preceded by a horizontal rule:
    #   ---
    #   ```yaml tailmatter   (or ```json tailmatter)
    #   ...
    #   ```
    # ``(?:\A|\n)`` lets the block sit at the very start of the text as well
    # as after a line break. MULTILINE makes ``$`` match at the end of the
    # line holding the closing fence.
    _RULED_TAILMATTER_RE = re.compile(
        r'(?:\A|\n)---\s*\n\s*```(?:yaml|json)\s+tailmatter\s*\n.*?```\s*$',
        re.DOTALL | re.MULTILINE,
    )

    # Same fenced block without the preceding horizontal rule.
    _BARE_TAILMATTER_RE = re.compile(
        r'(?:\A|\n)\s*```(?:yaml|json)\s+tailmatter\s*\n.*?```\s*$',
        re.DOTALL | re.MULTILINE,
    )

    def extract_content(self, text: str) -> str:
        """
        Extract main content without frontmatter and tailmatter.

        Args:
            text: Full markdown document text

        Returns:
            Content without frontmatter and tailmatter zones, with leading
            and trailing whitespace stripped
        """
        # Frontmatter first, so a tailmatter block that directly follows it
        # ends up at the start of the remaining text.
        without_front = self._remove_frontmatter(text)
        without_tail = self._remove_tailmatter(without_front)
        return without_tail.strip()

    def calculate_stats(self, content: str) -> "ContentStats":
        """
        Calculate statistics for content.

        Args:
            content: The content text to analyze

        Returns:
            ContentStats object with word, line, paragraph and character
            counts. Empty content yields zero for every count.
        """
        return ContentStats(
            # Words are runs of non-whitespace characters.
            word_count=len(content.split()),
            line_count=self._count_lines(content),
            paragraph_count=self._count_paragraphs(content),
            character_count=len(content),
        )

    @staticmethod
    def _count_lines(content: str) -> int:
        """Return the number of newline-separated lines (0 for empty text)."""
        if not content:
            return 0
        return content.count('\n') + 1

    @staticmethod
    def _count_paragraphs(content: str) -> int:
        """Return the number of non-empty blocks separated by blank lines."""
        # A "blank line" may contain spaces/tabs or a carriage return, so
        # split on newline + optional whitespace + newline, not just '\n\n'.
        blocks = re.split(r'\n\s*\n', content)
        return sum(1 for block in blocks if block.strip())

    def _remove_frontmatter(self, text: str) -> str:
        """Remove YAML frontmatter from the start of text.

        Only a block that opens the document (optionally after blank lines)
        is removed; ``---`` horizontal rules elsewhere are left untouched.

        Args:
            text: Document text that may begin with a ``---`` delimited block

        Returns:
            The text with the leading frontmatter block removed, or the text
            unchanged when it does not start with one
        """
        # TODO: Add support for TOML and JSON frontmatter in future cycles
        return self._FRONTMATTER_RE.sub('', text, count=1)

    def _remove_tailmatter(self, text: str) -> str:
        """Remove tailmatter blocks from text.

        A tailmatter block is a fenced code block whose info string is
        ``yaml tailmatter`` or ``json tailmatter``, optionally preceded by a
        ``---`` horizontal rule (which is removed together with the block).

        Args:
            text: Document text, normally with frontmatter already removed

        Returns:
            The text with tailmatter blocks removed
        """
        # Rule-prefixed form first so the rule is removed with its block;
        # the bare form then picks up blocks that had no preceding rule.
        text = self._RULED_TAILMATTER_RE.sub('', text)
        return self._BARE_TAILMATTER_RE.sub('', text)