# markitect-main/markitect/content/parser.py

"""
Content parser for extracting markdown content without matter zones.
"""

import re
from typing import Optional
from .stats import ContentStats


class ContentParser:
    """Parser for extracting content from MarkdownMatters documents.

    The parser strips the leading YAML frontmatter block and fenced
    ``tailmatter`` blocks from a document, and computes simple size
    statistics for a piece of text.
    """

    # Frontmatter is only recognised at the very start of the document
    # (optionally preceded by blank lines).  Anchoring with \A instead of a
    # MULTILINE ``^`` keeps ``---`` horizontal rules in the body from being
    # mistaken for frontmatter delimiters.  The lazy optional group lets an
    # empty ``---``/``---`` block match, and ``\Z`` accepts a closing
    # delimiter that is the last line of the file.
    _FRONTMATTER_RE = re.compile(
        r'\A(?:\s*\n)?---\s*\n(?:.*?\n)??---\s*(?:\n|\Z)',
        re.DOTALL,
    )

    # A horizontal rule followed by a ```yaml tailmatter / ```json tailmatter
    # fenced block.  Under MULTILINE, ``$`` matches at the end of any line,
    # so the block is removed up to its first closing fence wherever it
    # appears in the text.
    _RULED_TAILMATTER_RE = re.compile(
        r'\n---\s*\n\s*```(?:yaml|json)\s+tailmatter\s*\n.*?```\s*$',
        re.DOTALL | re.MULTILINE,
    )

    # The same fenced block without a preceding horizontal rule.
    _BARE_TAILMATTER_RE = re.compile(
        r'\n\s*```(?:yaml|json)\s+tailmatter\s*\n.*?```\s*$',
        re.DOTALL | re.MULTILINE,
    )

    # Paragraph separator: a newline, optional whitespace, another newline.
    # This treats lines containing only spaces/tabs as blank lines.
    _PARAGRAPH_BREAK_RE = re.compile(r'\n\s*\n')

    def extract_content(self, text: str) -> str:
        """
        Extract main content without frontmatter and tailmatter.

        Args:
            text: Full markdown document text

        Returns:
            The text with the frontmatter and tailmatter zones removed and
            surrounding whitespace stripped
        """
        without_front = self._remove_frontmatter(text)
        without_tail = self._remove_tailmatter(without_front)
        return without_tail.strip()

    def calculate_stats(self, content: str) -> ContentStats:
        """
        Calculate statistics for content.

        Args:
            content: The content text to analyze

        Returns:
            ContentStats object with calculated statistics:
            word_count (whitespace-separated tokens), line_count
            (newline-separated segments, so an empty string counts as one
            line), paragraph_count (non-empty blocks separated by blank
            lines) and character_count (length of the string)
        """
        # Blank lines that contain only whitespace also separate paragraphs.
        paragraphs = [
            block
            for block in self._PARAGRAPH_BREAK_RE.split(content)
            if block.strip()
        ]

        return ContentStats(
            word_count=len(content.split()),
            line_count=len(content.split('\n')),
            paragraph_count=len(paragraphs),
            character_count=len(content),
        )

    def _remove_frontmatter(self, text: str) -> str:
        """
        Remove a leading YAML frontmatter block (---...---) from text.

        Only a block at the start of the document is removed; ``---``
        horizontal rules further down are left untouched.

        Args:
            text: Full markdown document text

        Returns:
            The text without its leading frontmatter block, or the text
            unchanged when it does not start with one
        """
        # TODO: Add support for TOML and JSON frontmatter in future cycles
        return self._FRONTMATTER_RE.sub('', text, count=1)

    def _remove_tailmatter(self, text: str) -> str:
        """
        Remove tailmatter blocks from text.

        A tailmatter block is a fenced code block opened with
        ```yaml tailmatter or ```json tailmatter, optionally preceded by a
        horizontal rule (---).

        Args:
            text: Markdown text, typically with frontmatter already removed

        Returns:
            The text with every matching tailmatter block removed
        """
        # Remove the ruled form first so its horizontal rule goes with it;
        # the bare form then catches blocks without a preceding rule.
        text = self._RULED_TAILMATTER_RE.sub('', text)
        return self._BARE_TAILMATTER_RE.sub('', text)