diff --git a/markitect/cli.py b/markitect/cli.py index 69d9894b..b1245cc6 100644 --- a/markitect/cli.py +++ b/markitect/cli.py @@ -3388,5 +3388,13 @@ def config_stats(config, format): sys.exit(1) +# Content Commands (Issue #38) +from .content.commands import content_get, content_stats + +# Register content commands +cli.add_command(content_get) +cli.add_command(content_stats) + + if __name__ == '__main__': main() \ No newline at end of file diff --git a/markitect/content/__init__.py b/markitect/content/__init__.py new file mode 100644 index 00000000..ade5c174 --- /dev/null +++ b/markitect/content/__init__.py @@ -0,0 +1,9 @@ +""" +Content module for MarkdownMatters CLI. +Handles content extraction without frontmatter and tailmatter zones. +""" + +from .parser import ContentParser +from .stats import ContentStats + +__all__ = ['ContentParser', 'ContentStats'] \ No newline at end of file diff --git a/markitect/content/commands.py b/markitect/content/commands.py new file mode 100644 index 00000000..992fdfee --- /dev/null +++ b/markitect/content/commands.py @@ -0,0 +1,57 @@ +""" +CLI commands for content operations. 
import click
import json
from pathlib import Path
from .parser import ContentParser


def _load_content(path):
    """Read *path* as UTF-8 and return ``(parser, content)``.

    ``content`` is the document text after ``ContentParser.extract_content``
    has removed the frontmatter and tailmatter zones. The parser is returned
    as well so callers can reuse it (e.g. for ``calculate_stats``).
    """
    parser = ContentParser()
    return parser, parser.extract_content(path.read_text(encoding='utf-8'))


# NOTE: the docstrings of the two commands below are rendered verbatim by
# click as ``--help`` text, so extended documentation lives in comments.

# content-get: print the document body (no frontmatter/tailmatter) to stdout.
# ``dir_okay=False`` makes click reject directories during option validation
# instead of letting them fail later while reading.
@click.command('content-get')
@click.option('--file', 'file_path', required=True,
              type=click.Path(exists=True, dir_okay=False),
              help='Path to markdown file')
def content_get(file_path):
    """Extract content without frontmatter and tailmatter."""
    path = Path(file_path)
    try:
        _, content = _load_content(path)
        click.echo(content)
    except Exception as e:
        # CLI boundary: report the underlying cause on stderr, then exit
        # through click with a stable message. ``from e`` keeps the original
        # traceback attached for debugging.
        click.echo(f"Error: {e}", err=True)
        raise click.ClickException(f"Failed to extract content from {path}") from e


# content-stats: print word/line/paragraph/character counts of the document
# body, either as indented JSON (default) or as one "label: value" line each.
@click.command('content-stats')
@click.option('--file', 'file_path', required=True,
              type=click.Path(exists=True, dir_okay=False),
              help='Path to markdown file')
@click.option('--format', 'output_format', default='json', type=click.Choice(['json', 'text']),
              help='Output format (json or text)')
def content_stats(file_path, output_format):
    """Calculate content statistics."""
    path = Path(file_path)
    try:
        parser, content = _load_content(path)
        stats = parser.calculate_stats(content)

        if output_format == 'json':
            click.echo(json.dumps(stats.to_dict(), indent=2))
        else:
            click.echo(f"Word count: {stats.word_count}")
            click.echo(f"Line count: {stats.line_count}")
            click.echo(f"Paragraph count: {stats.paragraph_count}")
            click.echo(f"Character count: {stats.character_count}")
    except Exception as e:
        # Same boundary handling as content-get: show the cause, chain it.
        click.echo(f"Error: {e}", err=True)
        raise click.ClickException(f"Failed to calculate stats for {path}") from e
import re
from dataclasses import asdict, dataclass
from typing import Any, Dict, Optional


@dataclass
class ContentStats:
    """Statistics about markdown content."""

    word_count: int        # whitespace-separated tokens
    line_count: int        # lines in the content, 0 for empty content
    paragraph_count: int   # non-empty blocks separated by blank lines
    character_count: int   # len() of the content string

    def to_dict(self) -> Dict[str, Any]:
        """Convert stats to a plain dictionary keyed by field name."""
        return asdict(self)


class ContentParser:
    """Parser for extracting content from MarkdownMatters documents."""

    # YAML frontmatter: a ``---`` line at the very start of the document
    # (optionally after a BOM), an optional body, and a closing ``---`` line.
    # ``\A`` (not ``^`` with MULTILINE) is deliberate: frontmatter only exists
    # at offset 0, so ``---`` horizontal rules later in the body must never be
    # mistaken for frontmatter delimiters.
    _FRONTMATTER_RE = re.compile(
        r'\A\ufeff?---[ \t]*\r?\n'      # opening delimiter line
        r'(?:.*?\r?\n)?'                # body (may be empty)
        r'---[ \t]*(?:\r?\n|\Z)',       # closing delimiter line
        re.DOTALL,
    )

    # Tailmatter: a fenced block opened by ```yaml tailmatter or
    # ```json tailmatter, optionally preceded by a ``---`` horizontal rule,
    # running up to the first ``` that ends a line.
    _TAILMATTER_RE = re.compile(
        r'(?:^|\n)[ \t]*'                               # start of a line (or of the text)
        r'(?:---[ \t]*\r?\n\s*)?'                       # optional horizontal rule
        r'```(?:yaml|json)[ \t]+tailmatter[ \t]*\r?\n'  # opening fence
        r'.*?'                                          # block body
        r'```\s*$',                                     # closing fence at end of a line
        re.DOTALL | re.MULTILINE,
    )

    # One or more blank (possibly whitespace-only) lines between paragraphs.
    _PARAGRAPH_BREAK_RE = re.compile(r'\n\s*\n')

    def extract_content(self, text: str) -> str:
        """
        Extract main content without frontmatter and tailmatter.

        Args:
            text: Full markdown document text

        Returns:
            Content without frontmatter and tailmatter zones, with leading
            and trailing whitespace stripped
        """
        content = self._remove_frontmatter(text)
        content = self._remove_tailmatter(content)
        return content.strip()

    def calculate_stats(self, content: str) -> ContentStats:
        """
        Calculate statistics for content.

        Args:
            content: The content text to analyze

        Returns:
            ContentStats object with calculated statistics. Empty content
            yields zero for every counter.
        """
        # splitlines() returns [] for an empty string and does not count a
        # phantom line after a trailing newline.
        line_count = len(content.splitlines())

        # Words are whitespace-separated tokens.
        word_count = len(content.split())

        # Paragraphs are non-empty blocks separated by blank lines; a blank
        # line may contain spaces, tabs or a carriage return.
        paragraph_count = sum(
            1 for block in self._PARAGRAPH_BREAK_RE.split(content) if block.strip()
        )

        return ContentStats(
            word_count=word_count,
            line_count=line_count,
            paragraph_count=paragraph_count,
            character_count=len(content),
        )

    def _remove_frontmatter(self, text: str) -> str:
        """Remove a leading YAML frontmatter block (``---`` ... ``---``).

        Only a block starting at the very beginning of ``text`` is removed;
        text without such a block is returned unchanged.
        """
        # TODO: Add support for TOML and JSON frontmatter in future cycles
        return self._FRONTMATTER_RE.sub('', text, count=1)

    def _remove_tailmatter(self, text: str) -> str:
        """Remove ```yaml tailmatter / ```json tailmatter fenced blocks.

        A horizontal rule (``---``) directly preceding the fence is removed
        together with the block.
        """
        return self._TAILMATTER_RE.sub('', text)
This paragraph contains exactly twenty-five words to help with word counting tests. + +## Section 2 + +Another section with different content. This helps test paragraph counting and ensures that the content parser works correctly across multiple sections. + +The final paragraph of the main content area. + +--- + +```yaml tailmatter +qa_checklist: + - requirement: "All headers verified" + complete: true + - requirement: "Links checked" + complete: false + +editorial: + status: "In Review" + reviewer: "jane.doe" + version: 1.2 + +agent_config: + role: "documentation_reviewer" + access_scope: "content" +``` \ No newline at end of file diff --git a/tests/fixtures/content_test_files/contentmatter_inline.md b/tests/fixtures/content_test_files/contentmatter_inline.md new file mode 100644 index 00000000..a3fedaa9 --- /dev/null +++ b/tests/fixtures/content_test_files/contentmatter_inline.md @@ -0,0 +1,21 @@ +# Document with Contentmatter + +This document contains MultiMarkdown key-value pairs within the content body. + +Author: Jane Smith +Project: Content Testing +Keywords: markdown, contentmatter, testing + +## Introduction + +This section demonstrates contentmatter usage. The key-value pairs above are part of the content but provide metadata. + +Reference: https://example.com/docs +Version: 2.1 +License: MIT + +The content continues here with more text for testing purposes. This paragraph helps verify that contentmatter is preserved in content extraction. + +## Conclusion + +Final section with summary content. Word counting should include the contentmatter lines as part of the content. 
\ No newline at end of file diff --git a/tests/fixtures/content_test_files/frontmatter_only.md b/tests/fixtures/content_test_files/frontmatter_only.md new file mode 100644 index 00000000..c63a49bc --- /dev/null +++ b/tests/fixtures/content_test_files/frontmatter_only.md @@ -0,0 +1,15 @@ +--- +title: "Frontmatter Only Document" +author: "Test Author" +date: 2025-10-02 +--- + +# Frontmatter Only Document + +This document only has frontmatter, no tailmatter. The content should be extracted without the frontmatter block. + +This is a simple paragraph for testing. It has exactly twelve words for counting purposes. + +## Simple Section + +Another paragraph here. This helps test the content extraction when only frontmatter is present. \ No newline at end of file diff --git a/tests/fixtures/content_test_files/plain_markdown.md b/tests/fixtures/content_test_files/plain_markdown.md new file mode 100644 index 00000000..f728e834 --- /dev/null +++ b/tests/fixtures/content_test_files/plain_markdown.md @@ -0,0 +1,13 @@ +# Plain Markdown Document + +This is a simple markdown document without any frontmatter or tailmatter. Just pure content. + +This paragraph contains exactly fifteen words for testing the word counting functionality of the parser. + +## Section One + +Another section with regular content. This helps test the basic content extraction without any matter zones. + +## Section Two + +The final section with some more content. Multiple paragraphs help test paragraph counting and line counting features. \ No newline at end of file diff --git a/tests/fixtures/content_test_files/tailmatter_only.md b/tests/fixtures/content_test_files/tailmatter_only.md new file mode 100644 index 00000000..928f27bf --- /dev/null +++ b/tests/fixtures/content_test_files/tailmatter_only.md @@ -0,0 +1,19 @@ +# Tailmatter Only Document + +This document only has tailmatter, no frontmatter. The content should be extracted without the tailmatter block. + +This is a test paragraph. 
"""
Content command tests (Issue #38 - MarkdownMatters CLI implementation).

Covers the content command family:
- markitect content-get --file PATH   - extract content without frontmatter/tailmatter
- markitect content-stats --file PATH - content statistics

The tests exercise the parser directly, the click commands through
CliRunner, and the ContentStats data class.
"""

import pytest
import tempfile
import os
from pathlib import Path
from click.testing import CliRunner

from markitect.content.parser import ContentParser
from markitect.content.stats import ContentStats
from markitect.content.commands import content_get, content_stats


@pytest.fixture
def test_files_dir():
    """Path to test fixture files."""
    return Path(__file__).parent / "fixtures" / "content_test_files"


@pytest.fixture
def content_parser():
    """Content parser instance."""
    return ContentParser()


@pytest.fixture
def runner():
    """CLI test runner."""
    return CliRunner()


def _extract(parser, path):
    """Read the fixture at *path* as UTF-8 and return its extracted content."""
    return parser.extract_content(path.read_text(encoding="utf-8"))


class TestContentExtraction:
    """Test content extraction without matter zones."""

    def test_content_get_extracts_content_without_frontmatter(self, content_parser, test_files_dir):
        """Test that content extraction removes frontmatter."""
        content = _extract(content_parser, test_files_dir / "frontmatter_only.md")

        # Content should not contain frontmatter delimiters or YAML
        assert "---" not in content
        assert "title:" not in content
        assert "author:" not in content
        assert "date:" not in content

        # Content should contain the actual document content
        assert "# Frontmatter Only Document" in content
        assert "This document only has frontmatter" in content

    def test_content_get_extracts_content_without_tailmatter(self, content_parser, test_files_dir):
        """Test that content extraction removes tailmatter."""
        content = _extract(content_parser, test_files_dir / "tailmatter_only.md")

        # Content should not contain tailmatter blocks
        assert "```yaml tailmatter" not in content
        assert "qa_checklist:" not in content
        assert "editorial:" not in content

        # Content should contain the actual document content
        assert "# Tailmatter Only Document" in content
        assert "This document only has tailmatter" in content

    def test_content_get_extracts_content_without_both_matters(self, content_parser, test_files_dir):
        """Test that content extraction removes both frontmatter and tailmatter."""
        content = _extract(content_parser, test_files_dir / "complete_document.md")

        # Content should not contain any matter zones
        assert content.count("---") <= 1  # Allow a single section divider
        assert "title:" not in content
        assert "```yaml tailmatter" not in content
        assert "qa_checklist:" not in content

        # Content should contain the main document content
        assert "# Complete Test Document" in content
        assert "This is the main content" in content
        assert "## Section 1" in content

    def test_content_get_preserves_contentmatter_inline_metadata(self, content_parser, test_files_dir):
        """Test that contentmatter (MMD key-value pairs) are preserved in content."""
        content = _extract(content_parser, test_files_dir / "contentmatter_inline.md")

        # Contentmatter should be preserved as it's part of the content
        assert "Author: Jane Smith" in content
        assert "Project: Content Testing" in content
        assert "Keywords: markdown, contentmatter, testing" in content
        assert "Reference: https://example.com/docs" in content

    def test_content_get_handles_file_not_found(self, tmp_path):
        """Test proper error handling for non-existent files."""
        # Go through the command itself so project code is exercised: a path
        # that does not exist must make the command exit with a failure code.
        missing = tmp_path / "non_existent_file.md"

        result = CliRunner().invoke(content_get, ['--file', str(missing)])

        assert result.exit_code != 0


class TestContentStatistics:
    """Test content statistics calculation."""

    def test_content_stats_counts_words_correctly(self, content_parser, test_files_dir):
        """Test accurate word counting in content."""
        content = _extract(content_parser, test_files_dir / "plain_markdown.md")
        stats = content_parser.calculate_stats(content)

        # Should count words in content (exact count depends on test file)
        assert stats.word_count > 0
        assert isinstance(stats.word_count, int)

    def test_content_stats_counts_paragraphs_correctly(self, content_parser, test_files_dir):
        """Test accurate paragraph counting."""
        content = _extract(content_parser, test_files_dir / "plain_markdown.md")
        stats = content_parser.calculate_stats(content)

        # Should count paragraphs (non-empty text blocks)
        assert stats.paragraph_count > 0
        assert isinstance(stats.paragraph_count, int)

    def test_content_stats_counts_lines_correctly(self, content_parser, test_files_dir):
        """Test accurate line counting."""
        content = _extract(content_parser, test_files_dir / "plain_markdown.md")
        stats = content_parser.calculate_stats(content)

        # Should count lines in content
        assert stats.line_count > 0
        assert isinstance(stats.line_count, int)

    def test_content_stats_excludes_frontmatter_from_counts(self, content_parser, test_files_dir):
        """Test that frontmatter is excluded from statistics."""
        content = _extract(content_parser, test_files_dir / "frontmatter_only.md")
        stats = content_parser.calculate_stats(content)

        # Word count should not include frontmatter words
        # This requires manual calculation based on test file content
        assert "title:" not in content
        assert stats.word_count > 0  # Should still have content words

    def test_content_stats_excludes_tailmatter_from_counts(self, content_parser, test_files_dir):
        """Test that tailmatter is excluded from statistics."""
        content = _extract(content_parser, test_files_dir / "tailmatter_only.md")
        stats = content_parser.calculate_stats(content)

        # Word count should not include tailmatter words
        assert "qa_checklist:" not in content
        assert stats.word_count > 0  # Should still have content words

    def test_content_stats_includes_contentmatter_in_counts(self, content_parser, test_files_dir):
        """Test that contentmatter (MMD) is included in statistics."""
        content = _extract(content_parser, test_files_dir / "contentmatter_inline.md")
        stats = content_parser.calculate_stats(content)

        # Should include contentmatter key-value pairs in word count
        assert "Author: Jane Smith" in content
        assert stats.word_count > 10  # Should include contentmatter words


class TestCLIIntegration:
    """Test CLI command integration."""

    def test_content_get_cli_command_works(self, runner, test_files_dir):
        """Test that content-get CLI command executes successfully."""
        file_path = test_files_dir / "plain_markdown.md"

        result = runner.invoke(content_get, ['--file', str(file_path)])

        assert result.exit_code == 0
        assert "Plain Markdown Document" in result.output
        # Should not contain frontmatter/tailmatter markers
        assert result.output.count("---") <= 1

    def test_content_stats_cli_command_works(self, runner, test_files_dir):
        """Test that content-stats CLI command executes successfully."""
        file_path = test_files_dir / "plain_markdown.md"

        result = runner.invoke(content_stats, ['--file', str(file_path)])

        assert result.exit_code == 0
        assert "word_count" in result.output
        assert "line_count" in result.output
        assert "paragraph_count" in result.output

    def test_content_commands_help_text_available(self, runner):
        """Test that help text is available for content commands."""
        # Test content-get help
        result = runner.invoke(content_get, ['--help'])
        assert result.exit_code == 0
        assert "Extract content without frontmatter and tailmatter" in result.output

        # Test content-stats help
        result = runner.invoke(content_stats, ['--help'])
        assert result.exit_code == 0
        assert "Calculate content statistics" in result.output


class TestContentStats:
    """Test ContentStats data class."""

    def test_content_stats_creation(self):
        """Test ContentStats object creation."""
        stats = ContentStats(
            word_count=100,
            line_count=20,
            paragraph_count=5,
            character_count=500
        )

        assert stats.word_count == 100
        assert stats.line_count == 20
        assert stats.paragraph_count == 5
        assert stats.character_count == 500

    def test_content_stats_to_dict(self):
        """Test ContentStats conversion to dictionary."""
        stats = ContentStats(
            word_count=100,
            line_count=20,
            paragraph_count=5,
            character_count=500
        )

        stats_dict = stats.to_dict()

        assert stats_dict == {
            "word_count": 100,
            "line_count": 20,
            "paragraph_count": 5,
            "character_count": 500
        }