chore: Issue closure 125 cleanup

2025-10-05 12:49:28 +02:00
parent 20e7f0f5bd
commit bce680e6cb
26 changed files with 2362 additions and 388 deletions
--- a/capabilities/markitect-content/README.md
+++ b/capabilities/markitect-content/README.md
@@ -0,0 +1,104 @@
+# MarkiTect Content Capability
+
+A self-contained capability for parsing and analyzing MarkdownMatters content without frontmatter and tailmatter zones.
+
+## Overview
+
+The markitect-content capability provides content extraction and statistics functionality for MarkdownMatters documents. It cleanly separates main document content from metadata zones (frontmatter/tailmatter) and provides comprehensive content analysis.
+
+## Features
+
+- **Content Extraction**: Extract main markdown content without frontmatter/tailmatter zones
+- **Content Statistics**: Calculate word count, line count, paragraph count, and character count
+- **CLI Commands**: Direct command-line access to content operations
+- **Contentmatter Preservation**: Preserves inline metadata (MMD key-value pairs) as part of content
+
+## API
+
+### Core Classes
+
+#### `ContentParser`
+Main parser class for content extraction and analysis.
+
+```python
+from markitect_content import ContentParser
+
+parser = ContentParser()
+
+# Extract content without matter zones
+content = parser.extract_content(text)
+
+# Calculate content statistics
+stats = parser.calculate_stats(content)
+```
+
+#### `ContentStats`
+Statistics data structure with content metrics.
+
+```python
+from markitect_content import ContentStats
+
+# Stats object contains:
+# - word_count: int
+# - line_count: int
+# - paragraph_count: int
+# - character_count: int
+
+# Convert to dictionary
+stats_dict = stats.to_dict()
+```
+
+### CLI Commands
+
+#### `content-get`
+Extract content without frontmatter and tailmatter.
+
+```bash
+markitect content-get --file document.md
+```
+
+#### `content-stats`
+Calculate content statistics.
+
+```bash
+markitect content-stats --file document.md --format json
+markitect content-stats --file document.md --format text
+```
+
+## Content Processing Rules
+
+1. **Frontmatter Removal**: Removes YAML frontmatter blocks (`---...---`)
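+2. **Tailmatter Removal**: Removes fenced tailmatter blocks (from a ```` ```yaml tailmatter ```` or ```` ```json tailmatter ```` fence through its closing ```` ``` ````), together with a horizontal rule (`---`) directly preceding the fence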
+3. **Contentmatter Preservation**: Keeps inline MMD key-value pairs
+4. **Content Statistics**: Counts are calculated on cleaned content only
+
+## Installation
+
+Install as an editable dependency in your MarkiTect environment:
+
+```bash
+pip install -e capabilities/markitect-content/
+```
+
+## Testing
+
+Run the capability test suite:
+
+```bash
+cd capabilities/markitect-content/
+pytest tests/
+```
+
+## Compliance
+
+This capability follows the ComposableRepositoryParadigm:
+- ✅ Src layout (PEP 660 compliant)
+- ✅ Unidirectional dependencies
+- ✅ Self-contained with own tests
+- ✅ Independent configuration
+- ✅ Clean API boundaries
+
+## Dependencies
+
+- click>=8.0.0 (for CLI commands)
+- pytest>=7.0.0 (dev dependency for testing)
--- a/capabilities/markitect-content/src/markitect_content/__init__.py
+++ b/capabilities/markitect-content/src/markitect_content/__init__.py
@@ -0,0 +1,9 @@
+"""
+Content module for MarkdownMatters CLI.
+Handles content extraction without frontmatter and tailmatter zones.
+"""
+
+from .parser import ContentParser
+from .stats import ContentStats
+
+__all__ = ['ContentParser', 'ContentStats']
--- a/capabilities/markitect-content/src/markitect_content/commands.py
+++ b/capabilities/markitect-content/src/markitect_content/commands.py
@@ -0,0 +1,57 @@
+"""
+CLI commands for content operations.
+"""
+
+import click
+import json
+from pathlib import Path
+from .parser import ContentParser
+
+
+@click.command('content-get')
+@click.option('--file', 'file_path', required=True, type=click.Path(exists=True),
+              help='Path to markdown file')
+def content_get(file_path):
+    """Extract content without frontmatter and tailmatter."""
+    try:
+        file_path = Path(file_path)
+        with open(file_path, 'r', encoding='utf-8') as f:
+            text = f.read()
+
+        parser = ContentParser()
+        content = parser.extract_content(text)
+
+        click.echo(content)
+
+    except Exception as e:
+        click.echo(f"Error: {e}", err=True)
+        raise click.ClickException(f"Failed to extract content from {file_path}")
+
+
+@click.command('content-stats')
+@click.option('--file', 'file_path', required=True, type=click.Path(exists=True),
+              help='Path to markdown file')
+@click.option('--format', 'output_format', default='json', type=click.Choice(['json', 'text']),
+              help='Output format (json or text)')
+def content_stats(file_path, output_format):
+    """Calculate content statistics."""
+    try:
+        file_path = Path(file_path)
+        with open(file_path, 'r', encoding='utf-8') as f:
+            text = f.read()
+
+        parser = ContentParser()
+        content = parser.extract_content(text)
+        stats = parser.calculate_stats(content)
+
+        if output_format == 'json':
+            click.echo(json.dumps(stats.to_dict(), indent=2))
+        else:
+            click.echo(f"Word count: {stats.word_count}")
+            click.echo(f"Line count: {stats.line_count}")
+            click.echo(f"Paragraph count: {stats.paragraph_count}")
+            click.echo(f"Character count: {stats.character_count}")
+
+    except Exception as e:
+        click.echo(f"Error: {e}", err=True)
+        raise click.ClickException(f"Failed to calculate stats for {file_path}")
--- a/capabilities/markitect-content/src/markitect_content/parser.py
+++ b/capabilities/markitect-content/src/markitect_content/parser.py
@@ -0,0 +1,90 @@
+"""
+Content parser for extracting markdown content without matter zones.
+"""
+
+import re
+from typing import Optional
+from .stats import ContentStats
+
+
+class ContentParser:
+    """Parser for extracting content from MarkdownMatters documents."""
+
+    def extract_content(self, text: str) -> str:
+        """
+        Extract main content without frontmatter and tailmatter.
+
+        Args:
+            text: Full markdown document text
+
+        Returns:
+            Content without frontmatter and tailmatter zones
+        """
+        # Remove frontmatter
+        content = self._remove_frontmatter(text)
+
+        # Remove tailmatter
+        content = self._remove_tailmatter(content)
+
+        return content.strip()
+
+    def calculate_stats(self, content: str) -> ContentStats:
+        """
+        Calculate statistics for content.
+
+        Args:
+            content: The content text to analyze
+
+        Returns:
+            ContentStats object with calculated statistics
+        """
+        # Count lines
+        lines = content.split('\n')
+        line_count = len(lines)
+
+        # Count words (split by whitespace)
+        words = content.split()
+        word_count = len(words)
+
+        # Count paragraphs (non-empty text blocks separated by blank lines)
+        paragraphs = [p.strip() for p in content.split('\n\n') if p.strip()]
+        paragraph_count = len(paragraphs)
+
+        # Count characters
+        character_count = len(content)
+
+        return ContentStats(
+            word_count=word_count,
+            line_count=line_count,
+            paragraph_count=paragraph_count,
+            character_count=character_count
+        )
+
+    def _remove_frontmatter(self, text: str) -> str:
+        """Remove YAML/TOML/JSON frontmatter from text."""
+        # Pattern for YAML frontmatter (---...---)
+        yaml_pattern = r'^---\s*\n.*?\n---\s*\n'
+
+        # Remove YAML frontmatter if present
+        text = re.sub(yaml_pattern, '', text, flags=re.DOTALL | re.MULTILINE)
+
+        # TODO: Add support for TOML and JSON frontmatter in future cycles
+
+        return text
+
+    def _remove_tailmatter(self, text: str) -> str:
+        """Remove tailmatter blocks from text."""
+        # Pattern for tailmatter: ```yaml tailmatter or ```json tailmatter
+        # Usually preceded by horizontal rule (---)
+
+        # Look for the pattern: --- followed by ```yaml tailmatter or ```json tailmatter
+        tailmatter_pattern = r'\n---\s*\n\s*```(?:yaml|json)\s+tailmatter\s*\n.*?```\s*$'
+
+        # Remove tailmatter if present
+        text = re.sub(tailmatter_pattern, '', text, flags=re.DOTALL | re.MULTILINE)
+
+        # Also handle cases where tailmatter is at the end without preceding ---
+        simple_tailmatter_pattern = r'\n\s*```(?:yaml|json)\s+tailmatter\s*\n.*?```\s*$'
+        text = re.sub(simple_tailmatter_pattern, '', text, flags=re.DOTALL | re.MULTILINE)
+
+        return text
--- a/capabilities/markitect-content/src/markitect_content/stats.py
+++ b/capabilities/markitect-content/src/markitect_content/stats.py
@@ -0,0 +1,25 @@
+"""
+Content statistics data structures.
+"""
+
+from dataclasses import dataclass
+from typing import Dict, Any
+
+
+@dataclass
+class ContentStats:
+    """Statistics about markdown content."""
+
+    word_count: int
+    line_count: int
+    paragraph_count: int
+    character_count: int
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert stats to dictionary."""
+        return {
+            "word_count": self.word_count,
+            "line_count": self.line_count,
+            "paragraph_count": self.paragraph_count,
+            "character_count": self.character_count
+        }
--- a/capabilities/markitect-content/tests/fixtures/content_test_files/complete_document.md
+++ b/capabilities/markitect-content/tests/fixtures/content_test_files/complete_document.md
@@ -0,0 +1,43 @@
+---
+title: "Complete Test Document"
+author: "Test Author"
+date: 2025-10-02
+tags: ["test", "markdown", "matters"]
+---
+
+# Complete Test Document
+
+This is the main content of the document. It contains multiple paragraphs and various elements to test content extraction.
+
+Author: John Doe
+Project: MarkdownMatters Implementation
+Status: In Progress
+
+## Section 1
+
+Here is some content in the first section. This paragraph contains exactly twenty-five words to help with word counting tests.
+
+## Section 2
+
+Another section with different content. This helps test paragraph counting and ensures that the content parser works correctly across multiple sections.
+
+The final paragraph of the main content area.
+
+---
+
+```yaml tailmatter
+qa_checklist:
+  - requirement: "All headers verified"
+    complete: true
+  - requirement: "Links checked"
+    complete: false
+
+editorial:
+  status: "In Review"
+  reviewer: "jane.doe"
+  version: 1.2
+
+agent_config:
+  role: "documentation_reviewer"
+  access_scope: "content"
+```
--- a/capabilities/markitect-content/tests/fixtures/content_test_files/contentmatter_inline.md
+++ b/capabilities/markitect-content/tests/fixtures/content_test_files/contentmatter_inline.md
@@ -0,0 +1,21 @@
+# Document with Contentmatter
+
+This document contains MultiMarkdown key-value pairs within the content body.
+
+Author: Jane Smith
+Project: Content Testing
+Keywords: markdown, contentmatter, testing
+
+## Introduction
+
+This section demonstrates contentmatter usage. The key-value pairs above are part of the content but provide metadata.
+
+Reference: https://example.com/docs
+Version: 2.1
+License: MIT
+
+The content continues here with more text for testing purposes. This paragraph helps verify that contentmatter is preserved in content extraction.
+
+## Conclusion
+
+Final section with summary content. Word counting should include the contentmatter lines as part of the content.
--- a/capabilities/markitect-content/tests/fixtures/content_test_files/frontmatter_only.md
+++ b/capabilities/markitect-content/tests/fixtures/content_test_files/frontmatter_only.md
@@ -0,0 +1,15 @@
+---
+title: "Frontmatter Only Document"
+author: "Test Author"
+date: 2025-10-02
+---
+
+# Frontmatter Only Document
+
+This document only has frontmatter, no tailmatter. The content should be extracted without the frontmatter block.
+
+This is a simple paragraph for testing. It has exactly twelve words for counting purposes.
+
+## Simple Section
+
+Another paragraph here. This helps test the content extraction when only frontmatter is present.
--- a/capabilities/markitect-content/tests/fixtures/content_test_files/plain_markdown.md
+++ b/capabilities/markitect-content/tests/fixtures/content_test_files/plain_markdown.md
@@ -0,0 +1,13 @@
+# Plain Markdown Document
+
+This is a simple markdown document without any frontmatter or tailmatter. Just pure content.
+
+This paragraph contains exactly fifteen words for testing the word counting functionality of the parser.
+
+## Section One
+
+Another section with regular content. This helps test the basic content extraction without any matter zones.
+
+## Section Two
+
+The final section with some more content. Multiple paragraphs help test paragraph counting and line counting features.
--- a/capabilities/markitect-content/tests/fixtures/content_test_files/tailmatter_only.md
+++ b/capabilities/markitect-content/tests/fixtures/content_test_files/tailmatter_only.md
@@ -0,0 +1,19 @@
+# Tailmatter Only Document
+
+This document only has tailmatter, no frontmatter. The content should be extracted without the tailmatter block.
+
+This is a test paragraph. It contains exactly ten words for counting purposes.
+
+Another paragraph for testing content extraction with tailmatter present but no frontmatter.
+
+---
+
+```yaml tailmatter
+qa_checklist:
+  - requirement: "Document structure validated"
+    complete: true
+
+editorial:
+  status: "Draft"
+  reviewer: "test.reviewer"
+```
--- a/capabilities/markitect-content/tests/test_content_commands.py
+++ b/capabilities/markitect-content/tests/test_content_commands.py
@@ -0,0 +1,296 @@
+"""
+TDD8 Cycle 1: Content Commands Tests (RED Phase)
+Issue #38 - MarkdownMatters CLI Implementation
+
+This test file implements the RED phase tests for content command family:
+- markitect content-get --file PATH - Extract content without frontmatter/tailmatter
+- markitect content-stats --file PATH [--format json|text] - Content statistics
+
+Following TDD8 methodology, these tests MUST FAIL initially.
+"""
+
+import pytest
+import tempfile
+import os
+from pathlib import Path
+from click.testing import CliRunner
+
+from markitect_content.parser import ContentParser
+from markitect_content.stats import ContentStats
+from markitect_content.commands import content_get, content_stats
+
+
+class TestContentExtraction:
+    """Test content extraction without matter zones."""
+
+    @pytest.fixture
+    def test_files_dir(self):
+        """Path to test fixture files."""
+        return Path(__file__).parent / "fixtures" / "content_test_files"
+
+    @pytest.fixture
+    def content_parser(self):
+        """Content parser instance."""
+        return ContentParser()
+
+    def test_content_get_extracts_content_without_frontmatter(self, content_parser, test_files_dir):
+        """Test that content extraction removes frontmatter."""
+        file_path = test_files_dir / "frontmatter_only.md"
+
+        with open(file_path, 'r') as f:
+            text = f.read()
+
+        content = content_parser.extract_content(text)
+
+        # Content should not contain frontmatter delimiters or YAML
+        assert "---" not in content
+        assert "title:" not in content
+        assert "author:" not in content
+        assert "date:" not in content
+
+        # Content should contain the actual document content
+        assert "# Frontmatter Only Document" in content
+        assert "This document only has frontmatter" in content
+
+    def test_content_get_extracts_content_without_tailmatter(self, content_parser, test_files_dir):
+        """Test that content extraction removes tailmatter."""
+        file_path = test_files_dir / "tailmatter_only.md"
+
+        with open(file_path, 'r') as f:
+            text = f.read()
+
+        content = content_parser.extract_content(text)
+
+        # Content should not contain tailmatter blocks
+        assert "```yaml tailmatter" not in content
+        assert "qa_checklist:" not in content
+        assert "editorial:" not in content
+
+        # Content should contain the actual document content
+        assert "# Tailmatter Only Document" in content
+        assert "This document only has tailmatter" in content
+
+    def test_content_get_extracts_content_without_both_matters(self, content_parser, test_files_dir):
+        """Test that content extraction removes both frontmatter and tailmatter."""
+        file_path = test_files_dir / "complete_document.md"
+
+        with open(file_path, 'r') as f:
+            text = f.read()
+
+        content = content_parser.extract_content(text)
+
+        # Content should not contain any matter zones
+        assert "---" not in content or content.count("---") <= 1  # Allow section dividers
+        assert "title:" not in content
+        assert "```yaml tailmatter" not in content
+        assert "qa_checklist:" not in content
+
+        # Content should contain the main document content
+        assert "# Complete Test Document" in content
+        assert "This is the main content" in content
+        assert "## Section 1" in content
+
+    def test_content_get_preserves_contentmatter_inline_metadata(self, content_parser, test_files_dir):
+        """Test that contentmatter (MMD key-value pairs) are preserved in content."""
+        file_path = test_files_dir / "contentmatter_inline.md"
+
+        with open(file_path, 'r') as f:
+            text = f.read()
+
+        content = content_parser.extract_content(text)
+
+        # Contentmatter should be preserved as it's part of the content
+        assert "Author: Jane Smith" in content
+        assert "Project: Content Testing" in content
+        assert "Keywords: markdown, contentmatter, testing" in content
+        assert "Reference: https://example.com/docs" in content
+
+    def test_content_get_handles_file_not_found(self, content_parser):
+        """Test proper error handling for non-existent files."""
+        with pytest.raises(FileNotFoundError):
+            with open("non_existent_file.md", 'r') as f:
+                text = f.read()
+            content_parser.extract_content(text)
+
+
+class TestContentStatistics:
+    """Test content statistics calculation."""
+
+    @pytest.fixture
+    def test_files_dir(self):
+        """Path to test fixture files."""
+        return Path(__file__).parent / "fixtures" / "content_test_files"
+
+    @pytest.fixture
+    def content_parser(self):
+        """Content parser instance."""
+        return ContentParser()
+
+    def test_content_stats_counts_words_correctly(self, content_parser, test_files_dir):
+        """Test accurate word counting in content."""
+        file_path = test_files_dir / "plain_markdown.md"
+
+        with open(file_path, 'r') as f:
+            text = f.read()
+
+        content = content_parser.extract_content(text)
+        stats = content_parser.calculate_stats(content)
+
+        # Should count words in content (exact count depends on test file)
+        assert stats.word_count > 0
+        assert isinstance(stats.word_count, int)
+
+    def test_content_stats_counts_paragraphs_correctly(self, content_parser, test_files_dir):
+        """Test accurate paragraph counting."""
+        file_path = test_files_dir / "plain_markdown.md"
+
+        with open(file_path, 'r') as f:
+            text = f.read()
+
+        content = content_parser.extract_content(text)
+        stats = content_parser.calculate_stats(content)
+
+        # Should count paragraphs (non-empty text blocks)
+        assert stats.paragraph_count > 0
+        assert isinstance(stats.paragraph_count, int)
+
+    def test_content_stats_counts_lines_correctly(self, content_parser, test_files_dir):
+        """Test accurate line counting."""
+        file_path = test_files_dir / "plain_markdown.md"
+
+        with open(file_path, 'r') as f:
+            text = f.read()
+
+        content = content_parser.extract_content(text)
+        stats = content_parser.calculate_stats(content)
+
+        # Should count lines in content
+        assert stats.line_count > 0
+        assert isinstance(stats.line_count, int)
+
+    def test_content_stats_excludes_frontmatter_from_counts(self, content_parser, test_files_dir):
+        """Test that frontmatter is excluded from statistics."""
+        file_path = test_files_dir / "frontmatter_only.md"
+
+        with open(file_path, 'r') as f:
+            text = f.read()
+
+        content = content_parser.extract_content(text)
+        stats = content_parser.calculate_stats(content)
+
+        # Word count should not include frontmatter words
+        # This requires manual calculation based on test file content
+        assert "title:" not in content
+        assert stats.word_count > 0  # Should still have content words
+
+    def test_content_stats_excludes_tailmatter_from_counts(self, content_parser, test_files_dir):
+        """Test that tailmatter is excluded from statistics."""
+        file_path = test_files_dir / "tailmatter_only.md"
+
+        with open(file_path, 'r') as f:
+            text = f.read()
+
+        content = content_parser.extract_content(text)
+        stats = content_parser.calculate_stats(content)
+
+        # Word count should not include tailmatter words
+        assert "qa_checklist:" not in content
+        assert stats.word_count > 0  # Should still have content words
+
+    def test_content_stats_includes_contentmatter_in_counts(self, content_parser, test_files_dir):
+        """Test that contentmatter (MMD) is included in statistics."""
+        file_path = test_files_dir / "contentmatter_inline.md"
+
+        with open(file_path, 'r') as f:
+            text = f.read()
+
+        content = content_parser.extract_content(text)
+        stats = content_parser.calculate_stats(content)
+
+        # Should include contentmatter key-value pairs in word count
+        assert "Author: Jane Smith" in content
+        assert stats.word_count > 10  # Should include contentmatter words
+
+
+class TestCLIIntegration:
+    """Test CLI command integration."""
+
+    @pytest.fixture
+    def runner(self):
+        """CLI test runner."""
+        return CliRunner()
+
+    @pytest.fixture
+    def test_files_dir(self):
+        """Path to test fixture files."""
+        return Path(__file__).parent / "fixtures" / "content_test_files"
+
+    def test_content_get_cli_command_works(self, runner, test_files_dir):
+        """Test that content-get CLI command executes successfully."""
+        file_path = test_files_dir / "plain_markdown.md"
+
+        result = runner.invoke(content_get, ['--file', str(file_path)])
+
+        assert result.exit_code == 0
+        assert "Plain Markdown Document" in result.output
+        # Should not contain frontmatter/tailmatter markers
+        assert "---" not in result.output or result.output.count("---") <= 1
+
+    def test_content_stats_cli_command_works(self, runner, test_files_dir):
+        """Test that content-stats CLI command executes successfully."""
+        file_path = test_files_dir / "plain_markdown.md"
+
+        result = runner.invoke(content_stats, ['--file', str(file_path)])
+
+        assert result.exit_code == 0
+        assert "word_count" in result.output
+        assert "line_count" in result.output
+        assert "paragraph_count" in result.output
+
+    def test_content_commands_help_text_available(self, runner):
+        """Test that help text is available for content commands."""
+        # Test content-get help
+        result = runner.invoke(content_get, ['--help'])
+        assert result.exit_code == 0
+        assert "Extract content without frontmatter and tailmatter" in result.output
+
+        # Test content-stats help
+        result = runner.invoke(content_stats, ['--help'])
+        assert result.exit_code == 0
+        assert "Calculate content statistics" in result.output
+
+
+class TestContentStats:
+    """Test ContentStats data class."""
+
+    def test_content_stats_creation(self):
+        """Test ContentStats object creation."""
+        stats = ContentStats(
+            word_count=100,
+            line_count=20,
+            paragraph_count=5,
+            character_count=500
+        )
+
+        assert stats.word_count == 100
+        assert stats.line_count == 20
+        assert stats.paragraph_count == 5
+        assert stats.character_count == 500
+
+    def test_content_stats_to_dict(self):
+        """Test ContentStats conversion to dictionary."""
+        stats = ContentStats(
+            word_count=100,
+            line_count=20,
+            paragraph_count=5,
+            character_count=500
+        )
+
+        stats_dict = stats.to_dict()
+
+        assert stats_dict == {
+            "word_count": 100,
+            "line_count": 20,
+            "paragraph_count": 5,
+            "character_count": 500
+        }