chore: Issue closure 125 cleanup

This commit is contained in:
2025-10-05 12:49:28 +02:00
parent 20e7f0f5bd
commit bce680e6cb
26 changed files with 2362 additions and 388 deletions

View File

@@ -0,0 +1,104 @@
# MarkiTect Content Capability
A self-contained capability for parsing and analyzing MarkdownMatters content without frontmatter and tailmatter zones.
## Overview
The markitect-content capability provides content extraction and statistics functionality for MarkdownMatters documents. It cleanly separates main document content from metadata zones (frontmatter/tailmatter) and provides comprehensive content analysis.
## Features
- **Content Extraction**: Extract main markdown content without frontmatter/tailmatter zones
- **Content Statistics**: Calculate word count, line count, paragraph count, and character count
- **CLI Commands**: Direct command-line access to content operations
- **Contentmatter Preservation**: Preserves inline metadata (MMD key-value pairs) as part of content
## API
### Core Classes
#### `ContentParser`
Main parser class for content extraction and analysis.
```python
from markitect_content import ContentParser
parser = ContentParser()
# Extract content without matter zones
content = parser.extract_content(text)
# Calculate content statistics
stats = parser.calculate_stats(content)
```
#### `ContentStats`
Statistics data structure with content metrics.
```python
from markitect_content import ContentStats
# Stats object contains:
# - word_count: int
# - line_count: int
# - paragraph_count: int
# - character_count: int
# Convert to dictionary
stats_dict = stats.to_dict()
```
### CLI Commands
#### `content-get`
Extract content without frontmatter and tailmatter.
```bash
markitect content-get --file document.md
```
#### `content-stats`
Calculate content statistics.
```bash
markitect content-stats --file document.md --format json
markitect content-stats --file document.md --format text
```
## Content Processing Rules
1. **Frontmatter Removal**: Removes YAML frontmatter blocks (`---...---`)
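2. **Tailmatter Removal**: Removes fenced tailmatter blocks (```` ```yaml tailmatter ... ``` ```` or ```` ```json tailmatter ... ``` ````), together with the `---` rule that precedes them, if any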
3. **Contentmatter Preservation**: Keeps inline MMD key-value pairs
4. **Content Statistics**: Counts are calculated on cleaned content only
## Installation
Install as an editable dependency in your MarkiTect environment:
```bash
pip install -e capabilities/markitect-content/
```
## Testing
Run the capability test suite:
```bash
cd capabilities/markitect-content/
pytest tests/
```
## Compliance
This capability follows the ComposableRepositoryParadigm:
- ✅ Src layout (works with PEP 660 editable installs)
- ✅ Unidirectional dependencies
- ✅ Self-contained with own tests
- ✅ Independent configuration
- ✅ Clean API boundaries
## Dependencies
- click>=8.0.0 (for CLI commands)
- pytest>=7.0.0 (dev dependency for testing)

View File

@@ -0,0 +1,9 @@
"""
Content module for MarkdownMatters CLI.
Handles content extraction without frontmatter and tailmatter zones.
"""
from .parser import ContentParser
from .stats import ContentStats
__all__ = ['ContentParser', 'ContentStats']

View File

@@ -0,0 +1,57 @@
"""
CLI commands for content operations.
"""
import click
import json
from pathlib import Path
from .parser import ContentParser
@click.command('content-get')
@click.option('--file', 'file_path', required=True, type=click.Path(exists=True),
              help='Path to markdown file')
def content_get(file_path):
    """Extract content without frontmatter and tailmatter."""
    # NOTE: click shows the docstring above as this command's --help text,
    # so implementation notes are kept in comments rather than added to it.
    #
    # Reads the file as UTF-8, strips the matter zones with ContentParser and
    # echoes the remaining content to stdout. Any failure is reported on
    # stderr and converted into a ClickException (non-zero exit status).
    try:
        file_path = Path(file_path)
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        content = ContentParser().extract_content(text)
        click.echo(content)
    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        # Chain the original exception so the root cause is not lost.
        raise click.ClickException(
            f"Failed to extract content from {file_path}"
        ) from e
@click.command('content-stats')
@click.option('--file', 'file_path', required=True, type=click.Path(exists=True),
              help='Path to markdown file')
@click.option('--format', 'output_format', default='json', type=click.Choice(['json', 'text']),
              help='Output format (json or text)')
def content_stats(file_path, output_format):
    """Calculate content statistics."""
    # NOTE: click shows the docstring above as this command's --help text,
    # so implementation notes are kept in comments rather than added to it.
    #
    # Reads the file as UTF-8, strips the matter zones, computes the
    # statistics and prints them either as indented JSON or as one
    # "Label: value" line per metric. Any failure is reported on stderr and
    # converted into a ClickException (non-zero exit status).
    try:
        file_path = Path(file_path)
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        parser = ContentParser()
        stats = parser.calculate_stats(parser.extract_content(text))
        if output_format == 'json':
            click.echo(json.dumps(stats.to_dict(), indent=2))
        else:
            click.echo(f"Word count: {stats.word_count}")
            click.echo(f"Line count: {stats.line_count}")
            click.echo(f"Paragraph count: {stats.paragraph_count}")
            click.echo(f"Character count: {stats.character_count}")
    except Exception as e:
        click.echo(f"Error: {e}", err=True)
        # Chain the original exception so the root cause is not lost.
        raise click.ClickException(
            f"Failed to calculate stats for {file_path}"
        ) from e

View File

@@ -0,0 +1,90 @@
"""
Content parser for extracting markdown content without matter zones.
"""
import re
from typing import Optional
from .stats import ContentStats
class ContentParser:
    """Parser for extracting content from MarkdownMatters documents.

    The parser strips the two metadata zones of a document (a leading YAML
    frontmatter block and a fenced ``tailmatter`` block) and computes simple
    statistics on what remains.
    """

    # Frontmatter is only recognised at the very start of the document
    # (optionally after blank lines). Anchoring with \A instead of a
    # MULTILINE ^ keeps horizontal rules (---) inside the body from being
    # mistaken for frontmatter delimiters. ``[^\S\n]`` is "whitespace other
    # than a newline", so trailing spaces and ``\r`` are tolerated.
    _FRONTMATTER_RE = re.compile(
        r'\A(?:[^\S\n]*\n)*---[^\S\n]*\n(?:.*?\n)?---[^\S\n]*(?:\n|\Z)',
        re.DOTALL,
    )

    # Tailmatter fence preceded by a horizontal rule (---).
    _RULED_TAILMATTER_RE = re.compile(
        r'\n---\s*\n\s*```(?:yaml|json)\s+tailmatter\s*\n.*?```\s*$',
        re.DOTALL | re.MULTILINE,
    )

    # Tailmatter fence without a preceding horizontal rule.
    _BARE_TAILMATTER_RE = re.compile(
        r'\n\s*```(?:yaml|json)\s+tailmatter\s*\n.*?```\s*$',
        re.DOTALL | re.MULTILINE,
    )

    # A paragraph break: a newline, optional whitespace, another newline.
    _PARAGRAPH_BREAK_RE = re.compile(r'\n\s*\n')

    def extract_content(self, text: str) -> str:
        """
        Extract main content without frontmatter and tailmatter.

        Args:
            text: Full markdown document text

        Returns:
            Content without frontmatter and tailmatter zones, with leading
            and trailing whitespace stripped
        """
        without_front = self._remove_frontmatter(text)
        without_matter = self._remove_tailmatter(without_front)
        return without_matter.strip()

    def calculate_stats(self, content: str) -> "ContentStats":
        """
        Calculate statistics for content.

        Args:
            content: The content text to analyze

        Returns:
            ContentStats object with calculated statistics. Empty content
            yields zero for every counter.
        """
        # ''.split('\n') returns [''], which would report one line for an
        # empty document, so the empty case is handled explicitly.
        line_count = len(content.split('\n')) if content else 0

        # Words are whitespace-separated tokens.
        word_count = len(content.split())

        # Paragraphs are non-empty blocks separated by blank lines; a blank
        # line may itself contain whitespace (spaces, tabs, '\r').
        paragraph_count = sum(
            1
            for block in self._PARAGRAPH_BREAK_RE.split(content)
            if block.strip()
        )

        return ContentStats(
            word_count=word_count,
            line_count=line_count,
            paragraph_count=paragraph_count,
            character_count=len(content),
        )

    def _remove_frontmatter(self, text: str) -> str:
        """Remove a leading YAML frontmatter block (---...---) from text.

        Only a block at the start of the document is removed; ``---`` rules
        further down are left untouched. An empty block (``---`` directly
        followed by ``---``) and a closing delimiter at end of input are
        both accepted.
        """
        # TODO: Add support for TOML and JSON frontmatter in future cycles
        return self._FRONTMATTER_RE.sub('', text, count=1)

    def _remove_tailmatter(self, text: str) -> str:
        """Remove tailmatter blocks from text.

        A tailmatter block is a code fence opened with ```yaml tailmatter
        or ```json tailmatter. The variant preceded by a horizontal rule
        (---) is removed first so the rule goes with it; any remaining bare
        fences are removed afterwards.
        """
        text = self._RULED_TAILMATTER_RE.sub('', text)
        return self._BARE_TAILMATTER_RE.sub('', text)

View File

@@ -0,0 +1,25 @@
"""
Content statistics data structures.
"""
from dataclasses import dataclass
from typing import Dict, Any
@dataclass
class ContentStats:
    """Container for the counters computed over markdown content."""

    word_count: int        # number of words
    line_count: int        # number of lines
    paragraph_count: int   # number of paragraphs
    character_count: int   # number of characters

    def to_dict(self) -> Dict[str, Any]:
        """Return the four counters as a plain dict keyed by field name."""
        return dict(
            word_count=self.word_count,
            line_count=self.line_count,
            paragraph_count=self.paragraph_count,
            character_count=self.character_count,
        )

View File

@@ -0,0 +1,43 @@
---
title: "Complete Test Document"
author: "Test Author"
date: 2025-10-02
tags: ["test", "markdown", "matters"]
---
# Complete Test Document
This is the main content of the document. It contains multiple paragraphs and various elements to test content extraction.
Author: John Doe
Project: MarkdownMatters Implementation
Status: In Progress
## Section 1
Here is some content in the first section. This paragraph contains exactly twenty-five words to help with word counting tests.
## Section 2
Another section with different content. This helps test paragraph counting and ensures that the content parser works correctly across multiple sections.
The final paragraph of the main content area.
---
```yaml tailmatter
qa_checklist:
- requirement: "All headers verified"
complete: true
- requirement: "Links checked"
complete: false
editorial:
status: "In Review"
reviewer: "jane.doe"
version: 1.2
agent_config:
role: "documentation_reviewer"
access_scope: "content"
```

View File

@@ -0,0 +1,21 @@
# Document with Contentmatter
This document contains MultiMarkdown key-value pairs within the content body.
Author: Jane Smith
Project: Content Testing
Keywords: markdown, contentmatter, testing
## Introduction
This section demonstrates contentmatter usage. The key-value pairs above are part of the content but provide metadata.
Reference: https://example.com/docs
Version: 2.1
License: MIT
The content continues here with more text for testing purposes. This paragraph helps verify that contentmatter is preserved in content extraction.
## Conclusion
Final section with summary content. Word counting should include the contentmatter lines as part of the content.

View File

@@ -0,0 +1,15 @@
---
title: "Frontmatter Only Document"
author: "Test Author"
date: 2025-10-02
---
# Frontmatter Only Document
This document only has frontmatter, no tailmatter. The content should be extracted without the frontmatter block.
This is a simple paragraph for testing. It has exactly twelve words for counting purposes.
## Simple Section
Another paragraph here. This helps test the content extraction when only frontmatter is present.

View File

@@ -0,0 +1,13 @@
# Plain Markdown Document
This is a simple markdown document without any frontmatter or tailmatter. Just pure content.
This paragraph contains exactly fifteen words for testing the word counting functionality of the parser.
## Section One
Another section with regular content. This helps test the basic content extraction without any matter zones.
## Section Two
The final section with some more content. Multiple paragraphs help test paragraph counting and line counting features.

View File

@@ -0,0 +1,19 @@
# Tailmatter Only Document
This document only has tailmatter, no frontmatter. The content should be extracted without the tailmatter block.
This is a test paragraph. It contains exactly ten words for counting purposes.
Another paragraph for testing content extraction with tailmatter present but no frontmatter.
---
```yaml tailmatter
qa_checklist:
- requirement: "Document structure validated"
complete: true
editorial:
status: "Draft"
reviewer: "test.reviewer"
```

View File

@@ -0,0 +1,296 @@
"""
TDD8 Cycle 1: Content Commands Tests (RED Phase)
Issue #38 - MarkdownMatters CLI Implementation
This test file implements the RED phase tests for content command family:
- markitect content-get [path] - Extract content without frontmatter/tailmatter
- markitect content-stats [path] - Content statistics
Following TDD8 methodology, these tests MUST FAIL initially.
"""
import pytest
import tempfile
import os
from pathlib import Path
from click.testing import CliRunner
from markitect_content.parser import ContentParser
from markitect_content.stats import ContentStats
from markitect_content.commands import content_get, content_stats
class TestContentExtraction:
    """Test content extraction without matter zones."""

    @pytest.fixture
    def test_files_dir(self):
        """Path to test fixture files."""
        return Path(__file__).parent / "fixtures" / "content_test_files"

    @pytest.fixture
    def content_parser(self):
        """Content parser instance."""
        return ContentParser()

    def test_content_get_extracts_content_without_frontmatter(self, content_parser, test_files_dir):
        """Test that content extraction removes frontmatter."""
        # Fixtures are read as UTF-8, the same encoding the CLI commands use.
        text = (test_files_dir / "frontmatter_only.md").read_text(encoding='utf-8')
        content = content_parser.extract_content(text)

        # Content should not contain frontmatter delimiters or YAML
        assert "---" not in content
        assert "title:" not in content
        assert "author:" not in content
        assert "date:" not in content

        # Content should contain the actual document content
        assert "# Frontmatter Only Document" in content
        assert "This document only has frontmatter" in content

    def test_content_get_extracts_content_without_tailmatter(self, content_parser, test_files_dir):
        """Test that content extraction removes tailmatter."""
        text = (test_files_dir / "tailmatter_only.md").read_text(encoding='utf-8')
        content = content_parser.extract_content(text)

        # Content should not contain tailmatter blocks
        assert "```yaml tailmatter" not in content
        assert "qa_checklist:" not in content
        assert "editorial:" not in content

        # Content should contain the actual document content
        assert "# Tailmatter Only Document" in content
        assert "This document only has tailmatter" in content

    def test_content_get_extracts_content_without_both_matters(self, content_parser, test_files_dir):
        """Test that content extraction removes both frontmatter and tailmatter."""
        text = (test_files_dir / "complete_document.md").read_text(encoding='utf-8')
        content = content_parser.extract_content(text)

        # Content should not contain any matter zones; at most one "---"
        # is tolerated to allow for a section divider.
        assert content.count("---") <= 1
        assert "title:" not in content
        assert "```yaml tailmatter" not in content
        assert "qa_checklist:" not in content

        # Content should contain the main document content
        assert "# Complete Test Document" in content
        assert "This is the main content" in content
        assert "## Section 1" in content

    def test_content_get_preserves_contentmatter_inline_metadata(self, content_parser, test_files_dir):
        """Test that contentmatter (MMD key-value pairs) are preserved in content."""
        text = (test_files_dir / "contentmatter_inline.md").read_text(encoding='utf-8')
        content = content_parser.extract_content(text)

        # Contentmatter should be preserved as it's part of the content
        assert "Author: Jane Smith" in content
        assert "Project: Content Testing" in content
        assert "Keywords: markdown, contentmatter, testing" in content
        assert "Reference: https://example.com/docs" in content

    def test_content_get_handles_file_not_found(self, tmp_path):
        """Test proper error handling for non-existent files."""
        # The parser itself never touches the filesystem, so the missing-file
        # case is exercised through the content-get command, which is what
        # actually opens the file.
        missing = tmp_path / "non_existent_file.md"
        result = CliRunner().invoke(content_get, ['--file', str(missing)])
        assert result.exit_code != 0
class TestContentStatistics:
    """Test content statistics calculation."""

    @pytest.fixture
    def test_files_dir(self):
        """Path to test fixture files."""
        return Path(__file__).parent / "fixtures" / "content_test_files"

    @pytest.fixture
    def content_parser(self):
        """Content parser instance."""
        return ContentParser()

    @staticmethod
    def _extract_and_measure(parser, path):
        """Return ``(content, stats)`` for the fixture file at *path*.

        The file is read as UTF-8, the same encoding the CLI commands use,
        so results do not depend on the platform's default encoding.
        """
        content = parser.extract_content(path.read_text(encoding='utf-8'))
        return content, parser.calculate_stats(content)

    def test_content_stats_counts_words_correctly(self, content_parser, test_files_dir):
        """Test accurate word counting in content."""
        _, stats = self._extract_and_measure(
            content_parser, test_files_dir / "plain_markdown.md"
        )

        # Should count words in content (exact count depends on test file)
        assert stats.word_count > 0
        assert isinstance(stats.word_count, int)

    def test_content_stats_counts_paragraphs_correctly(self, content_parser, test_files_dir):
        """Test accurate paragraph counting."""
        _, stats = self._extract_and_measure(
            content_parser, test_files_dir / "plain_markdown.md"
        )

        # Should count paragraphs (non-empty text blocks)
        assert stats.paragraph_count > 0
        assert isinstance(stats.paragraph_count, int)

    def test_content_stats_counts_lines_correctly(self, content_parser, test_files_dir):
        """Test accurate line counting."""
        _, stats = self._extract_and_measure(
            content_parser, test_files_dir / "plain_markdown.md"
        )

        # Should count lines in content
        assert stats.line_count > 0
        assert isinstance(stats.line_count, int)

    def test_content_stats_excludes_frontmatter_from_counts(self, content_parser, test_files_dir):
        """Test that frontmatter is excluded from statistics."""
        content, stats = self._extract_and_measure(
            content_parser, test_files_dir / "frontmatter_only.md"
        )

        # Word count should not include frontmatter words
        # This requires manual calculation based on test file content
        assert "title:" not in content
        assert stats.word_count > 0  # Should still have content words

    def test_content_stats_excludes_tailmatter_from_counts(self, content_parser, test_files_dir):
        """Test that tailmatter is excluded from statistics."""
        content, stats = self._extract_and_measure(
            content_parser, test_files_dir / "tailmatter_only.md"
        )

        # Word count should not include tailmatter words
        assert "qa_checklist:" not in content
        assert stats.word_count > 0  # Should still have content words

    def test_content_stats_includes_contentmatter_in_counts(self, content_parser, test_files_dir):
        """Test that contentmatter (MMD) is included in statistics."""
        content, stats = self._extract_and_measure(
            content_parser, test_files_dir / "contentmatter_inline.md"
        )

        # Should include contentmatter key-value pairs in word count
        assert "Author: Jane Smith" in content
        assert stats.word_count > 10  # Should include contentmatter words
class TestCLIIntegration:
    """Exercise the content commands end to end through click's test runner."""

    @pytest.fixture
    def runner(self):
        """Provide a fresh CliRunner for each test."""
        return CliRunner()

    @pytest.fixture
    def test_files_dir(self):
        """Directory that holds the markdown fixture files."""
        return Path(__file__).parent / "fixtures" / "content_test_files"

    def test_content_get_cli_command_works(self, runner, test_files_dir):
        """content-get exits cleanly and prints the document body."""
        target = str(test_files_dir / "plain_markdown.md")
        outcome = runner.invoke(content_get, ['--file', target])

        assert outcome.exit_code == 0
        assert "Plain Markdown Document" in outcome.output
        # No matter-zone markers expected; a single "---" is tolerated.
        assert outcome.output.count("---") <= 1

    def test_content_stats_cli_command_works(self, runner, test_files_dir):
        """content-stats exits cleanly and reports the expected keys."""
        target = str(test_files_dir / "plain_markdown.md")
        outcome = runner.invoke(content_stats, ['--file', target])

        assert outcome.exit_code == 0
        for key in ("word_count", "line_count", "paragraph_count"):
            assert key in outcome.output

    def test_content_commands_help_text_available(self, runner):
        """Both commands answer --help with their summary line."""
        expectations = (
            (content_get, "Extract content without frontmatter and tailmatter"),
            (content_stats, "Calculate content statistics"),
        )
        for command, summary in expectations:
            outcome = runner.invoke(command, ['--help'])
            assert outcome.exit_code == 0
            assert summary in outcome.output
class TestContentStats:
    """Checks for the ContentStats data class itself."""

    def test_content_stats_creation(self):
        """Every constructor argument is stored on the matching attribute."""
        values = {
            "word_count": 100,
            "line_count": 20,
            "paragraph_count": 5,
            "character_count": 500,
        }
        stats = ContentStats(**values)

        for field_name, expected in values.items():
            assert getattr(stats, field_name) == expected

    def test_content_stats_to_dict(self):
        """to_dict() returns exactly the values the object was built with."""
        values = {
            "word_count": 100,
            "line_count": 20,
            "paragraph_count": 5,
            "character_count": 500,
        }

        assert ContentStats(**values).to_dict() == values