# markitect-main/tests/test_issue_139_content_aggregation.py

"""
Test content aggregation functionality for Issue #139: Implode directory to a markdown file.

This test module covers combining content from multiple files in correct order while
preserving all markdown formatting and handling index files appropriately.
"""

import pytest
import tempfile
import shutil
from pathlib import Path
from unittest.mock import Mock, patch

# The implementation may not exist yet (RED phase). Every name falls back to
# None on ImportError so this module can still be imported; each test then
# fails when it tries to call the missing helper.
try:
    from markitect.plugins.builtin.markdown_commands import (
        aggregate_content,
        combine_markdown_files,
        preserve_markdown_formatting,
        handle_index_files,
        process_front_matter,
        ContentAggregator,
        FrontMatterConsolidator,
    )
except ImportError:
    # Expected during RED phase - tests should fail initially
    aggregate_content = combine_markdown_files = None
    preserve_markdown_formatting = handle_index_files = None
    process_front_matter = None
    ContentAggregator = FrontMatterConsolidator = None


class TestContentAggregation:
    """Test aggregating content from multiple markdown files.

    Every test writes its fixture files into a fresh temporary directory
    (created in ``setup_method``, removed in ``teardown_method``) and then
    calls one of the aggregation helpers imported at the top of this module.
    The tests are expected to fail until those helpers are implemented
    (RED phase).
    """

    def setup_method(self):
        """Create a fresh temporary directory for the test about to run."""
        self.temp_dir = Path(tempfile.mkdtemp())

    def teardown_method(self):
        """Remove the temporary directory created by ``setup_method``."""
        if self.temp_dir.exists():
            shutil.rmtree(self.temp_dir)

    def test_combine_simple_markdown_files(self):
        """Combine three numbered files and check content and ordering."""
        sources = {
            "01_intro.md": "# Introduction\nIntro content here.",
            "02_chapter1.md": "## Chapter 1\nChapter content here.",
            "03_conclusion.md": "# Conclusion\nConclusion content.",
        }

        # Dict insertion order is the order the files are handed over in.
        files = []
        for name, text in sources.items():
            path = self.temp_dir / name
            path.write_text(text)
            files.append(path)

        combined_content = combine_markdown_files(files)

        # Every heading must be present before positions are compared,
        # otherwise a -1 from find() could satisfy the ordering check.
        assert "# Introduction" in combined_content
        assert "## Chapter 1" in combined_content
        assert "# Conclusion" in combined_content

        intro_pos = combined_content.find("# Introduction")
        chapter_pos = combined_content.find("## Chapter 1")
        conclusion_pos = combined_content.find("# Conclusion")

        assert intro_pos < chapter_pos < conclusion_pos

    def test_preserve_markdown_formatting(self):
        """Check that common markdown constructs survive aggregation verbatim."""
        markdown_content = """# Test Section

## Subsection with **bold** and *italic*

Here's some code:

```python
def example():
    return "preserved"
```

| Table | Header |
|-------|--------|
| Cell  | Data   |

- List item 1
- List item 2
  - Nested item

> Blockquote text

[Link text](http://example.com)

![Image alt](image.png)
"""

        (self.temp_dir / "formatted.md").write_text(markdown_content)

        preserved = preserve_markdown_formatting([self.temp_dir / "formatted.md"])

        # One representative fragment per formatting construct in the fixture.
        expected_fragments = (
            "**bold**",
            "*italic*",
            "```python",
            "| Table | Header |",
            "- List item 1",
            "> Blockquote text",
            "[Link text]",
            "![Image alt]",
        )
        for fragment in expected_fragments:
            assert fragment in preserved, f"formatting lost: {fragment!r}"

    def test_handle_index_files_as_parent_content(self):
        """Check that nested ``index.md`` files show up in the aggregated text."""
        # Layout: part_1_introduction/index.md
        #         part_1_introduction/chapter_1_overview/{index.md,section_1_1.md}
        part_dir = self.temp_dir / "part_1_introduction"
        part_dir.mkdir()
        (part_dir / "index.md").write_text("# Part 1: Introduction\nPart introduction content.")

        chapter_dir = part_dir / "chapter_1_overview"
        chapter_dir.mkdir()
        (chapter_dir / "index.md").write_text("## Chapter 1: Overview\nChapter overview content.")
        (chapter_dir / "section_1_1.md").write_text("### Section 1.1\nSection content.")

        aggregated = handle_index_files(self.temp_dir)

        # Both index files and the plain section file must be represented.
        assert "# Part 1: Introduction" in aggregated
        assert "Part introduction content." in aggregated
        assert "## Chapter 1: Overview" in aggregated
        assert "Chapter overview content." in aggregated
        assert "### Section 1.1" in aggregated

    def test_maintain_proper_spacing_between_sections(self):
        """Check that at least one line separates consecutive sections."""
        files_content = [
            ("section1.md", "# Section 1\nContent 1"),
            ("section2.md", "# Section 2\nContent 2"),
            ("section3.md", "# Section 3\nContent 3"),
        ]

        files = []
        for filename, content in files_content:
            file_path = self.temp_dir / filename
            file_path.write_text(content)
            files.append(file_path)

        combined = combine_markdown_files(files)
        lines = combined.split('\n')

        # Locate the last line of section 1 and the heading of section 2.
        section1_end = None
        section2_start = None
        for i, line in enumerate(lines):
            if line == "Content 1":
                section1_end = i
            elif line == "# Section 2":
                section2_start = i
                break

        assert section2_start is not None
        assert section1_end is not None
        # A gap of more than one index means at least one line in between.
        assert section2_start > section1_end + 1

    def test_process_files_in_hierarchical_order(self):
        """Check that nested content is emitted in hierarchical order."""
        # (directory relative to temp_dir, filename, file content)
        structure = [
            ("part_1", "index.md", "# Part 1\nPart content"),
            ("part_1/chapter_1", "index.md", "## Chapter 1\nChapter content"),
            ("part_1/chapter_1", "section_1_1.md", "### Section 1.1\nSection content"),
            ("part_1/chapter_1", "section_1_2.md", "### Section 1.2\nMore section content"),
            ("part_1", "chapter_2.md", "## Chapter 2\nChapter 2 content"),
        ]

        for dir_path, filename, content in structure:
            full_dir = self.temp_dir / dir_path
            full_dir.mkdir(parents=True, exist_ok=True)
            (full_dir / filename).write_text(content)

        aggregated = aggregate_content(self.temp_dir)

        headings = (
            "# Part 1",
            "## Chapter 1",
            "### Section 1.1",
            "### Section 1.2",
            "## Chapter 2",
        )

        # str.find() returns -1 for a missing substring, and -1 would sit
        # happily at the front of the ordering chain below. Require presence
        # first so a missing heading cannot pass as "correctly ordered".
        for heading in headings:
            assert heading in aggregated, f"missing heading: {heading!r}"

        part_pos, ch1_pos, sec11_pos, sec12_pos, ch2_pos = (
            aggregated.find(heading) for heading in headings
        )

        assert part_pos < ch1_pos < sec11_pos < sec12_pos < ch2_pos

    def test_handle_empty_files_gracefully(self):
        """Check that empty and whitespace-only files do not break combining."""
        (self.temp_dir / "empty.md").write_text("")
        (self.temp_dir / "whitespace_only.md").write_text("   \n\t\n  ")
        (self.temp_dir / "content.md").write_text("# Real Content\nActual content here.")

        files = [
            self.temp_dir / "empty.md",
            self.temp_dir / "whitespace_only.md",
            self.temp_dir / "content.md",
        ]

        combined = combine_markdown_files(files)

        # The real content must survive. How much whitespace the empty files
        # contribute is intentionally not asserted here.
        assert "# Real Content" in combined
        assert "Actual content here." in combined


class TestFrontMatterHandling:
    """Test front matter detection, extraction, and consolidation.

    Every test writes markdown files with YAML front matter blocks
    (delimited by ``---`` lines) into a fresh temporary directory and then
    calls the front matter helpers imported at the top of this module. The
    tests are expected to fail until those helpers are implemented
    (RED phase).
    """

    def setup_method(self):
        """Create a fresh temporary directory for the test about to run."""
        self.temp_dir = Path(tempfile.mkdtemp())

    def teardown_method(self):
        """Remove the temporary directory created by ``setup_method``."""
        if self.temp_dir.exists():
            shutil.rmtree(self.temp_dir)

    def test_detect_and_extract_front_matter(self):
        """Check that front matter is parsed and split from the body."""
        content_with_frontmatter = """---
title: "Chapter 1"
author: "John Doe"
date: "2023-01-01"
---

# Chapter 1 Content
Actual markdown content here.
"""

        (self.temp_dir / "chapter1.md").write_text(content_with_frontmatter)

        front_matter, content = process_front_matter(self.temp_dir / "chapter1.md")

        # The front matter is expected to behave like a mapping of its keys.
        assert front_matter is not None
        assert "title" in front_matter
        assert front_matter["title"] == "Chapter 1"
        assert front_matter["author"] == "John Doe"

        # The body must start at the heading and carry no delimiters.
        assert content.strip().startswith("# Chapter 1 Content")
        assert "---" not in content

    def test_consolidate_multiple_front_matter_blocks(self):
        """Check that disjoint front matter keys from two files are merged."""
        file1_content = """---
title: "My Document"
author: "Author Name"
---

# Section 1
Content 1"""

        file2_content = """---
version: "1.0"
tags: ["documentation", "guide"]
---

# Section 2
Content 2"""

        (self.temp_dir / "file1.md").write_text(file1_content)
        (self.temp_dir / "file2.md").write_text(file2_content)

        files = [self.temp_dir / "file1.md", self.temp_dir / "file2.md"]

        consolidator = FrontMatterConsolidator()
        consolidated_fm, content = consolidator.consolidate(files)

        # Keys from both files must be present in the merged front matter.
        for key in ("title", "author", "version", "tags"):
            assert key in consolidated_fm, f"missing front matter key: {key!r}"

        # Both bodies are kept; no front matter delimiter is left behind.
        assert "# Section 1" in content
        assert "# Section 2" in content
        assert content.count("---") == 0

    def test_handle_conflicting_front_matter(self):
        """Check that conflicting keys still appear after consolidation."""
        file1_content = """---
title: "Document Title"
author: "First Author"
---

# Content 1"""

        file2_content = """---
title: "Different Title"
author: "Second Author"
---

# Content 2"""

        (self.temp_dir / "file1.md").write_text(file1_content)
        (self.temp_dir / "file2.md").write_text(file2_content)

        files = [self.temp_dir / "file1.md", self.temp_dir / "file2.md"]

        consolidator = FrontMatterConsolidator(conflict_strategy="merge")
        consolidated_fm, content = consolidator.consolidate(files)

        # Only the presence of the keys is checked. The resolved values
        # (merged into lists, first value wins, ...) depend on the
        # implementation strategy and are deliberately not asserted.
        assert "title" in consolidated_fm
        assert "author" in consolidated_fm

    def test_preserve_front_matter_in_output(self):
        """Check that the aggregated output opens with a front matter block."""
        files_with_fm = [
            ("file1.md", """---
title: "Combined Document"
---
# Section 1
Content"""),
            ("file2.md", """---
tags: ["test"]
---
# Section 2
More content"""),
        ]

        # aggregate_content() is given the directory, so the files only need
        # to exist on disk; no list of paths is required.
        for filename, content in files_with_fm:
            (self.temp_dir / filename).write_text(content)

        aggregated = aggregate_content(self.temp_dir, preserve_front_matter=True)

        # The very first line must open the front matter block.
        lines = aggregated.split('\n')
        assert lines[0] == "---"

        # Index of the first "---" after the opening one, or None if absent.
        closing_fm_index = next(
            (i for i, line in enumerate(lines[1:], 1) if line == "---"),
            None,
        )
        assert closing_fm_index is not None

        # At least one line must follow the closing delimiter.
        content_start = closing_fm_index + 1
        assert content_start < len(lines)


class TestContentAggregator:
    """Test the ContentAggregator class for comprehensive content processing.

    Every test builds a small directory tree inside a fresh temporary
    directory and passes it to ``ContentAggregator.aggregate``. The tests
    are expected to fail until the class is implemented (RED phase).
    """

    def setup_method(self):
        """Create a fresh temporary directory for the test about to run."""
        self.temp_dir = Path(tempfile.mkdtemp())

    def teardown_method(self):
        """Remove the temporary directory created by ``setup_method``."""
        if self.temp_dir.exists():
            shutil.rmtree(self.temp_dir)

    def test_content_aggregator_initialization(self):
        """Check that a default instance exposes its configuration attributes."""
        aggregator = ContentAggregator()

        assert aggregator is not None
        for attribute in ('preserve_formatting', 'handle_front_matter', 'section_spacing'):
            assert hasattr(aggregator, attribute), f"missing attribute: {attribute}"

    def test_aggregator_with_custom_options(self):
        """Check that aggregation works with explicit configuration options."""
        aggregator = ContentAggregator(
            preserve_formatting=True,
            handle_front_matter=True,
            section_spacing=2,
            include_toc=True,
        )

        (self.temp_dir / "chapter1.md").write_text("# Chapter 1\nContent 1")
        (self.temp_dir / "chapter2.md").write_text("# Chapter 2\nContent 2")

        result = aggregator.aggregate(self.temp_dir)

        assert result is not None
        assert "# Chapter 1" in result
        assert "# Chapter 2" in result

    def test_aggregator_processes_directory_recursively(self):
        """Check that content in nested directories is included."""
        # Layout: part1/index.md and part1/chapter1/content.md
        part_dir = self.temp_dir / "part1"
        part_dir.mkdir()
        (part_dir / "index.md").write_text("# Part 1\nPart content")

        chapter_dir = part_dir / "chapter1"
        chapter_dir.mkdir()
        (chapter_dir / "content.md").write_text("## Chapter 1\nChapter content")

        aggregator = ContentAggregator(recursive=True)
        result = aggregator.aggregate(self.temp_dir)

        # Headings and bodies from both nesting levels must be present.
        assert "# Part 1" in result
        assert "## Chapter 1" in result
        assert "Part content" in result
        assert "Chapter content" in result

    def test_aggregator_sorts_content_correctly(self):
        """Check that numbered files are emitted in numeric-prefix order."""
        # Deliberately listed out of order; the aggregator has to sort them.
        files_data = [
            ("03_conclusion.md", "# Conclusion"),
            ("01_introduction.md", "# Introduction"),
            ("02_main_content.md", "# Main Content"),
        ]

        for filename, content in files_data:
            (self.temp_dir / filename).write_text(content)

        aggregator = ContentAggregator(sort_files=True)
        result = aggregator.aggregate(self.temp_dir)

        headings = ("# Introduction", "# Main Content", "# Conclusion")

        # str.find() returns -1 for a missing substring, which would still
        # satisfy the ordering chain below when it is the first operand.
        # Require every heading to be present before comparing positions.
        for heading in headings:
            assert heading in result, f"missing heading: {heading!r}"

        intro_pos, main_pos, conclusion_pos = (result.find(h) for h in headings)

        assert intro_pos < main_pos < conclusion_pos

    def test_aggregator_handles_large_directory_structures(self):
        """Check that a tree of 10 parts with 5 chapters each is fully processed."""
        for i in range(10):
            part_dir = self.temp_dir / f"part_{i+1:02d}"
            part_dir.mkdir()
            (part_dir / "index.md").write_text(f"# Part {i+1}\nPart {i+1} content")

            for j in range(5):
                chapter_file = part_dir / f"chapter_{j+1:02d}.md"
                chapter_file.write_text(f"## Chapter {i+1}.{j+1}\nChapter content")

        aggregator = ContentAggregator()
        result = aggregator.aggregate(self.temp_dir)

        assert result is not None
        assert len(result) > 0

        # Lower bounds only: 10 part headings and 10 * 5 chapter headings
        # were written; the output may contain more matches than that.
        part_count = result.count("# Part")
        chapter_count = result.count("## Chapter")

        assert part_count >= 10
        assert chapter_count >= 50