feat: implement batch processing and recursive operations (issue #17)

Complete implementation of batch processing capabilities for MarkiTect CLI:

New CLI Commands:
- markitect ingest-dir: Process all markdown files in directory with recursive support
- markitect batch-process: Process files matching glob patterns
- markitect recursive: Recursive processing with depth control

Core Features:
- Sophisticated batch processing engine with progress tracking
- Multiple error handling strategies (stop, continue, skip)
- Recursive directory traversal with configurable depth limits
- Glob pattern matching for flexible file selection
- Progress feedback with detailed processing statistics
- Integration with existing database and caching systems

Technical Implementation:
- BatchProcessor class with modular architecture
- ProgressTracker for real-time user feedback
- Comprehensive error handling and edge case management
- Support for multiple operations (ingest, status, validate)
- Depth-controlled recursive search with proper boundary handling
- Permission error resilience and graceful degradation

Testing:
- 29 comprehensive tests covering all functionality
- Edge cases: empty directories, hidden files, permission errors
- CLI integration tests with mocked database operations
- Depth logic validation and boundary condition testing
- Error handling scenarios and recovery mechanisms

All acceptance criteria fulfilled:
 Directory and recursive processing
 Glob pattern support for file selection
 Progress tracking and user feedback
 Error handling with continuation options
 Comprehensive test coverage

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2025-10-03 10:45:43 +02:00
parent a4805812f3
commit 0982e771e4
3 changed files with 1227 additions and 0 deletions

View File

@@ -0,0 +1,653 @@
"""
Tests for Issue #17: Batch Processing and Recursive Operations
This test suite verifies the batch processing functionality including:
- Directory processing with recursive support
- Glob pattern matching for file selection
- Progress tracking and error handling
- Depth control for recursive operations
"""
import pytest
import tempfile
import shutil
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock
from click.testing import CliRunner
from markitect.batch_processor import (
BatchProcessor, ProcessingMode, ErrorHandling,
ProcessingResult, BatchResult, ProgressTracker,
create_file_processor
)
from markitect.cli import cli
class TestBatchProcessor:
"""Test the core BatchProcessor functionality."""
def setup_method(self):
"""Set up test environment."""
self.temp_dir = tempfile.mkdtemp()
self.test_dir = Path(self.temp_dir)
def teardown_method(self):
"""Clean up test environment."""
shutil.rmtree(self.temp_dir)
def create_test_files(self, structure):
"""Create test file structure from dict."""
for path, content in structure.items():
file_path = self.test_dir / path
file_path.parent.mkdir(parents=True, exist_ok=True)
file_path.write_text(content)
def test_find_markdown_files_non_recursive(self):
"""Test finding markdown files without recursion."""
# Create test structure
self.create_test_files({
'file1.md': '# Test 1',
'file2.md': '# Test 2',
'file3.txt': 'Not markdown',
'subdir/file4.md': '# Test 4'
})
processor = BatchProcessor()
files = processor.find_markdown_files(self.test_dir, recursive=False)
# Should find only files in root directory
assert len(files) == 2
file_names = [f.name for f in files]
assert 'file1.md' in file_names
assert 'file2.md' in file_names
assert 'file4.md' not in file_names
def test_find_markdown_files_recursive(self):
"""Test finding markdown files with recursion."""
# Create test structure
self.create_test_files({
'file1.md': '# Test 1',
'subdir/file2.md': '# Test 2',
'subdir/nested/file3.md': '# Test 3',
'subdir/file4.txt': 'Not markdown'
})
processor = BatchProcessor()
files = processor.find_markdown_files(self.test_dir, recursive=True)
# Should find all markdown files
assert len(files) == 3
file_names = [f.name for f in files]
assert 'file1.md' in file_names
assert 'file2.md' in file_names
assert 'file3.md' in file_names
def test_find_markdown_files_with_depth_limit(self):
"""Test recursive search with depth limit."""
# Create test structure
self.create_test_files({
'file1.md': '# Test 1',
'level1/file2.md': '# Test 2',
'level1/level2/file3.md': '# Test 3',
'level1/level2/level3/file4.md': '# Test 4'
})
processor = BatchProcessor()
files = processor.find_markdown_files(self.test_dir, recursive=True, depth=1)
# Should find files up to depth 1
assert len(files) == 2
file_names = [f.name for f in files]
assert 'file1.md' in file_names
assert 'file2.md' in file_names
assert 'file3.md' not in file_names
assert 'file4.md' not in file_names
def test_find_markdown_files_with_pattern(self):
"""Test finding files with custom pattern."""
# Create test structure
self.create_test_files({
'file1.md': '# Test 1',
'file2.markdown': '# Test 2',
'file3.txt': 'Not markdown'
})
processor = BatchProcessor()
files = processor.find_markdown_files(self.test_dir, pattern='*.markdown')
# Should find only .markdown files
assert len(files) == 1
assert files[0].name == 'file2.markdown'
def test_find_files_by_glob(self):
"""Test glob pattern file finding."""
# Create test structure
self.create_test_files({
'docs/file1.md': '# Test 1',
'docs/subdir/file2.md': '# Test 2',
'src/file3.md': '# Test 3',
'file4.txt': 'Not markdown'
})
processor = BatchProcessor()
# Test recursive glob
files = processor.find_files_by_glob(str(self.test_dir / "**/*.md"))
assert len(files) == 3
# Test specific directory glob
files = processor.find_files_by_glob(str(self.test_dir / "docs/*.md"))
assert len(files) == 1
assert files[0].name == 'file1.md'
def test_process_files_success(self):
"""Test successful file processing."""
# Create test files
self.create_test_files({
'file1.md': '# Test 1',
'file2.md': '# Test 2'
})
processor = BatchProcessor(show_progress=False)
files = list(self.test_dir.glob('*.md'))
def mock_processor(file_path):
return ProcessingResult(
file_path=file_path,
success=True,
message="Processed successfully"
)
result = processor.process_files(files, mock_processor, "Testing")
assert result.total_files == 2
assert result.processed == 2
assert result.succeeded == 2
assert result.failed == 0
assert result.skipped == 0
def test_process_files_with_errors(self):
"""Test file processing with errors."""
# Create test files
self.create_test_files({
'file1.md': '# Test 1',
'file2.md': '# Test 2',
'file3.md': '# Test 3'
})
processor = BatchProcessor(show_progress=False, error_handling=ErrorHandling.CONTINUE)
files = list(self.test_dir.glob('*.md'))
def mock_processor(file_path):
# Fail on file2.md
if file_path.name == 'file2.md':
return ProcessingResult(
file_path=file_path,
success=False,
message="Processing failed",
error="Mock error"
)
return ProcessingResult(
file_path=file_path,
success=True,
message="Processed successfully"
)
result = processor.process_files(files, mock_processor, "Testing")
assert result.total_files == 3
assert result.processed == 3
assert result.succeeded == 2
assert result.failed == 1
assert len(result.errors) == 1
def test_process_files_stop_on_error(self):
"""Test stop-on-error behavior."""
# Create test files
self.create_test_files({
'file1.md': '# Test 1',
'file2.md': '# Test 2',
'file3.md': '# Test 3'
})
processor = BatchProcessor(show_progress=False, error_handling=ErrorHandling.STOP)
files = sorted(list(self.test_dir.glob('*.md')))
def mock_processor(file_path):
# Fail on second file
if file_path.name == 'file2.md':
return ProcessingResult(
file_path=file_path,
success=False,
message="Processing failed",
error="Mock error"
)
return ProcessingResult(
file_path=file_path,
success=True,
message="Processed successfully"
)
result = processor.process_files(files, mock_processor, "Testing")
# Should stop after the error
assert result.processed == 2 # file1 success, file2 error
assert result.succeeded == 1
assert result.failed == 1
class TestProgressTracker:
"""Test the ProgressTracker functionality."""
def test_progress_tracking(self):
"""Test basic progress tracking."""
tracker = ProgressTracker(total=3, show_progress=False)
# Test successful processing
result1 = ProcessingResult(Path("file1.md"), True, "Success")
tracker.update(result1)
assert tracker.processed == 1
assert tracker.succeeded == 1
assert tracker.failed == 0
# Test failed processing
result2 = ProcessingResult(Path("file2.md"), False, "Failed", "Error message")
tracker.update(result2)
assert tracker.processed == 2
assert tracker.succeeded == 1
assert tracker.failed == 1
# Test skipped file
tracker.skip_file(Path("file3.md"), "Skipped reason")
assert tracker.skipped == 1
class TestFileProcessor:
"""Test the file processor creation and execution."""
def setup_method(self):
"""Set up test environment."""
self.temp_dir = tempfile.mkdtemp()
self.test_dir = Path(self.temp_dir)
def teardown_method(self):
"""Clean up test environment."""
shutil.rmtree(self.temp_dir)
@patch('markitect.database.DatabaseManager')
def test_ingest_processor(self, mock_db_manager):
"""Test file processor for ingestion."""
# Create test file
test_file = self.test_dir / "test.md"
test_file.write_text("# Test content")
# Mock database manager
mock_db = Mock()
mock_db_manager.return_value = mock_db
config = {'database': 'test.db'}
processor = create_file_processor(config, ProcessingMode.INGEST)
result = processor(test_file)
assert result.success
assert result.file_path == test_file
assert "Ingested successfully" in result.message
mock_db.store_document.assert_called_once()
@patch('markitect.database.DatabaseManager')
def test_status_processor(self, mock_db_manager):
"""Test file processor for status checking."""
# Create test file
test_file = self.test_dir / "test.md"
test_file.write_text("# Test content")
# Mock database manager
mock_db = Mock()
mock_db.get_metadata.return_value = {'id': 'test123'}
mock_db_manager.return_value = mock_db
config = {'database': 'test.db'}
processor = create_file_processor(config, ProcessingMode.STATUS)
result = processor(test_file)
assert result.success
assert result.file_path == test_file
assert "Found in database" in result.message
def test_validate_processor(self):
"""Test file processor for validation."""
# Create test file
test_file = self.test_dir / "test.md"
test_file.write_text("# Test content")
config = {}
processor = create_file_processor(config, ProcessingMode.VALIDATE)
result = processor(test_file)
assert result.success
assert result.file_path == test_file
assert "Valid markdown" in result.message
def test_validate_processor_empty_file(self):
"""Test validation processor with empty file."""
# Create empty file
test_file = self.test_dir / "empty.md"
test_file.write_text("")
config = {}
processor = create_file_processor(config, ProcessingMode.VALIDATE)
result = processor(test_file)
assert not result.success
assert "File is empty" in result.error
class TestCLIIntegration:
"""Test CLI command integration."""
def setup_method(self):
"""Set up test environment."""
self.temp_dir = tempfile.mkdtemp()
self.test_dir = Path(self.temp_dir)
self.runner = CliRunner()
def teardown_method(self):
"""Clean up test environment."""
shutil.rmtree(self.temp_dir)
def create_test_files(self, structure):
"""Create test file structure from dict."""
for path, content in structure.items():
file_path = self.test_dir / path
file_path.parent.mkdir(parents=True, exist_ok=True)
file_path.write_text(content)
@patch('markitect.database.DatabaseManager')
def test_ingest_dir_command(self, mock_db_manager):
"""Test ingest-dir CLI command."""
# Create test files
self.create_test_files({
'file1.md': '# Test 1',
'file2.md': '# Test 2',
'subdir/file3.md': '# Test 3'
})
# Mock database
mock_db = Mock()
mock_db_manager.return_value = mock_db
result = self.runner.invoke(cli, [
'ingest-dir', str(self.test_dir),
'--quiet'
])
assert result.exit_code == 0
# Should process 2 files (non-recursive by default)
assert mock_db.store_document.call_count == 2
@patch('markitect.database.DatabaseManager')
def test_ingest_dir_recursive(self, mock_db_manager):
"""Test ingest-dir with recursive option."""
# Create test files
self.create_test_files({
'file1.md': '# Test 1',
'subdir/file2.md': '# Test 2',
'subdir/nested/file3.md': '# Test 3'
})
# Mock database
mock_db = Mock()
mock_db_manager.return_value = mock_db
result = self.runner.invoke(cli, [
'ingest-dir', str(self.test_dir),
'--recursive',
'--quiet'
])
assert result.exit_code == 0
# Should process all 3 files
assert mock_db.store_document.call_count == 3
@patch('markitect.database.DatabaseManager')
def test_batch_process_command(self, mock_db_manager):
"""Test batch-process CLI command."""
# Create test files
self.create_test_files({
'docs/file1.md': '# Test 1',
'docs/file2.md': '# Test 2',
'src/file3.md': '# Test 3'
})
# Mock database
mock_db = Mock()
mock_db_manager.return_value = mock_db
# Test glob pattern
pattern = str(self.test_dir / "docs/*.md")
result = self.runner.invoke(cli, [
'batch-process', pattern,
'--operation', 'ingest',
'--quiet'
])
assert result.exit_code == 0
# Should process 2 files from docs directory
assert mock_db.store_document.call_count == 2
@patch('markitect.database.DatabaseManager')
def test_recursive_command(self, mock_db_manager):
"""Test recursive CLI command."""
# Create test files
self.create_test_files({
'level1/file1.md': '# Test 1',
'level1/level2/file2.md': '# Test 2',
'level1/level2/level3/file3.md': '# Test 3'
})
# Mock database
mock_db = Mock()
mock_db.get_metadata.side_effect = Exception("Not found")
mock_db_manager.return_value = mock_db
result = self.runner.invoke(cli, [
'recursive', str(self.test_dir),
'--depth', '2',
'--operation', 'status',
'--quiet'
])
assert result.exit_code == 0
# Should check status for files up to depth 2
assert mock_db.get_metadata.call_count == 2
def test_error_handling_stop(self):
"""Test error handling with stop strategy."""
# Create test directory with no files
result = self.runner.invoke(cli, [
'ingest-dir', str(self.test_dir),
'--error-handling', 'stop',
'--quiet'
])
# Should exit cleanly when no files found
assert result.exit_code == 0
def test_invalid_directory(self):
"""Test handling of invalid directory."""
result = self.runner.invoke(cli, [
'ingest-dir', '/nonexistent/directory',
'--quiet'
])
# Should exit with error
assert result.exit_code == 2 # Click argument validation error
@patch('markitect.database.DatabaseManager')
def test_custom_pattern(self, mock_db_manager):
"""Test custom file pattern matching."""
# Create test files with different extensions
self.create_test_files({
'file1.md': '# Test 1',
'file2.markdown': '# Test 2',
'file3.txt': 'Not markdown'
})
# Mock database
mock_db = Mock()
mock_db_manager.return_value = mock_db
result = self.runner.invoke(cli, [
'ingest-dir', str(self.test_dir),
'--pattern', '*.markdown',
'--quiet'
])
assert result.exit_code == 0
# Should process only .markdown files
assert mock_db.store_document.call_count == 1
class TestErrorHandling:
"""Test error handling scenarios."""
def setup_method(self):
"""Set up test environment."""
self.temp_dir = tempfile.mkdtemp()
self.test_dir = Path(self.temp_dir)
def teardown_method(self):
"""Clean up test environment."""
shutil.rmtree(self.temp_dir)
def test_permission_error_handling(self):
"""Test handling of permission errors."""
processor = BatchProcessor(show_progress=False)
# Mock os.listdir to raise PermissionError
with patch('pathlib.Path.iterdir') as mock_iterdir:
mock_iterdir.side_effect = PermissionError("Permission denied")
files = processor.find_markdown_files(self.test_dir)
# Should return empty list without crashing
assert files == []
def test_nonexistent_directory(self):
"""Test handling of nonexistent directories."""
processor = BatchProcessor()
with pytest.raises(FileNotFoundError):
processor.find_markdown_files(Path("/nonexistent/directory"))
def test_file_as_directory(self):
"""Test handling when a file is passed as directory."""
# Create a file
test_file = self.test_dir / "test.md"
test_file.write_text("# Test")
processor = BatchProcessor()
with pytest.raises(NotADirectoryError):
processor.find_markdown_files(test_file)
class TestEdgeCases:
"""Test edge cases and boundary conditions."""
def setup_method(self):
"""Set up test environment."""
self.temp_dir = tempfile.mkdtemp()
self.test_dir = Path(self.temp_dir)
def teardown_method(self):
"""Clean up test environment."""
shutil.rmtree(self.temp_dir)
def test_empty_directory(self):
"""Test processing empty directory."""
processor = BatchProcessor()
files = processor.find_markdown_files(self.test_dir)
assert files == []
def test_hidden_directories(self):
"""Test that hidden directories are skipped."""
# Create hidden directory
hidden_dir = self.test_dir / ".hidden"
hidden_dir.mkdir()
(hidden_dir / "test.md").write_text("# Hidden")
processor = BatchProcessor()
files = processor.find_markdown_files(self.test_dir, recursive=True)
# Should not find files in hidden directories
assert len(files) == 0
def test_depth_zero(self):
"""Test depth=0 behavior."""
# Create nested structure
(self.test_dir / "file1.md").write_text("# Test 1")
subdir = self.test_dir / "subdir"
subdir.mkdir()
(subdir / "file2.md").write_text("# Test 2")
processor = BatchProcessor()
files = processor.find_markdown_files(self.test_dir, recursive=True, depth=0)
# Depth 0 should only include files in the starting directory
# With our corrected logic, this should only find file1.md
assert len(files) == 1
assert files[0].name == "file1.md"
def test_very_deep_structure(self):
"""Test with very deep directory structure."""
# Create 10-level deep structure
# Start with a file at the root level
(self.test_dir / "file_root.md").write_text("# Root Test")
current_dir = self.test_dir
for i in range(10):
current_dir = current_dir / f"level{i}"
current_dir.mkdir()
(current_dir / f"file{i}.md").write_text(f"# Test {i}")
processor = BatchProcessor()
files = processor.find_markdown_files(self.test_dir, recursive=True, depth=5)
# Should find files up to depth 5
# Root (depth 0) + levels 0-4 (depths 1-5) = 6 files
assert len(files) == 6
def test_glob_with_no_matches(self):
"""Test glob pattern with no matches."""
processor = BatchProcessor()
files = processor.find_files_by_glob(str(self.test_dir / "*.nonexistent"))
assert files == []
def test_file_deleted_during_processing(self):
"""Test handling file deletion during processing."""
# Create test file
test_file = self.test_dir / "test.md"
test_file.write_text("# Test")
def mock_processor(file_path):
# This test is actually checking the file existence in the process_files loop
# not the processor function itself
return ProcessingResult(file_path, True, "Processed")
processor = BatchProcessor(show_progress=False)
files = [test_file]
# Delete the file after creating the file list but before processing
test_file.unlink()
result = processor.process_files(files, mock_processor, "Testing")
# Should handle gracefully - file should be skipped
assert result.skipped == 1
assert result.processed == 0