feat: implement batch processing and recursive operations (issue #17)

Complete implementation of batch processing capabilities for MarkiTect CLI:

New CLI Commands:
- markitect ingest-dir: Process all markdown files in directory with recursive support
- markitect batch-process: Process files matching glob patterns
- markitect recursive: Recursive processing with depth control

Core Features:
- Sophisticated batch processing engine with progress tracking
- Multiple error handling strategies (stop, continue, skip)
- Recursive directory traversal with configurable depth limits
- Glob pattern matching for flexible file selection
- Progress feedback with detailed processing statistics
- Integration with existing database and caching systems

Technical Implementation:
- BatchProcessor class with modular architecture
- ProgressTracker for real-time user feedback
- Comprehensive error handling and edge case management
- Support for multiple operations (ingest, status, validate)
- Depth-controlled recursive search with proper boundary handling
- Permission error resilience and graceful degradation

Testing:
- 29 comprehensive tests covering all functionality
- Edge cases: empty directories, hidden files, permission errors
- CLI integration tests with mocked database operations
- Depth logic validation and boundary condition testing
- Error handling scenarios and recovery mechanisms

All acceptance criteria fulfilled:
- Directory and recursive processing
- Glob pattern support for file selection
- Progress tracking and user feedback
- Error handling with continuation options
- Comprehensive test coverage

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
2025-10-03 10:45:43 +02:00
parent a4805812f3
commit 0982e771e4
3 changed files with 1227 additions and 0 deletions

View File

@@ -0,0 +1,379 @@
"""
Batch Processing and Recursive Operations - Issue #17
This module provides batch processing capabilities for MarkiTect, allowing
users to process multiple files and directories recursively through CLI.
Features:
- Directory processing with recursive support
- Glob pattern matching for file selection
- Progress tracking with user feedback
- Error handling with continuation options
- Depth control for recursive operations
Commands implemented:
- ingest-dir: Process all Markdown files in directory
- batch-process: Process files matching glob pattern
- recursive: Recursive processing with depth control
"""
import os
import glob
import fnmatch
from pathlib import Path
from typing import List, Optional, Dict, Any, Iterator, Callable
from dataclasses import dataclass
from enum import Enum
import click
class ProcessingMode(Enum):
    """Kinds of per-file operation a batch run can be asked to perform."""

    INGEST = "ingest"
    STATUS = "status"
    VALIDATE = "validate"
    # Declared here, but create_file_processor in this module has no branch
    # for it and reports it as an unsupported operation.
    GENERATE = "generate"
class ErrorHandling(Enum):
    """Strategies a batch run can follow when a file fails."""

    # Abort the run at the first failing file.
    STOP = "stop"
    # Keep going and gather every failure for the final report.
    CONTINUE = "continue"
    # Keep going past failed files without gathering the failures.
    SKIP = "skip"
@dataclass
class ProcessingResult:
    """Outcome of running one processor function over one file."""

    # File the outcome refers to.
    file_path: Path
    # True when the operation completed, False when it failed.
    success: bool
    # Human-readable description of the outcome.
    message: str
    # Failure text; stays None unless a caller supplies it.
    error: Optional[str] = None
    # Elapsed seconds for this file; stays None unless a caller supplies it.
    processing_time: Optional[float] = None
@dataclass
class BatchResult:
    """Aggregate statistics for one batch processing run."""

    # Number of files the run was handed.
    total_files: int
    # Files that were actually passed through the processor.
    processed: int
    # Processed files that reported success.
    succeeded: int
    # Processed files that reported failure.
    failed: int
    # Files that were skipped instead of processed.
    skipped: int
    # Per-file results collected for failures.
    errors: List[ProcessingResult]
    # Duration of the whole run in seconds.
    processing_time: float
class ProgressTracker:
    """Keep running counters for a batch run and optionally echo progress.

    Attributes:
        total: Number of files the run was started with.
        processed: Files passed to ``update`` so far.
        succeeded: Processed files whose result reported success.
        failed: Processed files whose result reported failure.
        skipped: Files reported through ``skip_file``; they are not counted
            in ``processed``.
        show_progress: When true, every update/skip is echoed via ``click``.
    """

    def __init__(self, total: int, show_progress: bool = True):
        self.total = total
        self.processed = 0
        self.succeeded = 0
        self.failed = 0
        self.skipped = 0
        self.show_progress = show_progress

    def update(self, result: ProcessingResult):
        """Record one processed file and, if enabled, print a progress line.

        Args:
            result: Outcome of the file that was just processed.
        """
        self.processed += 1
        if result.success:
            self.succeeded += 1
        else:
            self.failed += 1
        if self.show_progress:
            self._display_progress(result)

    def skip_file(self, file_path: Path, reason: str):
        """Record a file that was skipped and, if enabled, say why.

        Args:
            file_path: File that was not processed.
            reason: Short explanation shown to the user.
        """
        self.skipped += 1
        if self.show_progress:
            click.echo(f"⚠️ Skipped {file_path}: {reason}")

    def _display_progress(self, result: ProcessingResult):
        """Print ``[processed/total] (percent) path`` plus any error text."""
        # The success and failure markers must differ, otherwise the line
        # gives no visual cue about the outcome.
        status = "✅" if result.success else "❌"
        # A tracker created with total=0 must not crash on division by zero.
        percentage = (self.processed / self.total) * 100 if self.total else 0.0
        click.echo(f"{status} [{self.processed}/{self.total}] ({percentage:.1f}%) {result.file_path}")
        if not result.success and result.error:
            click.echo(f" Error: {result.error}")
class BatchProcessor:
    """Core batch processing engine.

    Finds files (by directory scan or glob) and runs a processor function
    over them while tracking progress and applying an error strategy.

    Attributes:
        error_handling: Strategy applied when a file fails.
        show_progress: When true, progress, warnings and the final summary
            are echoed via ``click``.
        max_depth: Default depth limit for recursive searches; ``None``
            means unlimited.
    """

    # Number of individual failures listed in the summary.
    _MAX_ERRORS_SHOWN = 10

    def __init__(self,
                 error_handling: ErrorHandling = ErrorHandling.CONTINUE,
                 show_progress: bool = True,
                 max_depth: Optional[int] = None):
        self.error_handling = error_handling
        self.show_progress = show_progress
        self.max_depth = max_depth

    def find_markdown_files(self,
                            directory: Path,
                            pattern: str = "*.md",
                            recursive: bool = False,
                            depth: Optional[int] = None) -> List[Path]:
        """Find files in ``directory`` whose name matches ``pattern``.

        Args:
            directory: Directory to search.
            pattern: ``fnmatch``-style pattern applied to file names.
            recursive: Whether to descend into subdirectories.
            depth: Maximum depth for a recursive search (0 = only
                ``directory`` itself). Falls back to ``self.max_depth``
                when ``None``; ignored for non-recursive searches.

        Returns:
            Sorted list of matching file paths.

        Raises:
            FileNotFoundError: ``directory`` does not exist.
            NotADirectoryError: ``directory`` is not a directory.
        """
        if not directory.exists():
            raise FileNotFoundError(f"Directory not found: {directory}")
        if not directory.is_dir():
            raise NotADirectoryError(f"Path is not a directory: {directory}")

        if recursive:
            effective_depth = depth if depth is not None else self.max_depth
            files = self._find_recursive(directory, pattern, effective_depth)
        else:
            files = self._find_in_directory(directory, pattern)
        return sorted(files)

    def _find_recursive(self, directory: Path, pattern: str, max_depth: Optional[int]) -> List[Path]:
        """Collect matching files from ``directory`` and its subdirectories.

        Subdirectories whose name starts with ``.`` are not entered. Each
        real directory is visited at most once, so symlink cycles cannot
        cause unbounded recursion.
        """
        files: List[Path] = []
        visited = set()

        def _search(current_dir: Path, current_depth: int) -> None:
            if max_depth is not None and current_depth > max_depth:
                return

            # Directory symlinks are followed, so remember the resolved
            # location to break cycles (e.g. a link pointing at an ancestor).
            try:
                real_dir = current_dir.resolve()
            except (OSError, RuntimeError):
                return
            if real_dir in visited:
                return
            visited.add(real_dir)

            files.extend(self._find_in_directory(current_dir, pattern))

            # Do not descend once the depth limit has been reached.
            if max_depth is not None and current_depth >= max_depth:
                return
            try:
                for item in current_dir.iterdir():
                    if item.is_dir() and not item.name.startswith('.'):
                        _search(item, current_depth + 1)
            except PermissionError:
                # Skip directories we can't access.
                if self.show_progress:
                    click.echo(f"⚠️ Permission denied: {current_dir}")

        _search(directory, 0)
        return files

    def _find_in_directory(self, directory: Path, pattern: str) -> List[Path]:
        """Return files directly inside ``directory`` matching ``pattern``.

        A ``PermissionError`` while listing is reported (when progress
        output is enabled) and whatever was found so far is returned.
        """
        files: List[Path] = []
        try:
            for item in directory.iterdir():
                if item.is_file() and fnmatch.fnmatch(item.name, pattern):
                    files.append(item)
        except PermissionError:
            if self.show_progress:
                click.echo(f"⚠️ Permission denied: {directory}")
        return files

    def find_files_by_glob(self, glob_pattern: str) -> List[Path]:
        """Find files using a glob pattern.

        Args:
            glob_pattern: Glob pattern (e.g., "**/*.md", "docs/*.markdown");
                ``**`` matches recursively.

        Returns:
            Sorted list of matching paths that are regular files. Sorting
            makes the processing order deterministic, in line with
            ``find_markdown_files``.
        """
        candidates = (Path(match) for match in glob.glob(glob_pattern, recursive=True))
        return sorted(path for path in candidates if path.is_file())

    def process_files(self,
                      files: List[Path],
                      processor_func: Callable[[Path], ProcessingResult],
                      operation_name: str = "Processing") -> BatchResult:
        """Process a list of files with progress tracking and error handling.

        Files that no longer exist are counted as skipped. An exception
        raised by ``processor_func`` is converted into a failed result.
        On a failure the configured strategy applies: ``STOP`` ends the run,
        ``CONTINUE`` records the failure in ``errors`` and goes on, ``SKIP``
        goes on without recording it (it still counts as failed).

        Args:
            files: Files to process.
            processor_func: Function applied to each file.
            operation_name: Label used in progress and summary output.

        Returns:
            BatchResult with processing statistics.
        """
        import time

        # perf_counter is monotonic, so the duration cannot go negative if
        # the wall clock is adjusted during the run.
        started = time.perf_counter()
        if self.show_progress:
            click.echo(f"🚀 {operation_name} {len(files)} files...")

        tracker = ProgressTracker(len(files), self.show_progress)
        errors: List[ProcessingResult] = []
        collect_errors = self.error_handling != ErrorHandling.SKIP

        for file_path in files:
            try:
                # The file might have been deleted since the list was built.
                if not file_path.exists():
                    tracker.skip_file(file_path, "File no longer exists")
                    continue
                outcome = processor_func(file_path)
            except Exception as e:
                outcome = ProcessingResult(
                    file_path=file_path,
                    success=False,
                    message=f"Unexpected error: {str(e)}",
                    error=str(e)
                )

            tracker.update(outcome)
            if outcome.success:
                continue
            if collect_errors:
                errors.append(outcome)
            if self.error_handling == ErrorHandling.STOP:
                break

        batch_result = BatchResult(
            total_files=len(files),
            processed=tracker.processed,
            succeeded=tracker.succeeded,
            failed=tracker.failed,
            skipped=tracker.skipped,
            errors=errors,
            processing_time=time.perf_counter() - started
        )
        if self.show_progress:
            self._display_summary(batch_result, operation_name)
        return batch_result

    def _display_summary(self, result: BatchResult, operation_name: str):
        """Echo the counters of ``result`` and the first recorded failures."""
        click.echo(f"\n📊 {operation_name} Summary:")
        click.echo(f" Total files: {result.total_files}")
        click.echo(f" Processed: {result.processed}")
        click.echo(f" Succeeded: {result.succeeded}")
        click.echo(f" Failed: {result.failed}")
        click.echo(f" Skipped: {result.skipped}")
        click.echo(f" Processing time: {result.processing_time:.2f}s")
        if result.failed > 0:
            click.echo(f"\n{result.failed} files failed:")
            limit = self._MAX_ERRORS_SHOWN
            for error in result.errors[:limit]:
                click.echo(f"{error.file_path}: {error.message}")
            if len(result.errors) > limit:
                click.echo(f" ... and {len(result.errors) - limit} more errors")
def create_file_processor(config: Dict[str, Any],
                          operation: ProcessingMode) -> Callable[[Path], ProcessingResult]:
    """Create a file processor function for the specified operation.

    Args:
        config: Configuration dictionary; its ``'database'`` entry is handed
            to ``DatabaseManager`` for the ingest and status operations.
        operation: Type of processing operation.

    Returns:
        Function that processes a single file and returns a
        ProcessingResult. That function does not raise for ordinary errors:
        any ``Exception`` (including an unsupported ``operation``) is
        reported as a failed result.
    """
    import time

    def _run_operation(file_path: Path) -> str:
        """Perform ``operation`` on one file and return the success message."""
        if operation == ProcessingMode.INGEST:
            from .database import DatabaseManager
            db_manager = DatabaseManager(config.get('database'))
            content = file_path.read_text(encoding='utf-8')
            db_manager.store_document(str(file_path), content)
            return "Ingested successfully"

        if operation == ProcessingMode.STATUS:
            from .database import DatabaseManager
            db_manager = DatabaseManager(config.get('database'))
            try:
                metadata = db_manager.get_metadata(str(file_path))
                return f"Found in database (ID: {metadata.get('id', 'Unknown')})"
            except Exception:
                # Any lookup failure is reported as "not found"; unlike a
                # bare except this lets KeyboardInterrupt/SystemExit through.
                return "Not found in database"

        if operation == ProcessingMode.VALIDATE:
            content = file_path.read_text(encoding='utf-8')
            # Basic validation only: the file must contain non-blank text.
            if not content.strip():
                raise ValueError("File is empty")
            return "Valid markdown file"

        raise ValueError(f"Unsupported operation: {operation}")

    def process_file(file_path: Path) -> ProcessingResult:
        """Process a single file based on the operation type."""
        start_time = time.time()
        try:
            message = _run_operation(file_path)
        except Exception as e:
            return ProcessingResult(
                file_path=file_path,
                success=False,
                message=f"Failed: {str(e)}",
                error=str(e),
                processing_time=time.time() - start_time
            )
        return ProcessingResult(
            file_path=file_path,
            success=True,
            message=message,
            processing_time=time.time() - start_time
        )

    return process_file

View File

@@ -28,6 +28,7 @@ import builtins
from .database import DatabaseManager
from .legacy_compat import LegacyMode, emit_deprecation_warning, legacy_switch_option
from .__version__ import get_version_info, get_release_info
from .batch_processor import BatchProcessor, ProcessingMode, ErrorHandling, create_file_processor
# Import legacy system components for advanced management
try:
@@ -4549,6 +4550,200 @@ def perf_history(config, limit, trend_days, output_format, output):
sys.exit(1)
# Batch Processing Commands - Issue #17
@cli.command(name='ingest-dir')
@click.argument('directory', type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path))
@click.option('--recursive', '-r', is_flag=True, help='Process directories recursively')
@click.option('--depth', type=int, help='Maximum depth for recursive processing')
@click.option('--pattern', default='*.md', help='File pattern to match (default: *.md)')
@click.option('--error-handling', type=click.Choice(['stop', 'continue', 'skip']),
              default='continue', help='Error handling strategy')
@click.option('--quiet', '-q', is_flag=True, help='Suppress progress output')
@pass_config
def ingest_dir(config, directory, recursive, depth, pattern, error_handling, quiet):
    """Process all Markdown files in directory.

    Ingests all markdown files found in the specified directory into the database.
    Supports recursive processing with depth control and flexible error handling.

    Examples:
        markitect ingest-dir ./docs
        markitect ingest-dir ./content --recursive --depth 3
        markitect ingest-dir ./articles --pattern "*.markdown" --error-handling stop
    """
    chatty = not quiet
    try:
        # The CLI choice strings are the lower-cased enum member names.
        strategy = ErrorHandling[error_handling.upper()]
        batch = BatchProcessor(error_handling=strategy, show_progress=chatty, max_depth=depth)

        if chatty:
            click.echo(f"🔍 Searching for files in {directory}...")
        matched = batch.find_markdown_files(
            directory=directory, pattern=pattern, recursive=recursive, depth=depth
        )
        if not matched:
            # Nothing to ingest is not an error; report it and finish normally.
            click.echo(f"📭 No files found matching pattern '{pattern}' in {directory}")
            return

        ingest_one = create_file_processor(config, ProcessingMode.INGEST)
        outcome = batch.process_files(matched, ingest_one, "Ingesting")

        # A non-zero exit status is produced only for the stop strategy.
        if strategy == ErrorHandling.STOP and outcome.failed > 0:
            sys.exit(1)
    except Exception as e:
        click.echo(f"Directory ingestion failed: {e}", err=True)
        if config.get('verbose'):
            import traceback
            click.echo(traceback.format_exc(), err=True)
        sys.exit(1)
@cli.command(name='batch-process')
@click.argument('pattern', type=str)
@click.option('--operation', type=click.Choice(['ingest', 'status', 'validate']),
              default='ingest', help='Operation to perform on matched files')
@click.option('--error-handling', type=click.Choice(['stop', 'continue', 'skip']),
              default='continue', help='Error handling strategy')
@click.option('--quiet', '-q', is_flag=True, help='Suppress progress output')
@pass_config
def batch_process(config, pattern, operation, error_handling, quiet):
    """Process files matching glob pattern.

    Uses glob patterns to find and process files. Supports various operations
    including ingestion, status checking, and validation.

    Examples:
        markitect batch-process "**/*.md" --operation ingest
        markitect batch-process "docs/**/*.markdown" --operation status
        markitect batch-process "./content/*.md" --operation validate --error-handling stop
    """
    # Progress labels per operation. Appending "ing" to the operation name
    # would yield "Statusing" and "Validateing".
    operation_labels = {
        'ingest': 'Ingesting',
        'status': 'Status-checking',
        'validate': 'Validating',
    }
    try:
        # Convert strings to enums (choices are the lower-cased member names).
        error_strategy = ErrorHandling[error_handling.upper()]
        processing_mode = ProcessingMode[operation.upper()]

        processor = BatchProcessor(
            error_handling=error_strategy,
            show_progress=not quiet
        )

        if not quiet:
            click.echo(f"🔍 Searching for files matching '{pattern}'...")
        files = processor.find_files_by_glob(pattern)
        if not files:
            click.echo(f"📭 No files found matching pattern '{pattern}'")
            return

        file_processor = create_file_processor(config, processing_mode)
        operation_name = operation_labels[operation]
        result = processor.process_files(files, file_processor, operation_name)

        # A non-zero exit status is produced only for the stop strategy.
        if result.failed > 0 and error_strategy == ErrorHandling.STOP:
            sys.exit(1)
    except Exception as e:
        click.echo(f"Batch processing failed: {e}", err=True)
        if config.get('verbose'):
            import traceback
            click.echo(traceback.format_exc(), err=True)
        sys.exit(1)
@cli.command(name='recursive')
@click.argument('directory', type=click.Path(exists=True, file_okay=False, dir_okay=True, path_type=Path))
@click.option('--depth', type=int, default=None, help='Maximum recursion depth')
@click.option('--operation', type=click.Choice(['ingest', 'status', 'validate']),
              default='status', help='Operation to perform')
@click.option('--pattern', default='*.md', help='File pattern to match (default: *.md)')
@click.option('--error-handling', type=click.Choice(['stop', 'continue', 'skip']),
              default='continue', help='Error handling strategy')
@click.option('--quiet', '-q', is_flag=True, help='Suppress progress output')
@pass_config
def recursive(config, directory, depth, operation, pattern, error_handling, quiet):
    """Recursive processing with depth control.

    Performs recursive operations on directory trees with configurable depth limits.
    This command provides fine-grained control over recursive processing behavior.

    Examples:
        markitect recursive ./docs --depth 2 --operation ingest
        markitect recursive ./content --depth 5 --operation status --pattern "*.markdown"
        markitect recursive ./src --operation validate --error-handling stop
    """
    # Progress labels per operation. Appending "ing" to the operation name
    # would yield "statusing" and "validateing".
    operation_labels = {
        'ingest': 'ingesting',
        'status': 'status-checking',
        'validate': 'validating',
    }
    try:
        # Convert strings to enums (choices are the lower-cased member names).
        error_strategy = ErrorHandling[error_handling.upper()]
        processing_mode = ProcessingMode[operation.upper()]

        processor = BatchProcessor(
            error_handling=error_strategy,
            show_progress=not quiet,
            max_depth=depth
        )

        if not quiet:
            # Compare against None: --depth 0 is a valid limit and must be shown.
            depth_str = f" (max depth: {depth})" if depth is not None else ""
            click.echo(f"🔍 Recursively searching {directory}{depth_str}...")
        files = processor.find_markdown_files(
            directory=directory,
            pattern=pattern,
            recursive=True,
            depth=depth
        )
        if not files:
            click.echo(f"📭 No files found matching pattern '{pattern}' in {directory}")
            return

        file_processor = create_file_processor(config, processing_mode)
        operation_name = f"Recursively {operation_labels[operation]}"
        result = processor.process_files(files, file_processor, operation_name)

        # A non-zero exit status is produced only for the stop strategy.
        if result.failed > 0 and error_strategy == ErrorHandling.STOP:
            sys.exit(1)
    except Exception as e:
        click.echo(f"Recursive processing failed: {e}", err=True)
        if config.get('verbose'):
            import traceback
            click.echo(traceback.format_exc(), err=True)
        sys.exit(1)
# Register issue management commands
cli.add_command(issues_group)

View File

@@ -0,0 +1,653 @@
"""
Tests for Issue #17: Batch Processing and Recursive Operations
This test suite verifies the batch processing functionality including:
- Directory processing with recursive support
- Glob pattern matching for file selection
- Progress tracking and error handling
- Depth control for recursive operations
"""
import pytest
import tempfile
import shutil
from pathlib import Path
from unittest.mock import Mock, patch, MagicMock
from click.testing import CliRunner
from markitect.batch_processor import (
BatchProcessor, ProcessingMode, ErrorHandling,
ProcessingResult, BatchResult, ProgressTracker,
create_file_processor
)
from markitect.cli import cli
class TestBatchProcessor:
    """Exercise file discovery and batch execution on BatchProcessor."""

    def setup_method(self):
        """Give every test its own scratch directory."""
        self.temp_dir = tempfile.mkdtemp()
        self.test_dir = Path(self.temp_dir)

    def teardown_method(self):
        """Remove the scratch directory again."""
        shutil.rmtree(self.temp_dir)

    def create_test_files(self, structure):
        """Materialise ``{relative path: content}`` below the scratch directory."""
        for relative_path, text in structure.items():
            target = self.test_dir / relative_path
            target.parent.mkdir(parents=True, exist_ok=True)
            target.write_text(text)

    @staticmethod
    def _processor_failing_on(bad_name):
        """Build a processor that fails for ``bad_name`` and succeeds otherwise."""
        def fake_processor(file_path):
            if file_path.name == bad_name:
                return ProcessingResult(
                    file_path=file_path,
                    success=False,
                    message="Processing failed",
                    error="Mock error"
                )
            return ProcessingResult(
                file_path=file_path,
                success=True,
                message="Processed successfully"
            )
        return fake_processor

    def test_find_markdown_files_non_recursive(self):
        """A non-recursive search only sees the top-level markdown files."""
        self.create_test_files({
            'file1.md': '# Test 1',
            'file2.md': '# Test 2',
            'file3.txt': 'Not markdown',
            'subdir/file4.md': '# Test 4'
        })

        found = BatchProcessor().find_markdown_files(self.test_dir, recursive=False)
        names = [path.name for path in found]

        assert len(found) == 2
        assert 'file1.md' in names
        assert 'file2.md' in names
        assert 'file4.md' not in names

    def test_find_markdown_files_recursive(self):
        """A recursive search reaches markdown files at every level."""
        self.create_test_files({
            'file1.md': '# Test 1',
            'subdir/file2.md': '# Test 2',
            'subdir/nested/file3.md': '# Test 3',
            'subdir/file4.txt': 'Not markdown'
        })

        found = BatchProcessor().find_markdown_files(self.test_dir, recursive=True)
        names = [path.name for path in found]

        assert len(found) == 3
        for expected in ('file1.md', 'file2.md', 'file3.md'):
            assert expected in names

    def test_find_markdown_files_with_depth_limit(self):
        """depth=1 stops the search one level below the start directory."""
        self.create_test_files({
            'file1.md': '# Test 1',
            'level1/file2.md': '# Test 2',
            'level1/level2/file3.md': '# Test 3',
            'level1/level2/level3/file4.md': '# Test 4'
        })

        found = BatchProcessor().find_markdown_files(self.test_dir, recursive=True, depth=1)
        names = [path.name for path in found]

        assert len(found) == 2
        assert 'file1.md' in names
        assert 'file2.md' in names
        assert 'file3.md' not in names
        assert 'file4.md' not in names

    def test_find_markdown_files_with_pattern(self):
        """A custom pattern replaces the default ``*.md`` filter."""
        self.create_test_files({
            'file1.md': '# Test 1',
            'file2.markdown': '# Test 2',
            'file3.txt': 'Not markdown'
        })

        found = BatchProcessor().find_markdown_files(self.test_dir, pattern='*.markdown')

        assert len(found) == 1
        assert found[0].name == 'file2.markdown'

    def test_find_files_by_glob(self):
        """Glob lookup handles both ``**`` and single-directory patterns."""
        self.create_test_files({
            'docs/file1.md': '# Test 1',
            'docs/subdir/file2.md': '# Test 2',
            'src/file3.md': '# Test 3',
            'file4.txt': 'Not markdown'
        })
        processor = BatchProcessor()

        everywhere = processor.find_files_by_glob(str(self.test_dir / "**/*.md"))
        assert len(everywhere) == 3

        docs_only = processor.find_files_by_glob(str(self.test_dir / "docs/*.md"))
        assert len(docs_only) == 1
        assert docs_only[0].name == 'file1.md'

    def test_process_files_success(self):
        """All counters reflect a run in which every file succeeds."""
        self.create_test_files({
            'file1.md': '# Test 1',
            'file2.md': '# Test 2'
        })
        files = list(self.test_dir.glob('*.md'))

        def always_ok(file_path):
            return ProcessingResult(
                file_path=file_path,
                success=True,
                message="Processed successfully"
            )

        result = BatchProcessor(show_progress=False).process_files(files, always_ok, "Testing")

        assert result.total_files == 2
        assert result.processed == 2
        assert result.succeeded == 2
        assert result.failed == 0
        assert result.skipped == 0

    def test_process_files_with_errors(self):
        """With CONTINUE, one failure is recorded and the rest still run."""
        self.create_test_files({
            'file1.md': '# Test 1',
            'file2.md': '# Test 2',
            'file3.md': '# Test 3'
        })
        files = list(self.test_dir.glob('*.md'))
        processor = BatchProcessor(show_progress=False, error_handling=ErrorHandling.CONTINUE)

        result = processor.process_files(files, self._processor_failing_on('file2.md'), "Testing")

        assert result.total_files == 3
        assert result.processed == 3
        assert result.succeeded == 2
        assert result.failed == 1
        assert len(result.errors) == 1

    def test_process_files_stop_on_error(self):
        """With STOP, processing ends right after the first failure."""
        self.create_test_files({
            'file1.md': '# Test 1',
            'file2.md': '# Test 2',
            'file3.md': '# Test 3'
        })
        # Sorted so that file1 succeeds before file2 fails.
        files = sorted(self.test_dir.glob('*.md'))
        processor = BatchProcessor(show_progress=False, error_handling=ErrorHandling.STOP)

        result = processor.process_files(files, self._processor_failing_on('file2.md'), "Testing")

        assert result.processed == 2  # file1 success, file2 error
        assert result.succeeded == 1
        assert result.failed == 1
class TestProgressTracker:
    """Check the counters maintained by ProgressTracker."""

    def test_progress_tracking(self):
        """Success, failure and skip each move the matching counter."""
        tracker = ProgressTracker(total=3, show_progress=False)

        # One successful file.
        tracker.update(ProcessingResult(Path("file1.md"), True, "Success"))
        assert tracker.processed == 1
        assert tracker.succeeded == 1
        assert tracker.failed == 0

        # One failing file.
        tracker.update(ProcessingResult(Path("file2.md"), False, "Failed", "Error message"))
        assert tracker.processed == 2
        assert tracker.succeeded == 1
        assert tracker.failed == 1

        # One skipped file.
        tracker.skip_file(Path("file3.md"), "Skipped reason")
        assert tracker.skipped == 1
class TestFileProcessor:
    """Check the per-file processors built by create_file_processor."""

    def setup_method(self):
        """Give every test its own scratch directory."""
        self.temp_dir = tempfile.mkdtemp()
        self.test_dir = Path(self.temp_dir)

    def teardown_method(self):
        """Remove the scratch directory again."""
        shutil.rmtree(self.temp_dir)

    def _write(self, name, text):
        """Create ``name`` with ``text`` in the scratch directory and return it."""
        target = self.test_dir / name
        target.write_text(text)
        return target

    def test_ingest_processor(self):
        """The ingest processor stores the document exactly once."""
        test_file = self._write("test.md", "# Test content")
        mock_db = Mock()

        with patch('markitect.database.DatabaseManager', return_value=mock_db):
            run = create_file_processor({'database': 'test.db'}, ProcessingMode.INGEST)
            result = run(test_file)

        assert result.success
        assert result.file_path == test_file
        assert "Ingested successfully" in result.message
        mock_db.store_document.assert_called_once()

    def test_status_processor(self):
        """The status processor reports a document the database knows."""
        test_file = self._write("test.md", "# Test content")
        mock_db = Mock()
        mock_db.get_metadata.return_value = {'id': 'test123'}

        with patch('markitect.database.DatabaseManager', return_value=mock_db):
            run = create_file_processor({'database': 'test.db'}, ProcessingMode.STATUS)
            result = run(test_file)

        assert result.success
        assert result.file_path == test_file
        assert "Found in database" in result.message

    def test_validate_processor(self):
        """A file with content passes validation."""
        test_file = self._write("test.md", "# Test content")

        run = create_file_processor({}, ProcessingMode.VALIDATE)
        result = run(test_file)

        assert result.success
        assert result.file_path == test_file
        assert "Valid markdown" in result.message

    def test_validate_processor_empty_file(self):
        """An empty file fails validation with an explanatory error."""
        test_file = self._write("empty.md", "")

        run = create_file_processor({}, ProcessingMode.VALIDATE)
        result = run(test_file)

        assert not result.success
        assert "File is empty" in result.error
class TestCLIIntegration:
    """Drive the batch commands through the click test runner."""

    def setup_method(self):
        """Create a scratch directory and a CLI runner for each test."""
        self.temp_dir = tempfile.mkdtemp()
        self.test_dir = Path(self.temp_dir)
        self.runner = CliRunner()

    def teardown_method(self):
        """Remove the scratch directory again."""
        shutil.rmtree(self.temp_dir)

    def create_test_files(self, structure):
        """Materialise ``{relative path: content}`` below the scratch directory."""
        for relative_path, text in structure.items():
            target = self.test_dir / relative_path
            target.parent.mkdir(parents=True, exist_ok=True)
            target.write_text(text)

    def test_ingest_dir_command(self):
        """ingest-dir without --recursive only ingests top-level files."""
        self.create_test_files({
            'file1.md': '# Test 1',
            'file2.md': '# Test 2',
            'subdir/file3.md': '# Test 3'
        })
        mock_db = Mock()

        with patch('markitect.database.DatabaseManager', return_value=mock_db):
            result = self.runner.invoke(cli, ['ingest-dir', str(self.test_dir), '--quiet'])

        assert result.exit_code == 0
        # Two files live directly in the directory.
        assert mock_db.store_document.call_count == 2

    def test_ingest_dir_recursive(self):
        """ingest-dir --recursive ingests nested files as well."""
        self.create_test_files({
            'file1.md': '# Test 1',
            'subdir/file2.md': '# Test 2',
            'subdir/nested/file3.md': '# Test 3'
        })
        mock_db = Mock()

        with patch('markitect.database.DatabaseManager', return_value=mock_db):
            result = self.runner.invoke(cli, [
                'ingest-dir', str(self.test_dir),
                '--recursive',
                '--quiet'
            ])

        assert result.exit_code == 0
        assert mock_db.store_document.call_count == 3

    def test_batch_process_command(self):
        """batch-process ingests exactly the files its glob matches."""
        self.create_test_files({
            'docs/file1.md': '# Test 1',
            'docs/file2.md': '# Test 2',
            'src/file3.md': '# Test 3'
        })
        mock_db = Mock()
        docs_glob = str(self.test_dir / "docs/*.md")

        with patch('markitect.database.DatabaseManager', return_value=mock_db):
            result = self.runner.invoke(cli, [
                'batch-process', docs_glob,
                '--operation', 'ingest',
                '--quiet'
            ])

        assert result.exit_code == 0
        # Only the two files under docs/ match.
        assert mock_db.store_document.call_count == 2

    def test_recursive_command(self):
        """recursive --depth 2 checks status for files down to depth 2."""
        self.create_test_files({
            'level1/file1.md': '# Test 1',
            'level1/level2/file2.md': '# Test 2',
            'level1/level2/level3/file3.md': '# Test 3'
        })
        mock_db = Mock()
        mock_db.get_metadata.side_effect = Exception("Not found")

        with patch('markitect.database.DatabaseManager', return_value=mock_db):
            result = self.runner.invoke(cli, [
                'recursive', str(self.test_dir),
                '--depth', '2',
                '--operation', 'status',
                '--quiet'
            ])

        assert result.exit_code == 0
        assert mock_db.get_metadata.call_count == 2

    def test_error_handling_stop(self):
        """The stop strategy still exits cleanly when nothing is found."""
        # The scratch directory is left empty on purpose.
        result = self.runner.invoke(cli, [
            'ingest-dir', str(self.test_dir),
            '--error-handling', 'stop',
            '--quiet'
        ])

        assert result.exit_code == 0

    def test_invalid_directory(self):
        """A missing directory is rejected by click's argument validation."""
        result = self.runner.invoke(cli, [
            'ingest-dir', '/nonexistent/directory',
            '--quiet'
        ])

        assert result.exit_code == 2  # Click argument validation error

    def test_custom_pattern(self):
        """--pattern restricts ingestion to matching file names."""
        self.create_test_files({
            'file1.md': '# Test 1',
            'file2.markdown': '# Test 2',
            'file3.txt': 'Not markdown'
        })
        mock_db = Mock()

        with patch('markitect.database.DatabaseManager', return_value=mock_db):
            result = self.runner.invoke(cli, [
                'ingest-dir', str(self.test_dir),
                '--pattern', '*.markdown',
                '--quiet'
            ])

        assert result.exit_code == 0
        # Only the single .markdown file qualifies.
        assert mock_db.store_document.call_count == 1
class TestErrorHandling:
    """Check how file discovery reacts to unusable directories."""

    def setup_method(self):
        """Give every test its own scratch directory."""
        self.temp_dir = tempfile.mkdtemp()
        self.test_dir = Path(self.temp_dir)

    def teardown_method(self):
        """Remove the scratch directory again."""
        shutil.rmtree(self.temp_dir)

    def test_permission_error_handling(self):
        """A PermissionError while listing yields an empty result, not a crash."""
        processor = BatchProcessor(show_progress=False)
        denied = PermissionError("Permission denied")

        # Make every directory listing fail.
        with patch('pathlib.Path.iterdir', side_effect=denied):
            found = processor.find_markdown_files(self.test_dir)

        assert found == []

    def test_nonexistent_directory(self):
        """A missing directory raises FileNotFoundError."""
        missing = Path("/nonexistent/directory")
        with pytest.raises(FileNotFoundError):
            BatchProcessor().find_markdown_files(missing)

    def test_file_as_directory(self):
        """Passing a regular file raises NotADirectoryError."""
        plain_file = self.test_dir / "test.md"
        plain_file.write_text("# Test")

        with pytest.raises(NotADirectoryError):
            BatchProcessor().find_markdown_files(plain_file)
class TestEdgeCases:
    """Boundary conditions for discovery and processing."""

    def setup_method(self):
        """Give every test its own scratch directory."""
        self.temp_dir = tempfile.mkdtemp()
        self.test_dir = Path(self.temp_dir)

    def teardown_method(self):
        """Remove the scratch directory again."""
        shutil.rmtree(self.temp_dir)

    def test_empty_directory(self):
        """An empty directory produces an empty file list."""
        assert BatchProcessor().find_markdown_files(self.test_dir) == []

    def test_hidden_directories(self):
        """Files inside dot-directories are not discovered."""
        hidden_dir = self.test_dir / ".hidden"
        hidden_dir.mkdir()
        (hidden_dir / "test.md").write_text("# Hidden")

        found = BatchProcessor().find_markdown_files(self.test_dir, recursive=True)

        assert len(found) == 0

    def test_depth_zero(self):
        """depth=0 limits a recursive search to the start directory."""
        (self.test_dir / "file1.md").write_text("# Test 1")
        nested = self.test_dir / "subdir"
        nested.mkdir()
        (nested / "file2.md").write_text("# Test 2")

        found = BatchProcessor().find_markdown_files(self.test_dir, recursive=True, depth=0)

        # Only the file next to the start directory qualifies.
        assert len(found) == 1
        assert found[0].name == "file1.md"

    def test_very_deep_structure(self):
        """A depth limit is honoured inside a ten-level tree."""
        (self.test_dir / "file_root.md").write_text("# Root Test")
        cursor = self.test_dir
        for level in range(10):
            cursor = cursor / f"level{level}"
            cursor.mkdir()
            (cursor / f"file{level}.md").write_text(f"# Test {level}")

        found = BatchProcessor().find_markdown_files(self.test_dir, recursive=True, depth=5)

        # Root (depth 0) + levels 0-4 (depths 1-5) = 6 files
        assert len(found) == 6

    def test_glob_with_no_matches(self):
        """A glob that matches nothing returns an empty list."""
        pattern = str(self.test_dir / "*.nonexistent")
        assert BatchProcessor().find_files_by_glob(pattern) == []

    def test_file_deleted_during_processing(self):
        """A file that vanished before processing is counted as skipped."""
        doomed = self.test_dir / "test.md"
        doomed.write_text("# Test")

        def never_reached(file_path):
            # The existence check in process_files happens before this call.
            return ProcessingResult(file_path, True, "Processed")

        processor = BatchProcessor(show_progress=False)
        files = [doomed]
        # Remove the file after the list was built but before the run.
        doomed.unlink()

        result = processor.process_files(files, never_reached, "Testing")

        assert result.skipped == 1
        assert result.processed == 0