feat(prompts): implement Phase 4 - Execution Engine (FR-4, FR-5)

Implement three-stage execution lifecycle with idempotent runs and complete provenance tracking via RunManifest.

Core Features:
- PromptRun model with execution lifecycle stages:
  1. Analysis: Template analysis and macro extraction
  2. Compilation: Macro resolution and context compilation
  3. Processing: LLM execution and output generation
- InputBundleHash for deterministic idempotency (FR-4.3)
- RunManifest for complete execution provenance (FR-5)
- LLMAdapter interface for pluggable model providers
- MockLLMAdapter for testing without API calls
- PromptExecutionEngine orchestrating full lifecycle

Idempotent Execution (FR-4.4):
- Calculate SHA-256 hash of complete input context
- Skip execution if identical hash exists
- Cache successful runs by hash
- Support force re-execution via config flag

RunManifest Tracking (FR-5.2):
- Template metadata (id, name, digest)
- Resolved input artifacts and digests
- Compiled prompt digest
- Model configuration
- Output artifacts
- Dependency edges for graph construction
- Timing metadata for performance analysis

Tests (27 passing):
- 17 execution model tests (config, bundle, runs, stages)
- 10 engine tests (execution, idempotency, errors, caching)

Implements:
- FR-4.1: Three-stage execution lifecycle
- FR-4.2: CompiledPrompt during compilation
- FR-4.3: InputBundleHash calculation
- FR-4.4: Skip execution for identical hashes
- FR-5.1: RunManifest persistence
- FR-5.2: Complete manifest contents
- FR-5.3: Nested run linking (foundation)

Files Created:
- markitect/prompts/execution/models.py
- markitect/prompts/execution/manifest.py
- markitect/prompts/execution/llm_adapter.py
- markitect/prompts/execution/engine.py
- migrations/prompts/003_create_runs_and_manifests.sql
- tests/unit/prompts/test_execution_models.py
- tests/unit/prompts/test_execution_engine.py

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-08 23:15:33 +01:00
parent 5f463e5b20
commit c56c92c815
8 changed files with 1739 additions and 0 deletions
--- a/tests/unit/prompts/test_execution_engine.py
+++ b/tests/unit/prompts/test_execution_engine.py
@@ -0,0 +1,368 @@
+"""Unit tests for PromptExecutionEngine."""
+
+import pytest
+import tempfile
+from pathlib import Path
+
+from markitect.prompts.templates.models import PromptTemplate
+from markitect.prompts.templates.analyzer import TemplateAnalyzer
+from markitect.prompts.resolver.resolver import PromptResolver
+from markitect.prompts.resolver.compiler import ContextCompiler
+from markitect.prompts.resolver.strategy import (
+    MultiSpaceResolutionStrategy,
+    ResolutionConfig,
+)
+from markitect.prompts.execution.engine import PromptExecutionEngine
+from markitect.prompts.execution.models import RunConfig, RunStatus, ExecutionStage
+from markitect.prompts.execution.llm_adapter import MockLLMAdapter, ErrorLLMAdapter
+from markitect.prompts.services.artifact_service import ArtifactService
+from markitect.prompts.repositories.sqlite import SQLiteArtifactRepository
+
+
+@pytest.fixture
+def temp_db():
+    """Yield the path of a temporary ``.db`` file and delete it on teardown."""
+    with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
+        db_path = f.name
+    yield db_path
+    Path(db_path).unlink(missing_ok=True)
+
+
+@pytest.fixture
+def artifact_service(temp_db):
+    """Create an ArtifactService backed by a SQLite repository at ``temp_db``."""
+    repository = SQLiteArtifactRepository(temp_db)
+    return ArtifactService(repository)
+
+
+@pytest.fixture
+def analyzer():
+    """Create a TemplateAnalyzer constructed with no arguments."""
+    return TemplateAnalyzer()
+
+
+@pytest.fixture
+def resolver(artifact_service):
+    """Create a PromptResolver using a MultiSpaceResolutionStrategy."""
+    strategy = MultiSpaceResolutionStrategy()
+    return PromptResolver(artifact_service, strategy)
+
+
+@pytest.fixture
+def compiler():
+    """Create a ContextCompiler constructed with no arguments."""
+    return ContextCompiler()
+
+
+@pytest.fixture
+def mock_llm():
+    """Create a MockLLMAdapter configured with a fixed mock response."""
+    return MockLLMAdapter(mock_response="Mock LLM output")
+
+
+@pytest.fixture
+def engine(artifact_service, analyzer, resolver, compiler, mock_llm):
+    """Create a PromptExecutionEngine wired to the mock LLM adapter."""
+    return PromptExecutionEngine(
+        artifact_service,
+        analyzer,
+        resolver,
+        compiler,
+        mock_llm,
+    )
+
+
+class TestPromptExecutionEngine:
+    """Tests for PromptExecutionEngine execution, idempotency, errors, caching."""
+
+    def test_execute_simple_template(
+        self, engine, artifact_service, mock_llm
+    ):
+        """Test a template with one resolvable dependency runs to completion."""
+        # Create the artifact named by the template's require macro
+        artifact_service.create_artifact(
+            space_id="space-1",
+            name="intro",
+            content="Introduction text",
+        )
+
+        # Create template
+        content = "# Document\n{{require:intro}}\nMore content"
+        template = PromptTemplate.create(
+            space_id="space-1",
+            name="doc",
+            content=content,
+        )
+
+        # Execute
+        resolution_config = ResolutionConfig(space_id="space-1")
+        run = engine.execute(template, content, resolution_config)
+
+        # Verify run
+        assert run.status == RunStatus.SUCCESS
+        assert run.stage == ExecutionStage.COMPLETE
+        assert run.completed_at is not None
+        assert "output_artifact_id" in run.metadata
+
+        # Verify the LLM adapter was called exactly once and saw a prompt
+        assert mock_llm.call_count == 1
+        assert mock_llm.last_prompt is not None
+
+    def test_execute_with_failed_resolution(self, engine):
+        """Test execution fails when a required dependency is missing."""
+        # Create template whose dependency was never created as an artifact
+        content = "{{require:missing-dep}}"
+        template = PromptTemplate.create(
+            space_id="space-1",
+            name="test",
+            content=content,
+        )
+
+        resolution_config = ResolutionConfig(space_id="space-1")
+        run = engine.execute(template, content, resolution_config)
+
+        # Should fail during resolution
+        assert run.status == RunStatus.FAILED
+        assert "Resolution failed" in run.error_message
+
+    def test_idempotent_execution_skips_duplicate(
+        self, engine, artifact_service
+    ):
+        """Test a repeat run with identical inputs is skipped (FR-4.4)."""
+        # Create dependency
+        artifact_service.create_artifact(
+            space_id="space-1",
+            name="dep",
+            content="Dependency",
+        )
+
+        content = "{{require:dep}}"
+        template = PromptTemplate.create(
+            space_id="space-1",
+            name="test",
+            content=content,
+        )
+
+        resolution_config = ResolutionConfig(space_id="space-1")
+        config = RunConfig(skip_if_exists=True)
+
+        # First execution
+        run1 = engine.execute(template, content, resolution_config, config)
+        assert run1.status == RunStatus.SUCCESS
+
+        # Second execution with same inputs
+        run2 = engine.execute(template, content, resolution_config, config)
+
+        # Should be skipped
+        assert run2.status == RunStatus.SKIPPED
+        assert run2.id != run1.id  # Different run
+        assert run2.input_bundle_hash == run1.input_bundle_hash  # Same hash
+        assert "skipped_due_to" in run2.metadata
+
+    def test_execution_without_skip_reruns(
+        self, engine, artifact_service
+    ):
+        """Test identical inputs run again when skip_if_exists is False."""
+        artifact_service.create_artifact(
+            space_id="space-1",
+            name="dep",
+            content="Dependency",
+        )
+
+        content = "{{require:dep}}"
+        template = PromptTemplate.create(
+            space_id="space-1",
+            name="test",
+            content=content,
+        )
+
+        resolution_config = ResolutionConfig(space_id="space-1")
+        config = RunConfig(skip_if_exists=False)
+
+        # First execution
+        run1 = engine.execute(template, content, resolution_config, config)
+        assert run1.status == RunStatus.SUCCESS
+
+        # Second execution
+        run2 = engine.execute(template, content, resolution_config, config)
+
+        # Should execute again (not skipped)
+        assert run2.status == RunStatus.SUCCESS
+        assert run2.id != run1.id
+
+    def test_input_bundle_hash_changes_with_template(
+        self, engine, artifact_service, mock_llm
+    ):
+        """Test the input bundle hash changes when template content changes."""
+        artifact_service.create_artifact(
+            space_id="space-1",
+            name="dep",
+            content="Dependency",
+        )
+
+        # First template
+        content1 = "{{require:dep}} version 1"
+        template1 = PromptTemplate.create(
+            space_id="space-1",
+            name="test",
+            content=content1,
+        )
+
+        resolution_config = ResolutionConfig(space_id="space-1")
+        run1 = engine.execute(template1, content1, resolution_config)
+
+        # Different template content
+        content2 = "{{require:dep}} version 2"
+        template2 = PromptTemplate.create(
+            space_id="space-1",
+            name="test",
+            content=content2,
+        )
+
+        run2 = engine.execute(template2, content2, resolution_config)
+
+        # Different hashes, both execute
+        assert run1.input_bundle_hash != run2.input_bundle_hash
+        assert run1.status == RunStatus.SUCCESS
+        assert run2.status == RunStatus.SUCCESS
+
+    def test_input_bundle_hash_changes_with_dependencies(
+        self, engine, artifact_service
+    ):
+        """Test the input bundle hash changes when dependency content changes."""
+        # Create initial dependency
+        dep = artifact_service.create_artifact(
+            space_id="space-1",
+            name="dep",
+            content="Original content",
+        )
+
+        content = "{{require:dep}}"
+        template = PromptTemplate.create(
+            space_id="space-1",
+            name="test",
+            content=content,
+        )
+
+        resolution_config = ResolutionConfig(space_id="space-1")
+        run1 = engine.execute(template, content, resolution_config)
+
+        # Update dependency content between the two runs
+        artifact_service.update_artifact_content(dep.id, "Modified content")
+
+        run2 = engine.execute(template, content, resolution_config)
+
+        # Hashes should differ
+        assert run1.input_bundle_hash != run2.input_bundle_hash
+
+    def test_execution_creates_manifest(
+        self, engine, artifact_service
+    ):
+        """Test a run stores its RunManifest data under metadata["manifest"]."""
+        artifact_service.create_artifact(
+            space_id="space-1",
+            name="dep",
+            content="Dependency",
+        )
+
+        content = "{{require:dep}}"
+        template = PromptTemplate.create(
+            space_id="space-1",
+            name="test",
+            content=content,
+        )
+
+        resolution_config = ResolutionConfig(space_id="space-1")
+        run = engine.execute(template, content, resolution_config)
+
+        # Check manifest in metadata: one resolved input, one output artifact
+        assert "manifest" in run.metadata
+        manifest_data = run.metadata["manifest"]
+        assert manifest_data["run_id"] == run.id
+        assert len(manifest_data["resolved_inputs"]) == 1
+        assert len(manifest_data["output_artifacts"]) == 1
+        assert "timing_metadata" in manifest_data
+
+    def test_execution_with_llm_error(
+        self, artifact_service, analyzer, resolver, compiler
+    ):
+        """Test a run fails with the adapter's message when using ErrorLLMAdapter."""
+        # Create engine with error adapter instead of the shared engine fixture
+        error_llm = ErrorLLMAdapter("LLM service unavailable")
+        engine = PromptExecutionEngine(
+            artifact_service,
+            analyzer,
+            resolver,
+            compiler,
+            error_llm,
+        )
+
+        artifact_service.create_artifact(
+            space_id="space-1",
+            name="dep",
+            content="Dependency",
+        )
+
+        content = "{{require:dep}}"
+        template = PromptTemplate.create(
+            space_id="space-1",
+            name="test",
+            content=content,
+        )
+
+        resolution_config = ResolutionConfig(space_id="space-1")
+        run = engine.execute(template, content, resolution_config)
+
+        # Should fail during processing
+        assert run.status == RunStatus.FAILED
+        assert "LLM service unavailable" in run.error_message
+
+    def test_get_run_by_hash(self, engine, artifact_service):
+        """Test a finished run can be looked up by its input bundle hash."""
+        artifact_service.create_artifact(
+            space_id="space-1",
+            name="dep",
+            content="Dependency",
+        )
+
+        content = "{{require:dep}}"
+        template = PromptTemplate.create(
+            space_id="space-1",
+            name="test",
+            content=content,
+        )
+
+        resolution_config = ResolutionConfig(space_id="space-1")
+        run = engine.execute(template, content, resolution_config)
+
+        # Retrieve by hash
+        cached = engine.get_run_by_hash(run.input_bundle_hash)
+        assert cached is not None
+        assert cached.id == run.id
+
+    def test_clear_cache(self, engine, artifact_service):
+        """Test clear_cache() removes runs previously retrievable by hash."""
+        artifact_service.create_artifact(
+            space_id="space-1",
+            name="dep",
+            content="Dependency",
+        )
+
+        content = "{{require:dep}}"
+        template = PromptTemplate.create(
+            space_id="space-1",
+            name="test",
+            content=content,
+        )
+
+        resolution_config = ResolutionConfig(space_id="space-1")
+        run = engine.execute(template, content, resolution_config)
+
+        # Cache should have run
+        assert engine.get_run_by_hash(run.input_bundle_hash) is not None
+
+        # Clear cache
+        engine.clear_cache()
+
+        # Should be gone
+        assert engine.get_run_by_hash(run.input_bundle_hash) is None
--- a/tests/unit/prompts/test_execution_models.py
+++ b/tests/unit/prompts/test_execution_models.py
@@ -0,0 +1,240 @@
+"""Unit tests for execution models."""
+
+import pytest
+from markitect.prompts.execution.models import (
+    RunConfig,
+    InputBundle,
+    LLMResponse,
+    PromptRun,
+    ExecutionStage,
+    RunStatus,
+)
+
+
+class TestRunConfig:
+    """Tests for RunConfig defaults, overrides and dict (de)serialization."""
+
+    def test_create_default_config(self):
+        """Test the default values of a RunConfig created with no arguments."""
+        config = RunConfig()
+        assert config.model_name == "gpt-4"
+        assert config.temperature == 0.7
+        assert config.max_tokens == 2000
+        assert config.max_depth == 3
+        assert config.skip_if_exists is True
+
+    def test_create_custom_config(self):
+        """Test explicitly passed values are stored on the RunConfig."""
+        config = RunConfig(
+            model_name="gpt-3.5-turbo",
+            temperature=0.5,
+            max_tokens=1000,
+            max_depth=5,
+            skip_if_exists=False,
+        )
+        assert config.model_name == "gpt-3.5-turbo"
+        assert config.temperature == 0.5
+        assert config.max_tokens == 1000
+        assert config.max_depth == 5
+        assert config.skip_if_exists is False
+
+    def test_config_to_dict(self):
+        """Test to_dict() includes the model name and a temperature key."""
+        config = RunConfig(model_name="test-model")
+        data = config.to_dict()
+        assert data["model_name"] == "test-model"
+        assert "temperature" in data
+
+    def test_config_from_dict(self):
+        """Test from_dict() restores the supplied fields."""
+        data = {
+            "model_name": "custom-model",
+            "temperature": 0.9,
+            "max_tokens": 500,
+        }
+        config = RunConfig.from_dict(data)
+        assert config.model_name == "custom-model"
+        assert config.temperature == 0.9
+        assert config.max_tokens == 500
+
+class TestInputBundle:
+    """Tests for InputBundle construction and hash calculation."""
+
+    def test_create_input_bundle(self):
+        """Test an InputBundle stores the values it is constructed with."""
+        bundle = InputBundle(
+            template_digest="abc123",
+            dependency_digests={"dep1": "def456", "dep2": "ghi789"},
+            resolution_config_hash="config123",
+            model_config={"model": "gpt-4", "temp": 0.7},
+        )
+        assert bundle.template_digest == "abc123"
+        assert len(bundle.dependency_digests) == 2
+
+    def test_calculate_hash_deterministic(self):
+        """Test the hash is independent of dependency_digests key order."""
+        bundle1 = InputBundle(
+            template_digest="abc",
+            dependency_digests={"a": "1", "b": "2"},
+            resolution_config_hash="conf",
+            model_config={"model": "gpt-4"},
+        )
+        bundle2 = InputBundle(
+            template_digest="abc",
+            dependency_digests={"b": "2", "a": "1"},  # Different order
+            resolution_config_hash="conf",
+            model_config={"model": "gpt-4"},
+        )
+        # Should produce same hash regardless of dict order
+        assert bundle1.calculate_hash() == bundle2.calculate_hash()
+
+    def test_calculate_hash_changes_with_content(self):
+        """Test the hash changes when only the template digest changes."""
+        bundle1 = InputBundle(
+            template_digest="abc",
+            dependency_digests={},
+            resolution_config_hash="conf",
+            model_config={},
+        )
+        bundle2 = InputBundle(
+            template_digest="xyz",  # Different template
+            dependency_digests={},
+            resolution_config_hash="conf",
+            model_config={},
+        )
+        assert bundle1.calculate_hash() != bundle2.calculate_hash()
+
+    def test_to_dict_includes_hash(self):
+        """Test to_dict() includes the calculated input_bundle_hash."""
+        bundle = InputBundle(
+            template_digest="abc",
+            dependency_digests={},
+            resolution_config_hash="conf",
+            model_config={},
+        )
+        data = bundle.to_dict()
+        assert "input_bundle_hash" in data
+        assert data["input_bundle_hash"] == bundle.calculate_hash()
+
+class TestLLMResponse:
+    """Tests for LLMResponse construction and serialization."""
+
+    def test_create_response(self):
+        """Test an LLMResponse exposes its content, model and usage."""
+        response = LLMResponse(
+            content="Generated text",
+            model="gpt-4",
+            usage={"total_tokens": 100},
+            finish_reason="stop",
+        )
+        assert response.content == "Generated text"
+        assert response.model == "gpt-4"
+        assert response.usage["total_tokens"] == 100
+
+    def test_response_to_dict(self):
+        """Test to_dict() includes content and model."""
+        response = LLMResponse(
+            content="Text",
+            model="gpt-4",
+        )
+        data = response.to_dict()
+        assert data["content"] == "Text"
+        assert data["model"] == "gpt-4"
+
+class TestPromptRun:
+    """Tests for PromptRun creation, state transitions and serialization."""
+
+    def test_create_run(self):
+        """Test a new run starts pending at depth 0."""
+        run = PromptRun.create(
+            template_id="template-1",
+            input_bundle_hash="hash123",
+        )
+        assert run.id  # Non-empty id was assigned
+        assert run.template_id == "template-1"
+        assert run.input_bundle_hash == "hash123"
+        assert run.status == RunStatus.PENDING
+        assert run.stage == ExecutionStage.PENDING
+        assert run.depth == 0
+
+    def test_create_nested_run(self):
+        """Test a nested run records its parent run id and depth."""
+        run = PromptRun.create(
+            template_id="template-1",
+            input_bundle_hash="hash",
+            parent_run_id="parent-123",
+            depth=2,
+        )
+        assert run.parent_run_id == "parent-123"
+        assert run.depth == 2
+
+    def test_advance_stage(self):
+        """Test advance_stage() updates the stage and the run becomes running."""
+        run = PromptRun.create(
+            template_id="template-1",
+            input_bundle_hash="hash",
+        )
+        assert run.stage == ExecutionStage.PENDING
+
+        run.advance_stage(ExecutionStage.ANALYSIS)
+        assert run.stage == ExecutionStage.ANALYSIS
+
+        run.advance_stage(ExecutionStage.PROCESSING)
+        assert run.stage == ExecutionStage.PROCESSING
+        assert run.status == RunStatus.RUNNING
+
+    def test_mark_complete(self):
+        """Test mark_complete() sets success status, stage and completion time."""
+        run = PromptRun.create(
+            template_id="template-1",
+            input_bundle_hash="hash",
+        )
+        run.mark_complete()
+
+        assert run.status == RunStatus.SUCCESS
+        assert run.stage == ExecutionStage.COMPLETE
+        assert run.completed_at is not None
+        assert run.is_complete()
+
+    def test_mark_failed(self):
+        """Test mark_failed() records the error message and completes the run."""
+        run = PromptRun.create(
+            template_id="template-1",
+            input_bundle_hash="hash",
+        )
+        run.mark_failed("Error message")
+
+        assert run.status == RunStatus.FAILED
+        assert run.stage == ExecutionStage.FAILED
+        assert run.error_message == "Error message"
+        assert run.completed_at is not None
+        assert run.is_complete()
+
+    def test_mark_skipped(self):
+        """Test mark_skipped() sets skipped status and completes the run."""
+        run = PromptRun.create(
+            template_id="template-1",
+            input_bundle_hash="hash",
+        )
+        run.mark_skipped()
+
+        assert run.status == RunStatus.SKIPPED
+        assert run.completed_at is not None
+        assert run.is_complete()
+
+    def test_run_to_dict(self):
+        """Test to_dict() serializes id, template id, hash, status and stage."""
+        run = PromptRun.create(
+            template_id="template-1",
+            input_bundle_hash="hash",
+        )
+        data = run.to_dict()
+
+        assert data["id"] == run.id
+        assert data["template_id"] == "template-1"
+        assert data["input_bundle_hash"] == "hash"
+        assert data["status"] == "pending"
+        assert data["stage"] == "pending"