feat(prompts): implement Phase 4 - Execution Engine (FR-4, FR-5)
Some checks failed
Test Suite / unit-tests (3.11) (push) Has been cancelled
Test Suite / security-scan (push) Has been cancelled
Test Suite / unit-tests (3.12) (push) Has been cancelled
Test Suite / integration-tests (push) Has been cancelled
Test Suite / e2e-tests (push) Has been cancelled
Test Suite / performance-tests (push) Has been cancelled
Test Suite / code-quality (push) Has been cancelled
Test Suite / test-summary (push) Has been cancelled

Implement three-stage execution lifecycle with idempotent runs and complete
provenance tracking via RunManifest.

Core Features:
- PromptRun model with execution lifecycle stages:
  1. Analysis: Template analysis and macro extraction
  2. Compilation: Macro resolution and context compilation
  3. Processing: LLM execution and output generation
- InputBundleHash for deterministic idempotency (FR-4.3)
- RunManifest for complete execution provenance (FR-5)
- LLMAdapter interface for pluggable model providers
- MockLLMAdapter for testing without API calls
- PromptExecutionEngine orchestrating full lifecycle

Idempotent Execution (FR-4.4):
- Calculate SHA-256 hash of complete input context
- Skip execution if identical hash exists
- Cache successful runs by hash
- Support force re-execution via config flag

RunManifest Tracking (FR-5.2):
- Template metadata (id, name, digest)
- Resolved input artifacts and digests
- Compiled prompt digest
- Model configuration
- Output artifacts
- Dependency edges for graph construction
- Timing metadata for performance analysis

Tests (27 passing):
- 17 execution model tests (config, bundle, runs, stages)
- 10 engine tests (execution, idempotency, errors, caching)

Implements:
- FR-4.1: Three-stage execution lifecycle
- FR-4.2: CompiledPrompt during compilation
- FR-4.3: InputBundleHash calculation
- FR-4.4: Skip execution for identical hashes
- FR-5.1: RunManifest persistence
- FR-5.2: Complete manifest contents
- FR-5.3: Nested run linking (foundation)

Files Created:
- markitect/prompts/execution/models.py
- markitect/prompts/execution/manifest.py
- markitect/prompts/execution/llm_adapter.py
- markitect/prompts/execution/engine.py
- migrations/prompts/003_create_runs_and_manifests.sql
- tests/unit/prompts/test_execution_models.py
- tests/unit/prompts/test_execution_engine.py

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-08 23:15:33 +01:00
parent 5f463e5b20
commit c56c92c815
8 changed files with 1739 additions and 0 deletions

View File

@@ -0,0 +1,368 @@
"""Unit tests for PromptExecutionEngine."""
import pytest
import tempfile
from pathlib import Path
from markitect.prompts.templates.models import PromptTemplate
from markitect.prompts.templates.analyzer import TemplateAnalyzer
from markitect.prompts.resolver.resolver import PromptResolver
from markitect.prompts.resolver.compiler import ContextCompiler
from markitect.prompts.resolver.strategy import (
MultiSpaceResolutionStrategy,
ResolutionConfig,
)
from markitect.prompts.execution.engine import PromptExecutionEngine
from markitect.prompts.execution.models import RunConfig, RunStatus, ExecutionStage
from markitect.prompts.execution.llm_adapter import MockLLMAdapter, ErrorLLMAdapter
from markitect.prompts.services.artifact_service import ArtifactService
from markitect.prompts.repositories.sqlite import SQLiteArtifactRepository
@pytest.fixture
def temp_db():
    """Yield the path of a throwaway ``.db`` file and remove it afterwards.

    The file is created with ``delete=False`` so it survives the closing of
    the temporary file handle; it is unlinked once the test is finished.
    """
    handle = tempfile.NamedTemporaryFile(suffix=".db", delete=False)
    with handle:
        database_file = handle.name

    yield database_file

    # missing_ok: the file may already be gone by the time teardown runs.
    Path(database_file).unlink(missing_ok=True)
@pytest.fixture
def artifact_service(temp_db):
    """Provide an ArtifactService wrapping a SQLite repository built from ``temp_db``."""
    return ArtifactService(SQLiteArtifactRepository(temp_db))
@pytest.fixture
def analyzer():
    """Provide a new TemplateAnalyzer instance."""
    template_analyzer = TemplateAnalyzer()
    return template_analyzer
@pytest.fixture
def resolver(artifact_service):
    """Provide a PromptResolver using the artifact service and a multi-space strategy."""
    return PromptResolver(artifact_service, MultiSpaceResolutionStrategy())
@pytest.fixture
def compiler():
    """Provide a new ContextCompiler instance."""
    context_compiler = ContextCompiler()
    return context_compiler
@pytest.fixture
def mock_llm():
    """Provide a MockLLMAdapter configured with a fixed mock response."""
    adapter = MockLLMAdapter(mock_response="Mock LLM output")
    return adapter
@pytest.fixture
def engine(artifact_service, analyzer, resolver, compiler, mock_llm):
    """Provide a PromptExecutionEngine wired to the fixture collaborators.

    The collaborators are passed positionally, in the order: artifact
    service, analyzer, resolver, compiler, LLM adapter.
    """
    collaborators = (artifact_service, analyzer, resolver, compiler, mock_llm)
    return PromptExecutionEngine(*collaborators)
class TestPromptExecutionEngine:
    """Behavioural tests for PromptExecutionEngine.execute and its run cache."""

    # -- helpers (not collected by pytest: names do not start with "test") --

    @staticmethod
    def _add_artifact(service, name, content):
        """Create an artifact in ``space-1`` through ``service`` and return it."""
        return service.create_artifact(
            space_id="space-1",
            name=name,
            content=content,
        )

    @staticmethod
    def _make_template(name, content):
        """Build a PromptTemplate in ``space-1`` with the given name and content."""
        return PromptTemplate.create(
            space_id="space-1",
            name=name,
            content=content,
        )

    @staticmethod
    def _resolution():
        """Build the ResolutionConfig used by every test (``space-1``)."""
        return ResolutionConfig(space_id="space-1")

    # -- tests ---------------------------------------------------------------

    def test_execute_simple_template(self, engine, artifact_service, mock_llm):
        """A template with one satisfied requirement runs to completion."""
        # The dependency the template requires.
        self._add_artifact(artifact_service, "intro", "Introduction text")

        body = "# Document\n{{require:intro}}\nMore content"
        template = self._make_template("doc", body)

        result = engine.execute(template, body, self._resolution())

        # The run finished successfully and recorded an output artifact.
        assert result.status == RunStatus.SUCCESS
        assert result.stage == ExecutionStage.COMPLETE
        assert result.completed_at is not None
        assert "output_artifact_id" in result.metadata

        # The LLM adapter was invoked exactly once with some prompt.
        assert mock_llm.call_count == 1
        assert mock_llm.last_prompt is not None

    def test_execute_with_failed_resolution(self, engine):
        """A missing required dependency yields a FAILED run."""
        body = "{{require:missing-dep}}"
        template = self._make_template("test", body)

        result = engine.execute(template, body, self._resolution())

        assert result.status == RunStatus.FAILED
        assert "Resolution failed" in result.error_message

    def test_idempotent_execution_skips_duplicate(self, engine, artifact_service):
        """With skip_if_exists=True, a repeated identical run is SKIPPED (FR-4.4)."""
        self._add_artifact(artifact_service, "dep", "Dependency")
        body = "{{require:dep}}"
        template = self._make_template("test", body)
        resolution = self._resolution()
        run_config = RunConfig(skip_if_exists=True)

        first = engine.execute(template, body, resolution, run_config)
        assert first.status == RunStatus.SUCCESS

        # Same inputs again.
        second = engine.execute(template, body, resolution, run_config)

        assert second.status == RunStatus.SKIPPED
        assert second.id != first.id  # Different run
        assert second.input_bundle_hash == first.input_bundle_hash  # Same hash
        assert "skipped_due_to" in second.metadata

    def test_execution_without_skip_reruns(self, engine, artifact_service):
        """With skip_if_exists=False, a repeated identical run executes again."""
        self._add_artifact(artifact_service, "dep", "Dependency")
        body = "{{require:dep}}"
        template = self._make_template("test", body)
        resolution = self._resolution()
        run_config = RunConfig(skip_if_exists=False)

        first = engine.execute(template, body, resolution, run_config)
        assert first.status == RunStatus.SUCCESS

        second = engine.execute(template, body, resolution, run_config)

        # Executed again rather than skipped.
        assert second.status == RunStatus.SUCCESS
        assert second.id != first.id

    def test_input_bundle_hash_changes_with_template(
        self, engine, artifact_service, mock_llm
    ):
        """Different template content produces a different input bundle hash."""
        self._add_artifact(artifact_service, "dep", "Dependency")

        body_v1 = "{{require:dep}} version 1"
        template_v1 = self._make_template("test", body_v1)
        resolution = self._resolution()
        first = engine.execute(template_v1, body_v1, resolution)

        body_v2 = "{{require:dep}} version 2"
        template_v2 = self._make_template("test", body_v2)
        second = engine.execute(template_v2, body_v2, resolution)

        # Hashes differ and both runs executed.
        assert first.input_bundle_hash != second.input_bundle_hash
        assert first.status == RunStatus.SUCCESS
        assert second.status == RunStatus.SUCCESS

    def test_input_bundle_hash_changes_with_dependencies(
        self, engine, artifact_service
    ):
        """Updating a dependency's content changes the input bundle hash."""
        dependency = self._add_artifact(artifact_service, "dep", "Original content")
        body = "{{require:dep}}"
        template = self._make_template("test", body)
        resolution = self._resolution()

        before = engine.execute(template, body, resolution)

        artifact_service.update_artifact_content(dependency.id, "Modified content")
        after = engine.execute(template, body, resolution)

        assert before.input_bundle_hash != after.input_bundle_hash

    def test_execution_creates_manifest(self, engine, artifact_service):
        """A run stores its manifest under ``metadata["manifest"]``."""
        self._add_artifact(artifact_service, "dep", "Dependency")
        body = "{{require:dep}}"
        template = self._make_template("test", body)

        result = engine.execute(template, body, self._resolution())

        assert "manifest" in result.metadata
        manifest = result.metadata["manifest"]
        assert manifest["run_id"] == result.id
        assert len(manifest["resolved_inputs"]) == 1
        assert len(manifest["output_artifacts"]) == 1
        assert "timing_metadata" in manifest

    def test_execution_with_llm_error(
        self, artifact_service, analyzer, resolver, compiler
    ):
        """An adapter error surfaces as a FAILED run carrying the error text."""
        # Build a dedicated engine whose adapter is an ErrorLLMAdapter.
        failing_adapter = ErrorLLMAdapter("LLM service unavailable")
        failing_engine = PromptExecutionEngine(
            artifact_service,
            analyzer,
            resolver,
            compiler,
            failing_adapter,
        )

        self._add_artifact(artifact_service, "dep", "Dependency")
        body = "{{require:dep}}"
        template = self._make_template("test", body)

        result = failing_engine.execute(template, body, self._resolution())

        assert result.status == RunStatus.FAILED
        assert "LLM service unavailable" in result.error_message

    def test_get_run_by_hash(self, engine, artifact_service):
        """A completed run can be looked up by its input bundle hash."""
        self._add_artifact(artifact_service, "dep", "Dependency")
        body = "{{require:dep}}"
        template = self._make_template("test", body)

        result = engine.execute(template, body, self._resolution())

        looked_up = engine.get_run_by_hash(result.input_bundle_hash)
        assert looked_up is not None
        assert looked_up.id == result.id

    def test_clear_cache(self, engine, artifact_service):
        """clear_cache() removes previously retrievable runs."""
        self._add_artifact(artifact_service, "dep", "Dependency")
        body = "{{require:dep}}"
        template = self._make_template("test", body)

        result = engine.execute(template, body, self._resolution())

        # Present before clearing ...
        assert engine.get_run_by_hash(result.input_bundle_hash) is not None

        engine.clear_cache()

        # ... and gone afterwards.
        assert engine.get_run_by_hash(result.input_bundle_hash) is None

View File

@@ -0,0 +1,240 @@
"""Unit tests for execution models."""
import pytest
from markitect.prompts.execution.models import (
RunConfig,
InputBundle,
LLMResponse,
PromptRun,
ExecutionStage,
RunStatus,
)
class TestRunConfig:
    """Tests covering RunConfig defaults, overrides and (de)serialization."""

    def test_create_default_config(self):
        """A RunConfig built with no arguments exposes the expected defaults."""
        defaults = RunConfig()

        expected = {
            "model_name": "gpt-4",
            "temperature": 0.7,
            "max_tokens": 2000,
            "max_depth": 3,
        }
        for attribute, value in expected.items():
            assert getattr(defaults, attribute) == value
        assert defaults.skip_if_exists is True

    def test_create_custom_config(self):
        """Explicit constructor arguments are reflected on the instance."""
        custom = RunConfig(
            model_name="gpt-3.5-turbo",
            temperature=0.5,
            max_tokens=1000,
            max_depth=5,
            skip_if_exists=False,
        )

        expected = {
            "model_name": "gpt-3.5-turbo",
            "temperature": 0.5,
            "max_tokens": 1000,
            "max_depth": 5,
        }
        for attribute, value in expected.items():
            assert getattr(custom, attribute) == value
        assert custom.skip_if_exists is False

    def test_config_to_dict(self):
        """to_dict() carries the model name and includes a temperature entry."""
        serialized = RunConfig(model_name="test-model").to_dict()

        assert serialized["model_name"] == "test-model"
        assert "temperature" in serialized

    def test_config_from_dict(self):
        """from_dict() restores the fields supplied in the mapping."""
        payload = {
            "model_name": "custom-model",
            "temperature": 0.9,
            "max_tokens": 500,
        }

        restored = RunConfig.from_dict(payload)

        assert restored.model_name == "custom-model"
        assert restored.temperature == 0.9
        assert restored.max_tokens == 500
class TestInputBundle:
    """Tests covering InputBundle construction and hash calculation."""

    def test_create_input_bundle(self):
        """Constructor arguments are stored on the bundle."""
        dependencies = {"dep1": "def456", "dep2": "ghi789"}
        bundle = InputBundle(
            template_digest="abc123",
            dependency_digests=dependencies,
            resolution_config_hash="config123",
            model_config={"model": "gpt-4", "temp": 0.7},
        )

        assert bundle.template_digest == "abc123"
        assert len(bundle.dependency_digests) == 2

    def test_calculate_hash_deterministic(self):
        """Two bundles differing only in dict insertion order hash identically."""
        forward_order = {"a": "1", "b": "2"}
        reverse_order = {"b": "2", "a": "1"}  # Different order

        left, right = (
            InputBundle(
                template_digest="abc",
                dependency_digests=digests,
                resolution_config_hash="conf",
                model_config={"model": "gpt-4"},
            )
            for digests in (forward_order, reverse_order)
        )

        assert left.calculate_hash() == right.calculate_hash()

    def test_calculate_hash_changes_with_content(self):
        """Changing only the template digest changes the hash."""
        original = InputBundle(
            template_digest="abc",
            dependency_digests={},
            resolution_config_hash="conf",
            model_config={},
        )
        altered = InputBundle(
            template_digest="xyz",  # Different template
            dependency_digests={},
            resolution_config_hash="conf",
            model_config={},
        )

        assert original.calculate_hash() != altered.calculate_hash()

    def test_to_dict_includes_hash(self):
        """to_dict() exposes the hash under ``input_bundle_hash``."""
        bundle = InputBundle(
            template_digest="abc",
            dependency_digests={},
            resolution_config_hash="conf",
            model_config={},
        )

        serialized = bundle.to_dict()

        assert "input_bundle_hash" in serialized
        assert serialized["input_bundle_hash"] == bundle.calculate_hash()
class TestLLMResponse:
    """Tests covering LLMResponse construction and serialization."""

    def test_create_response(self):
        """Constructor arguments are readable back from the response."""
        reply = LLMResponse(
            content="Generated text",
            model="gpt-4",
            usage={"total_tokens": 100},
            finish_reason="stop",
        )

        assert reply.content == "Generated text"
        assert reply.model == "gpt-4"
        assert reply.usage["total_tokens"] == 100

    def test_response_to_dict(self):
        """to_dict() carries the content and model fields."""
        serialized = LLMResponse(content="Text", model="gpt-4").to_dict()

        assert serialized["content"] == "Text"
        assert serialized["model"] == "gpt-4"
class TestPromptRun:
    """Tests covering PromptRun creation, state transitions and serialization."""

    @staticmethod
    def _new_run(bundle_hash="hash", **extra):
        """Create a PromptRun for ``template-1`` with the given hash and extras."""
        return PromptRun.create(
            template_id="template-1",
            input_bundle_hash=bundle_hash,
            **extra,
        )

    def test_create_run(self):
        """A freshly created run is pending at depth zero with an id set."""
        run = self._new_run("hash123")

        assert run.id  # Has UUID
        assert run.template_id == "template-1"
        assert run.input_bundle_hash == "hash123"
        assert run.status == RunStatus.PENDING
        assert run.stage == ExecutionStage.PENDING
        assert run.depth == 0

    def test_create_nested_run(self):
        """Parent id and depth passed to create() are stored on the run."""
        nested = self._new_run(parent_run_id="parent-123", depth=2)

        assert nested.parent_run_id == "parent-123"
        assert nested.depth == 2

    def test_advance_stage(self):
        """advance_stage() moves the stage and leaves the run RUNNING."""
        run = self._new_run()
        assert run.stage == ExecutionStage.PENDING

        for target in (ExecutionStage.ANALYSIS, ExecutionStage.PROCESSING):
            run.advance_stage(target)
            assert run.stage == target

        assert run.status == RunStatus.RUNNING

    def test_mark_complete(self):
        """mark_complete() sets SUCCESS/COMPLETE and a completion timestamp."""
        run = self._new_run()

        run.mark_complete()

        assert run.status == RunStatus.SUCCESS
        assert run.stage == ExecutionStage.COMPLETE
        assert run.completed_at is not None
        assert run.is_complete()

    def test_mark_failed(self):
        """mark_failed() sets FAILED state, the message and a timestamp."""
        run = self._new_run()

        run.mark_failed("Error message")

        assert run.status == RunStatus.FAILED
        assert run.stage == ExecutionStage.FAILED
        assert run.error_message == "Error message"
        assert run.completed_at is not None
        assert run.is_complete()

    def test_mark_skipped(self):
        """mark_skipped() sets SKIPPED status and a completion timestamp."""
        run = self._new_run()

        run.mark_skipped()

        assert run.status == RunStatus.SKIPPED
        assert run.completed_at is not None
        assert run.is_complete()

    def test_run_to_dict(self):
        """to_dict() exposes id, template, hash, and lowercase status/stage."""
        run = self._new_run()

        serialized = run.to_dict()

        expected = {
            "id": run.id,
            "template_id": "template-1",
            "input_bundle_hash": "hash",
            "status": "pending",
            "stage": "pending",
        }
        for key, value in expected.items():
            assert serialized[key] == value