feat(prompts): add batch LLM evaluation orchestrator (S1.6)

BatchEvaluator runs evaluation prompts across item batches with incremental evaluation (skipping items whose content digest is unchanged), per-item error isolation, progress callbacks, and aggregate token usage tracking.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-19 01:40:13 +01:00
parent dc22017b7c
commit 144a88c0c2
2 changed files with 449 additions and 0 deletions
--- a/markitect/prompts/execution/batch.py
+++ b/markitect/prompts/execution/batch.py
@@ -0,0 +1,168 @@
+"""
+Batch LLM evaluation orchestrator.
+
+Runs an evaluation prompt against a batch of items (entities, pairs,
+etc.), collecting structured results.  Handles:
+
+- Incremental evaluation (skip items whose content hasn't changed)
+- Progress reporting via callback
+- Graceful error handling per item (one failure doesn't stop the batch)
+- Aggregate token usage tracking
+
+This is the mechanism by which infospace tooling delegates LLM work
+to the platform.  The adapter's own retry logic handles transient
+API errors (rate limits, 5xx).
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import Any, Callable, Dict, List, Optional
+
+from markitect.prompts.execution.llm_adapter import LLMAdapter
+from markitect.prompts.execution.models import LLMResponse, RunConfig
+
+
+@dataclass
+class BatchItem:
+    """One unit of work handed to :class:`BatchEvaluator`.
+
+    Attributes:
+        key: Identifier (e.g. entity slug); also the previous-digest lookup key.
+        prompt: The compiled prompt text to send to the LLM.
+        content_digest: Hash of the source content, compared with the previous
+            run's digest; an empty digest (the default) is never skipped.
+        metadata: Arbitrary pass-through metadata, handed to the result as-is.
+    """
+
+    key: str
+    prompt: str
+    content_digest: str = ""
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class BatchResult:
+    """Outcome of processing one :class:`BatchItem`.
+
+    Attributes:
+        key: Matches the input :attr:`BatchItem.key`.
+        status: One of ``"success"``, ``"error"``, ``"skipped"``.
+        response: The LLM response (``None`` if skipped or error).
+        error: Message from the raised exception (``None`` if success or skipped).
+        metadata: Pass-through metadata; the evaluator stores the item's dict as-is.
+    """
+
+    key: str
+    status: str
+    response: Optional[LLMResponse] = None
+    error: Optional[str] = None
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+
+@dataclass
+class BatchSummary:
+    """Counters and per-item results accumulated over one batch run."""
+
+    total: int = 0
+    succeeded: int = 0
+    failed: int = 0
+    skipped: int = 0
+    results: List[BatchResult] = field(default_factory=list)
+    total_prompt_tokens: int = 0
+    total_completion_tokens: int = 0
+
+    @property
+    def total_tokens(self) -> int:
+        """Sum of the prompt and completion token counters."""
+        return self.total_prompt_tokens + self.total_completion_tokens
+
+    def success_rate(self) -> float:
+        """Return the share of attempted (non-skipped) items that succeeded."""
+        ran = self.total - self.skipped
+        # Nothing attempted (empty or all-skipped batch) reports 1.0.
+        return self.succeeded / ran if ran else 1.0
+
+
+class BatchEvaluator:
+    """Orchestrates LLM evaluation across a batch of items.
+
+    Args:
+        adapter: The LLM adapter to use for evaluation.
+        config: Run configuration; a default ``RunConfig()`` when omitted.
+        progress_callback: Optional ``fn(completed, total, result)``
+            called after each item; exceptions it raises propagate.
+        previous_digests: Optional ``{key: digest}`` mapping from a
+            previous run.  Items whose digest matches are skipped.
+    """
+
+    def __init__(
+        self,
+        adapter: LLMAdapter,
+        config: Optional[RunConfig] = None,
+        progress_callback: Optional[Callable[[int, int, BatchResult], None]] = None,
+        previous_digests: Optional[Dict[str, str]] = None,
+    ):
+        self._adapter = adapter
+        self._config = config or RunConfig()
+        self._progress_callback = progress_callback
+        self._previous_digests = previous_digests or {}
+
+    def evaluate(self, items: List[BatchItem]) -> BatchSummary:
+        """Run evaluation for all items and return aggregate results.
+
+        Items whose :attr:`~BatchItem.content_digest` matches an entry
+        in *previous_digests* are skipped.  Adapter errors on individual
+        items are recorded as ``"error"`` results without aborting the
+        batch.  Token totals count successful responses only.
+        """
+        total = len(items)
+        summary = BatchSummary(total=total)
+        for done, item in enumerate(items, start=1):
+            result = self._evaluate_one(item)
+            summary.results.append(result)
+
+            if result.status == "success":
+                summary.succeeded += 1
+                # None usage/counts tally as 0: accounting must not abort the batch.
+                usage = (result.response.usage if result.response else None) or {}
+                summary.total_prompt_tokens += usage.get("prompt_tokens") or 0
+                summary.total_completion_tokens += usage.get("completion_tokens") or 0
+            elif result.status == "skipped":
+                summary.skipped += 1
+            else:
+                summary.failed += 1
+
+            if self._progress_callback is not None:
+                self._progress_callback(done, total, result)
+
+        return summary
+
+    def _evaluate_one(self, item: BatchItem) -> BatchResult:
+        """Evaluate a single item, handling skip logic and errors."""
+        # Incremental: skip when a non-empty digest equals the previous run's.
+        if (
+            item.content_digest
+            and self._previous_digests.get(item.key) == item.content_digest
+        ):
+            return BatchResult(
+                key=item.key,
+                status="skipped",
+                metadata=item.metadata,
+            )
+
+        try:
+            response = self._adapter.execute_prompt(item.prompt, self._config)
+        except Exception as exc:  # per-item isolation: record, don't raise
+            return BatchResult(
+                key=item.key,
+                status="error",
+                error=str(exc) or repr(exc),
+                metadata=item.metadata,
+            )
+        return BatchResult(
+            key=item.key,
+            status="success",
+            response=response,
+            metadata=item.metadata,
+        )
--- a/tests/unit/prompts/test_batch_evaluator.py
+++ b/tests/unit/prompts/test_batch_evaluator.py
@@ -0,0 +1,281 @@
+"""Tests for markitect.prompts.execution.batch."""
+
+import pytest
+
+from markitect.prompts.execution.batch import (
+    BatchEvaluator,
+    BatchItem,
+    BatchResult,
+    BatchSummary,
+)
+from markitect.prompts.execution.llm_adapter import MockLLMAdapter, ErrorLLMAdapter
+from markitect.prompts.execution.models import RunConfig, LLMResponse
+
+
+# ── Helpers ──────────────────────────────────────────────────────────
+
+
+def _items(n=3, digest_prefix="d"):
+    """Build *n* items keyed ``entity-<i>`` with digests ``<prefix><i>``."""
+
+    def make(idx):
+        return BatchItem(
+            key=f"entity-{idx}", prompt=f"Evaluate entity {idx}",
+            content_digest=f"{digest_prefix}{idx}", metadata={"index": idx},
+        )
+
+    return [make(idx) for idx in range(n)]
+
+
+# ── BatchItem / BatchResult / BatchSummary ───────────────────────────
+
+
+class TestBatchModels:
+    """Defaults and derived values of the plain batch dataclasses."""
+
+    def test_batch_item_defaults(self):
+        blank = BatchItem(key="slug", prompt="text")
+        assert (blank.content_digest, blank.metadata) == ("", {})
+
+    def test_batch_result_defaults(self):
+        outcome = BatchResult(key="slug", status="success")
+        assert outcome.response is None and outcome.error is None
+
+    def test_summary_total_tokens(self):
+        tally = BatchSummary(total_prompt_tokens=100, total_completion_tokens=50)
+        assert tally.total_tokens == 150
+
+    def test_summary_success_rate_all_success(self):
+        tally = BatchSummary(total=3, succeeded=3)
+        assert tally.success_rate() == 1.0
+
+    def test_summary_success_rate_with_failures(self):
+        tally = BatchSummary(total=4, succeeded=2, failed=2)
+        assert tally.success_rate() == pytest.approx(0.5)
+
+    def test_summary_success_rate_all_skipped(self):
+        tally = BatchSummary(total=3, skipped=3)
+        assert tally.success_rate() == 1.0
+
+    def test_summary_success_rate_mixed(self):
+        # Skipped items are excluded: 3 attempted, 2 succeeded.
+        tally = BatchSummary(total=5, succeeded=2, failed=1, skipped=2)
+        assert tally.success_rate() == pytest.approx(2 / 3)
+
+
+# ── BatchEvaluator ──────────────────────────────────────────────────
+
+
+class TestBatchEvaluator:
+    """Happy-path behaviour of :meth:`BatchEvaluator.evaluate`."""
+
+    def test_evaluate_all_items(self):
+        llm = MockLLMAdapter("result")
+        report = BatchEvaluator(llm).evaluate(_items(3))
+
+        assert report.total == 3
+        assert report.succeeded == 3
+        assert report.failed == 0
+        assert report.skipped == 0
+        assert len(report.results) == 3
+        assert llm.call_count == 3
+
+    def test_results_preserve_keys(self):
+        llm = MockLLMAdapter("ok")
+        report = BatchEvaluator(llm).evaluate(_items(2))
+
+        # Results come back in input order.
+        seen = [entry.key for entry in report.results]
+        assert seen == ["entity-0", "entity-1"]
+
+    def test_results_preserve_metadata(self):
+        llm = MockLLMAdapter("ok")
+        report = BatchEvaluator(llm).evaluate(_items(1))
+
+        first = report.results[0]
+        assert first.metadata == {"index": 0}
+
+    def test_response_content_available(self):
+        llm = MockLLMAdapter("evaluated text")
+        report = BatchEvaluator(llm).evaluate(_items(1))
+        reply = report.results[0].response
+        assert reply.content == "evaluated text"
+
+    def test_token_usage_aggregated(self):
+        llm = MockLLMAdapter("result")
+        report = BatchEvaluator(llm).evaluate(_items(3))
+        prompt_used = report.total_prompt_tokens
+        completion_used = report.total_completion_tokens
+        assert prompt_used > 0 and completion_used > 0
+        assert report.total_tokens == prompt_used + completion_used
+
+    def test_config_passed_to_adapter(self):
+        llm = MockLLMAdapter("ok")
+        custom = RunConfig(temperature=0.1, max_tokens=500)
+        BatchEvaluator(llm, config=custom).evaluate(_items(1))
+        received = llm.last_config
+        assert received.temperature == 0.1
+        assert received.max_tokens == 500
+
+
+# ── Incremental evaluation ──────────────────────────────────────────
+
+
+class TestIncrementalEvaluation:
+    def test_skip_unchanged_items(self):
+        """Every digest matches the previous run, so the adapter is idle."""
+        llm = MockLLMAdapter("result")
+        seen_before = {f"entity-{i}": f"d{i}" for i in range(3)}
+        runner = BatchEvaluator(llm, previous_digests=seen_before)
+        report = runner.evaluate(_items(3))
+        assert report.skipped == 3
+        assert report.succeeded == 0
+        assert llm.call_count == 0
+
+    def test_evaluate_changed_items(self):
+        """Only entity-0 has a matching digest; the other two run."""
+        llm = MockLLMAdapter("result")
+        seen_before = {"entity-0": "d0"}
+        runner = BatchEvaluator(llm, previous_digests=seen_before)
+
+        report = runner.evaluate(_items(3))
+        assert report.skipped == 1
+        assert report.succeeded == 2
+        assert llm.call_count == 2
+
+    def test_evaluate_new_items(self):
+        """Keys absent from the previous run are always evaluated."""
+        llm = MockLLMAdapter("result")
+        seen_before = {"old-entity": "old-digest"}
+        runner = BatchEvaluator(llm, previous_digests=seen_before)
+
+        report = runner.evaluate(_items(2))
+        assert report.skipped == 0
+        assert report.succeeded == 2
+
+    def test_changed_digest_not_skipped(self):
+        """A known key whose digest differs is evaluated again."""
+        llm = MockLLMAdapter("result")
+        seen_before = {"entity-0": "old-digest"}
+        runner = BatchEvaluator(llm, previous_digests=seen_before)
+
+        report = runner.evaluate(_items(1))
+        assert report.skipped == 0
+        assert report.succeeded == 1
+
+    def test_empty_digest_not_skipped(self):
+        """An item without a digest is never skipped, even for a known key."""
+        llm = MockLLMAdapter("result")
+        runner = BatchEvaluator(llm, previous_digests={"entity-0": "d0"})
+
+        undigested = BatchItem(key="entity-0", prompt="eval", content_digest="")
+        report = runner.evaluate([undigested])
+        assert report.skipped == 0
+        assert report.succeeded == 1
+
+    def test_skipped_status_in_result(self):
+        llm = MockLLMAdapter("result")
+        runner = BatchEvaluator(llm, previous_digests={"entity-0": "d0"})
+
+        report = runner.evaluate(_items(1))
+        first = report.results[0]
+        assert first.status == "skipped"
+        assert first.response is None
+
+
+# ── Error handling ──────────────────────────────────────────────────
+
+
+class TestBatchErrorHandling:
+    def test_error_captured_not_raised(self):
+        """Adapter exceptions become failed results, not raised errors."""
+        runner = BatchEvaluator(ErrorLLMAdapter("kaboom"))
+
+        report = runner.evaluate(_items(2))
+        assert report.failed == 2
+        assert report.succeeded == 0
+
+    def test_error_message_in_result(self):
+        runner = BatchEvaluator(ErrorLLMAdapter("something went wrong"))
+
+        report = runner.evaluate(_items(1))
+        failure = report.results[0]
+        assert failure.status == "error"
+        assert "something went wrong" in failure.error
+
+    def test_error_does_not_stop_batch(self):
+        """One failing item doesn't prevent others from running."""
+        attempts = []
+
+        class FailOnFirstAdapter(MockLLMAdapter):
+            def execute_prompt(self, prompt, config):
+                attempts.append(prompt)
+                if len(attempts) == 1:
+                    # Raise before delegating, on the first call only.
+                    raise RuntimeError("first fails")
+                return super().execute_prompt(prompt, config)
+
+        runner = BatchEvaluator(FailOnFirstAdapter("ok"))
+        report = runner.evaluate(_items(3))
+        statuses = [entry.status for entry in report.results]
+
+        assert report.failed == 1
+        assert report.succeeded == 2
+        assert statuses[0] == "error"
+        assert statuses[1] == "success"
+        assert statuses[2] == "success"
+
+
+# ── Progress callback ───────────────────────────────────────────────
+
+
+class TestProgressCallback:
+    def test_callback_called_for_each_item(self):
+        """The callback sees (completed, total, result) once per item."""
+        ticks = []
+
+        def record(done, total, result):
+            ticks.append((done, total, result.key))
+
+        runner = BatchEvaluator(MockLLMAdapter("ok"), progress_callback=record)
+        runner.evaluate(_items(3))
+
+        # Completed count is 1-based and the total stays fixed.
+        assert len(ticks) == 3
+        assert ticks[0] == (1, 3, "entity-0")
+        assert ticks[1] == (2, 3, "entity-1")
+        assert ticks[2] == (3, 3, "entity-2")
+
+    def test_callback_receives_result(self):
+        received = []
+        runner = BatchEvaluator(
+            MockLLMAdapter("ok"),
+            progress_callback=lambda done, total, result: received.append(result),
+        )
+        runner.evaluate(_items(2))
+
+        for entry in received:
+            assert isinstance(entry, BatchResult)
+        assert received[0].status == "success"
+
+    def test_no_callback_no_error(self):
+        """Omitting the callback is supported."""
+        runner = BatchEvaluator(MockLLMAdapter("ok"))
+
+        report = runner.evaluate(_items(1))
+        assert report.succeeded == 1
+
+
+# ── Empty batch ─────────────────────────────────────────────────────
+
+
+class TestEmptyBatch:
+    def test_empty_items(self):
+        """An empty batch yields an empty summary and no adapter calls."""
+        llm = MockLLMAdapter("ok")
+        report = BatchEvaluator(llm).evaluate([])
+
+        assert report.total == 0
+        assert report.succeeded == 0
+        assert report.results == []
+        assert llm.call_count == 0