markitect-main/markitect/prompts/execution/batch.py

"""
Batch LLM evaluation orchestrator.

Runs an evaluation prompt against a batch of items (entities, pairs,
etc.), collecting structured results.  Handles:

- Incremental evaluation (skip items whose content hasn't changed)
- Progress reporting via callback
- Graceful error handling per item (one failure doesn't stop the batch)
- Aggregate token usage tracking

This is the mechanism by which infospace tooling delegates LLM work
to the platform.  The adapter's own retry logic handles transient
API errors (rate limits, 5xx).
"""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional

from markitect.prompts.execution.llm_adapter import LLMAdapter
from markitect.prompts.execution.models import LLMResponse, RunConfig


@dataclass
class BatchItem:
    """One unit of work submitted to a batch evaluation run.

    Attributes:
        key: Unique identifier for the item (e.g. an entity slug).
        prompt: Fully compiled prompt text that is sent to the LLM.
        content_digest: Hash of the source content.  Compared against
            the digest recorded by a previous run to decide whether the
            item can be skipped.  Defaults to the empty string.
        metadata: Arbitrary caller-supplied data carried along with the
            item.  Each instance gets its own empty dict by default.
    """

    # Required fields come first; every default is declared via ``field``
    # so the optional fields read uniformly.
    key: str
    prompt: str
    content_digest: str = field(default="")
    metadata: Dict[str, Any] = field(default_factory=dict)

@dataclass
class BatchResult:
    """Outcome recorded for one item of a batch.

    Attributes:
        key: Same value as the originating :attr:`BatchItem.key`.
        status: ``"success"``, ``"error"`` or ``"skipped"``.
        response: LLM response for a successful item; ``None`` when the
            item was skipped or failed.
        error: Error message for a failed item; ``None`` when the item
            succeeded or was skipped.
        metadata: Metadata passed through from the input item.
    """

    key: str
    status: str
    # Optional parts of the outcome; which one is populated depends on
    # ``status``.
    response: Optional[LLMResponse] = field(default=None)
    error: Optional[str] = field(default=None)
    metadata: Dict[str, Any] = field(default_factory=dict)


@dataclass
class BatchSummary:
    """Counters, per-item results and token totals for one batch run."""

    # Item counters.
    total: int = 0
    succeeded: int = 0
    failed: int = 0
    skipped: int = 0
    # Per-item outcomes, one entry per processed item.
    results: List[BatchResult] = field(default_factory=list)
    # Token usage accumulated over the run.
    total_prompt_tokens: int = 0
    total_completion_tokens: int = 0

    @property
    def total_tokens(self) -> int:
        """Sum of the prompt and completion token totals."""
        prompt_tokens = self.total_prompt_tokens
        completion_tokens = self.total_completion_tokens
        return prompt_tokens + completion_tokens

    def success_rate(self) -> float:
        """Return the share of attempted (non-skipped) items that succeeded.

        When no item was attempted the rate is reported as ``1.0``.
        """
        attempted = self.total - self.skipped
        return self.succeeded / attempted if attempted != 0 else 1.0


class BatchEvaluator:
    """Orchestrates LLM evaluation across a batch of items.

    Args:
        adapter: The LLM adapter to use for evaluation.
        config: Run configuration (model, temperature, etc.).  A default
            ``RunConfig()`` is used when omitted.
        progress_callback: Optional ``fn(completed, total, result)``
            called after each item is processed, whatever its status.
        previous_digests: Optional ``{key: digest}`` mapping from a
            previous run.  Items whose digest matches are skipped.
        delay_seconds: Pause, in seconds, inserted between consecutive
            adapter calls.  Values ``<= 0`` disable the pause.  Skipped
            items make no adapter call and therefore never cause a
            pause.
    """

    def __init__(
        self,
        adapter: LLMAdapter,
        config: Optional[RunConfig] = None,
        progress_callback: Optional[Callable[[int, int, BatchResult], None]] = None,
        previous_digests: Optional[Dict[str, str]] = None,
        delay_seconds: float = 0.0,
    ) -> None:
        self._adapter = adapter
        self._config = config or RunConfig()
        self._progress_callback = progress_callback
        self._previous_digests = previous_digests or {}
        self._delay_seconds = delay_seconds

    def evaluate(self, items: List[BatchItem]) -> BatchSummary:
        """Run evaluation for all items and return aggregate results.

        Items whose :attr:`~BatchItem.content_digest` matches an entry
        in *previous_digests* are skipped.  All other items are sent to
        the LLM adapter, with ``delay_seconds`` slept between
        consecutive adapter calls.  Errors on individual items are
        captured without aborting the batch.

        Args:
            items: The items to evaluate, processed in order.

        Returns:
            A :class:`BatchSummary` holding one :class:`BatchResult` per
            item (in input order), the status counters and the token
            totals of the successful items.
        """
        import time as _time

        total = len(items)
        summary = BatchSummary(total=total)
        throttle = self._delay_seconds > 0
        adapter_called = False

        for completed, item in enumerate(items, start=1):
            # Only pause between real adapter calls: sleeping for skipped
            # items would slow incremental runs without protecting the API.
            needs_call = not self._is_unchanged(item)
            if throttle and needs_call and adapter_called:
                _time.sleep(self._delay_seconds)

            result = self._evaluate_one(item)
            adapter_called = adapter_called or needs_call
            summary.results.append(result)

            if result.status == "success":
                summary.succeeded += 1
                # Tolerate a missing usage mapping or ``None`` token counts.
                usage = (result.response.usage if result.response else None) or {}
                summary.total_prompt_tokens += usage.get("prompt_tokens", 0) or 0
                summary.total_completion_tokens += (
                    usage.get("completion_tokens", 0) or 0
                )
            elif result.status == "skipped":
                summary.skipped += 1
            else:
                summary.failed += 1

            if self._progress_callback is not None:
                self._progress_callback(completed, total, result)

        return summary

    def _is_unchanged(self, item: BatchItem) -> bool:
        """Return ``True`` if *item* matches the digest of a previous run.

        An item with an empty digest is never considered unchanged.
        """
        if not item.content_digest:
            return False
        if item.key not in self._previous_digests:
            return False
        return self._previous_digests[item.key] == item.content_digest

    def _evaluate_one(self, item: BatchItem) -> BatchResult:
        """Evaluate a single item, handling skip logic and errors.

        Returns:
            A result with status ``"skipped"`` when the digest is
            unchanged, ``"success"`` when the adapter returned a
            response, or ``"error"`` when the adapter raised.
        """
        # Incremental: skip if digest unchanged
        if self._is_unchanged(item):
            return BatchResult(
                key=item.key,
                status="skipped",
                metadata=item.metadata,
            )

        try:
            response = self._adapter.execute_prompt(item.prompt, self._config)
        except Exception as exc:
            # One failing item must not abort the batch.  Fall back to the
            # exception type name so a message-less exception still yields
            # a non-empty error string.
            return BatchResult(
                key=item.key,
                status="error",
                error=str(exc) or type(exc).__name__,
                metadata=item.metadata,
            )

        return BatchResult(
            key=item.key,
            status="success",
            response=response,
            metadata=item.metadata,
        )