Files
markitect-main/markitect/prompts/execution/batch.py
tegwick d1f57272a4 feat(example): add L2 classifications for 823/988 WoN entities (S3.4)
Batch classification via OpenRouter (claude-sonnet-4). 165 entities
remain unclassified due to credit exhaustion; incremental skip means
a follow-up run will complete them automatically.

Type × VSM matrix (823 entities):
                  S1   S2   S3  S3*   S4   S5
  Element         86   75   58   21   43   32  (315 total, 38%)
  Process         39   42   37   17   67   24  (226 total, 28%)
  Institution      4   12   30   24    .   52  (122 total, 15%)
  Principle        3    7   15    2   43   32  (102 total, 12%)
  Relation         2   14    5    5   22   10   (58 total,  7%)
  Matrix fill: 29/30 cells (Institution/S4 empty — expected)

Metrics updated: type_entropy=2.0936, vsm_type_matrix_cells=29

Also:
- BatchEvaluator gains delay_seconds param for rate-limited providers
- classify CLI gains --rpm option (--rpm 10 for Gemini free tier)
- history.write_metrics_file now handles non-float metric values
  (type_distribution is a dict, was crashing round())
- run_entity_classification forwards delay_seconds to BatchEvaluator
- classify-links and graph commands added by user (entities --by-type,
  graph --format mermaid/dot, classify-links for Relation enrichment)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-23 12:49:11 +01:00

175 lines
5.7 KiB
Python

"""
Batch LLM evaluation orchestrator.
Runs an evaluation prompt against a batch of items (entities, pairs,
etc.), collecting structured results. Handles:
- Incremental evaluation (skip items whose content hasn't changed)
- Progress reporting via callback
- Graceful error handling per item (one failure doesn't stop the batch)
- Aggregate token usage tracking
This is the mechanism by which infospace tooling delegates LLM work
to the platform. The adapter's own retry logic handles transient
API errors (rate limits, 5xx).
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional
from markitect.prompts.execution.llm_adapter import LLMAdapter
from markitect.prompts.execution.models import LLMResponse, RunConfig
@dataclass
class BatchItem:
    """One unit of work submitted to a batch evaluation.

    Attributes:
        key: Unique identifier for the item (e.g. an entity slug).
        prompt: Fully compiled prompt text that is sent to the LLM.
        content_digest: Hash of the source content. It is compared with
            the digest recorded by a previous run to decide whether the
            item can be skipped. Defaults to the empty string.
        metadata: Free-form data carried along with the item and handed
            back on its result.
    """

    # Required identity and payload.
    key: str
    prompt: str

    # Optional bookkeeping; each instance gets its own metadata dict.
    content_digest: str = ""
    metadata: Dict[str, Any] = field(default_factory=dict)
@dataclass
class BatchResult:
    """Outcome recorded for one item of a batch.

    Attributes:
        key: Same value as the originating :attr:`BatchItem.key`.
        status: One of ``"success"``, ``"error"``, ``"skipped"``.
        response: The LLM response; ``None`` when the item was skipped
            or failed.
        error: Error message; ``None`` when the item succeeded or was
            skipped.
        metadata: Metadata handed through from the input item.
    """

    # Identity and outcome are always supplied by the producer.
    key: str
    status: str

    # Exactly which of these is populated depends on ``status``.
    response: Optional[LLMResponse] = None
    error: Optional[str] = None
    metadata: Dict[str, Any] = field(default_factory=dict)
@dataclass
class BatchSummary:
    """Roll-up of counters and per-item results for one batch run."""

    # Item counters.
    total: int = 0
    succeeded: int = 0
    failed: int = 0
    skipped: int = 0

    # Per-item results, in the order they were appended.
    results: List[BatchResult] = field(default_factory=list)

    # Token counters accumulated across the run.
    total_prompt_tokens: int = 0
    total_completion_tokens: int = 0

    @property
    def total_tokens(self) -> int:
        """Sum of the prompt and completion token counters."""
        prompt_count = self.total_prompt_tokens
        completion_count = self.total_completion_tokens
        return prompt_count + completion_count

    def success_rate(self) -> float:
        """Fraction of non-skipped items that succeeded.

        Returns ``1.0`` when every item was skipped (or the batch was
        empty), so there was nothing to attempt.
        """
        attempted = self.total - self.skipped
        return self.succeeded / attempted if attempted != 0 else 1.0
class BatchEvaluator:
    """Orchestrates LLM evaluation across a batch of items.

    Items are processed one at a time, in the order given.

    Args:
        adapter: The LLM adapter to use for evaluation.
        config: Run configuration (model, temperature, etc.). A default
            ``RunConfig()`` is used when omitted.
        progress_callback: Optional ``fn(completed, total, result)``
            called after each item is processed, whatever its status.
        previous_digests: Optional ``{key: digest}`` mapping from a
            previous run. Items whose digest matches are skipped.
        delay_seconds: Pause inserted between consecutive LLM calls,
            for rate-limited providers. Values ``<= 0`` disable it.
            Skipped items make no call and therefore cost no pause.
    """

    def __init__(
        self,
        adapter: LLMAdapter,
        config: Optional[RunConfig] = None,
        progress_callback: Optional[Callable[[int, int, BatchResult], None]] = None,
        previous_digests: Optional[Dict[str, str]] = None,
        delay_seconds: float = 0.0,
    ):
        self._adapter = adapter
        self._config = config or RunConfig()
        self._progress_callback = progress_callback
        self._previous_digests = previous_digests or {}
        self._delay_seconds = delay_seconds

    def evaluate(self, items: List[BatchItem]) -> BatchSummary:
        """Run evaluation for all items and return aggregate results.

        Items whose :attr:`~BatchItem.content_digest` matches an entry
        in *previous_digests* are skipped. All other items are sent to
        the LLM adapter. Errors on individual items are captured
        without aborting the batch.

        Returns:
            A :class:`BatchSummary` with one result per input item (in
            input order), status counters and token totals.
        """
        import time as _time

        total = len(items)
        summary = BatchSummary(total=total)
        adapter_called = False

        for idx, item in enumerate(items):
            needs_call = not self._is_unchanged(item)
            # Pace only real adapter calls. Sleeping before skipped items
            # would make an incremental re-run over mostly unchanged
            # items wait ``delay_seconds`` per item for no API traffic.
            if needs_call and adapter_called and self._delay_seconds > 0:
                _time.sleep(self._delay_seconds)

            result = self._evaluate_one(item)
            adapter_called = adapter_called or needs_call
            summary.results.append(result)

            if result.status == "success":
                summary.succeeded += 1
                # NOTE(review): ``usage`` is assumed to be a dict-like with
                # ``prompt_tokens`` / ``completion_tokens`` keys — confirm
                # against LLMResponse. A missing/None usage or a None
                # counter is counted as zero instead of crashing the batch.
                usage = getattr(result.response, "usage", None) or {}
                summary.total_prompt_tokens += usage.get("prompt_tokens") or 0
                summary.total_completion_tokens += usage.get("completion_tokens") or 0
            elif result.status == "skipped":
                summary.skipped += 1
            else:
                summary.failed += 1

            if self._progress_callback is not None:
                self._progress_callback(idx + 1, total, result)

        return summary

    def _is_unchanged(self, item: BatchItem) -> bool:
        """Return True when *item* matches the digest of a previous run."""
        digest = item.content_digest
        # An empty digest means "no digest available": never skip on it.
        return bool(digest) and self._previous_digests.get(item.key) == digest

    def _evaluate_one(self, item: BatchItem) -> BatchResult:
        """Evaluate a single item, handling skip logic and errors.

        Returns a ``"skipped"`` result when the item's digest is
        unchanged, a ``"success"`` result carrying the adapter response,
        or an ``"error"`` result carrying the exception text. Exceptions
        raised by the adapter are never propagated.
        """
        # Incremental: skip if digest unchanged
        if self._is_unchanged(item):
            return BatchResult(
                key=item.key,
                status="skipped",
                metadata=item.metadata,
            )
        try:
            response = self._adapter.execute_prompt(item.prompt, self._config)
        except Exception as exc:
            # Deliberately broad: one failing item must not stop the batch.
            # Fall back to repr() so exceptions with an empty message still
            # yield a non-empty (truthy) error string.
            return BatchResult(
                key=item.key,
                status="error",
                error=str(exc) or repr(exc),
                metadata=item.metadata,
            )
        return BatchResult(
            key=item.key,
            status="success",
            response=response,
            metadata=item.metadata,
        )