Batch classification via OpenRouter (claude-sonnet-4). 165 entities
remain unclassified due to credit exhaustion; incremental skip means
a follow-up run will complete them automatically.
Type × VSM matrix (823 entities):
              S1   S2   S3  S3*   S4   S5
Element       86   75   58   21   43   32   (315 total, 38%)
Process       39   42   37   17   67   24   (226 total, 28%)
Institution    4   12   30   24    .   52   (122 total, 15%)
Principle      3    7   15    2   43   32   (102 total, 12%)
Relation       2   14    5    5   22   10   (58 total, 7%)
Matrix fill: 29/30 cells (Institution/S4 empty — expected)
Metrics updated: type_entropy=2.0936, vsm_type_matrix_cells=29
Also:
- BatchEvaluator gains delay_seconds param for rate-limited providers
- classify CLI gains --rpm option (--rpm 10 for Gemini free tier)
- history.write_metrics_file now handles non-float metric values
  (type_distribution is a dict, which previously crashed round())
- run_entity_classification forwards delay_seconds to BatchEvaluator
- classify-links and graph commands added by user (entities --by-type,
graph --format mermaid/dot, classify-links for Relation enrichment)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
175 lines
5.7 KiB
Python
175 lines
5.7 KiB
Python
"""
Batch LLM evaluation orchestrator.

Runs an evaluation prompt against a batch of items (entities, pairs,
etc.), collecting structured results.  Handles:

- Incremental evaluation (skip items whose content hasn't changed)
- Progress reporting via callback
- Graceful error handling per item (one failure doesn't stop the batch)
- Aggregate token usage tracking

This is the mechanism by which infospace tooling delegates LLM work
to the platform.  The adapter's own retry logic handles transient
API errors (rate limits, 5xx).
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from dataclasses import dataclass, field
|
|
from typing import Any, Callable, Dict, List, Optional
|
|
|
|
from markitect.prompts.execution.llm_adapter import LLMAdapter
|
|
from markitect.prompts.execution.models import LLMResponse, RunConfig
|
|
|
|
|
|
@dataclass
class BatchItem:
    """A single item to evaluate in a batch.

    Attributes:
        key: Unique identifier (e.g. entity slug).
        prompt: The compiled prompt text to send to the LLM.
        content_digest: Hash of the source content, used for
            incremental evaluation (skip if unchanged).  The default
            empty string means "no digest", so the item is never
            skipped.
        metadata: Arbitrary pass-through metadata.
    """

    key: str
    prompt: str
    content_digest: str = ""
    # default_factory gives every item its own dict instead of a shared one.
    metadata: Dict[str, Any] = field(default_factory=dict)
|
|
|
|
|
|
@dataclass
class BatchResult:
    """Result for a single batch item.

    Attributes:
        key: Matches the input :attr:`BatchItem.key`.
        status: One of ``"success"``, ``"error"``, ``"skipped"``.
        response: The LLM response (``None`` if skipped or error).
        error: Error message (``None`` if success or skipped).
        metadata: Pass-through metadata from the input item.
    """

    key: str
    status: str
    response: Optional[LLMResponse] = None
    error: Optional[str] = None
    # default_factory gives every result its own dict instead of a shared one.
    metadata: Dict[str, Any] = field(default_factory=dict)
|
|
|
|
|
|
@dataclass
class BatchSummary:
    """Aggregate results from a batch evaluation run.

    Attributes:
        total: Number of items in the batch.
        succeeded: Count of items with status ``"success"``.
        failed: Count of items that ended in an error.
        skipped: Count of items with status ``"skipped"``.
        results: Per-item results, in processing order.
        total_prompt_tokens: Prompt tokens summed over the batch.
        total_completion_tokens: Completion tokens summed over the batch.
    """

    total: int = 0
    succeeded: int = 0
    failed: int = 0
    skipped: int = 0
    results: List[BatchResult] = field(default_factory=list)
    total_prompt_tokens: int = 0
    total_completion_tokens: int = 0

    @property
    def total_tokens(self) -> int:
        """Prompt plus completion tokens."""
        return self.total_prompt_tokens + self.total_completion_tokens

    def success_rate(self) -> float:
        """Fraction of non-skipped items that succeeded.

        Returns ``1.0`` when nothing was attempted (empty batch or
        everything skipped), so an all-skipped run is not reported as
        a failure.
        """
        attempted = self.total - self.skipped
        if attempted == 0:
            return 1.0
        return self.succeeded / attempted
|
|
|
|
|
|
class BatchEvaluator:
    """Orchestrates LLM evaluation across a batch of items.

    Args:
        adapter: The LLM adapter to use for evaluation.
        config: Run configuration (model, temperature, etc.).  A default
            ``RunConfig()`` is used when omitted.
        progress_callback: Optional ``fn(completed, total, result)``
            called after each item is processed.
        previous_digests: Optional ``{key: digest}`` mapping from a
            previous run.  Items whose digest matches are skipped.
        delay_seconds: Pause inserted between consecutive adapter calls,
            for rate-limited providers.  ``0`` (the default) disables
            it.  Skipped items make no adapter call and therefore never
            trigger a pause.
    """

    def __init__(
        self,
        adapter: LLMAdapter,
        config: Optional[RunConfig] = None,
        progress_callback: Optional[Callable[[int, int, BatchResult], None]] = None,
        previous_digests: Optional[Dict[str, str]] = None,
        delay_seconds: float = 0.0,
    ):
        self._adapter = adapter
        self._config = config or RunConfig()
        self._progress_callback = progress_callback
        self._previous_digests = previous_digests or {}
        self._delay_seconds = delay_seconds

    def evaluate(self, items: List[BatchItem]) -> BatchSummary:
        """Run evaluation for all items and return aggregate results.

        Items whose :attr:`~BatchItem.content_digest` matches an entry
        in *previous_digests* are skipped.  All other items are sent to
        the LLM adapter, in order.  Errors on individual items are
        captured without aborting the batch.

        Returns:
            A :class:`BatchSummary` holding one result per input item
            plus the success/failure/skip counts and token totals.
        """
        import time as _time

        summary = BatchSummary(total=len(items))
        adapter_called = False

        for idx, item in enumerate(items):
            needs_call = not self._is_unchanged(item)
            # Throttle only between real adapter calls.  Sleeping before
            # skipped items would make an incremental re-run wait
            # delay_seconds per already-classified item for nothing.
            if needs_call and adapter_called and self._delay_seconds > 0:
                _time.sleep(self._delay_seconds)

            result = self._evaluate_one(item)
            adapter_called = adapter_called or needs_call
            summary.results.append(result)

            if result.status == "success":
                summary.succeeded += 1
                # NOTE(review): usage is read as a dict of token counts;
                # a missing/None usage or counter is treated as zero so
                # bookkeeping cannot abort the batch after a good call.
                usage = (result.response.usage if result.response else None) or {}
                summary.total_prompt_tokens += usage.get("prompt_tokens", 0) or 0
                summary.total_completion_tokens += (
                    usage.get("completion_tokens", 0) or 0
                )
            elif result.status == "skipped":
                summary.skipped += 1
            else:
                summary.failed += 1

            if self._progress_callback is not None:
                self._progress_callback(idx + 1, len(items), result)

        return summary

    def _is_unchanged(self, item: BatchItem) -> bool:
        """Return True when *item* has a digest equal to the previous run's."""
        if not item.content_digest:
            # No digest means incremental evaluation is off for this item.
            return False
        return self._previous_digests.get(item.key) == item.content_digest

    def _evaluate_one(self, item: BatchItem) -> BatchResult:
        """Evaluate a single item, handling skip logic and errors."""
        # Incremental: skip if digest unchanged
        if self._is_unchanged(item):
            return BatchResult(
                key=item.key,
                status="skipped",
                metadata=item.metadata,
            )

        try:
            response = self._adapter.execute_prompt(item.prompt, self._config)
        except Exception as exc:
            # Deliberately broad: one failing item must not stop the batch.
            # Some exceptions stringify to "", so fall back to the class
            # name to keep the error message informative.
            return BatchResult(
                key=item.key,
                status="error",
                error=str(exc) or type(exc).__name__,
                metadata=item.metadata,
            )

        return BatchResult(
            key=item.key,
            status="success",
            response=response,
            metadata=item.metadata,
        )
|