generated from coulomb/repo-seed
Add adaptive cost-quality routing primitives
This commit is contained in:
239
llm_connect/grading.py
Normal file
239
llm_connect/grading.py
Normal file
@@ -0,0 +1,239 @@
|
||||
"""Baseline grading primitives for adaptive routing.
|
||||
|
||||
Graders compare a candidate adapter response against a caller-chosen baseline.
|
||||
They produce normalised quality scores that can be recorded in a
|
||||
``QualityLedger`` and consumed later by adaptive routing policy.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import re
|
||||
from dataclasses import dataclass, field, replace
|
||||
from typing import Any, Protocol
|
||||
|
||||
from llm_connect.adapter import LLMAdapter
|
||||
from llm_connect.embedding_adapter import EmbeddingAdapter
|
||||
from llm_connect.models import LLMResponse, RunConfig
|
||||
from llm_connect.similarity import cosine_similarity
|
||||
|
||||
|
||||
def _validate_score(value: float) -> float:
|
||||
if not isinstance(value, (int, float)):
|
||||
raise ValueError("quality_score must be a number between 0 and 1")
|
||||
score = float(value)
|
||||
if not 0 <= score <= 1:
|
||||
raise ValueError("quality_score must be between 0 and 1")
|
||||
return score
|
||||
|
||||
|
||||
def _normalise_text(text: str) -> str:
|
||||
return " ".join(text.strip().split())
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class GradingResult:
    """Immutable outcome of grading a candidate response against a baseline.

    ``quality_score`` is validated with ``_validate_score`` and stored as a
    float; ``notes`` is coerced to ``str``. ``grader_id`` is only checked for
    being non-blank once stringified; the stored value is left as given.

    Raises:
        ValueError: if ``grader_id`` is blank or ``quality_score`` is invalid.
    """

    quality_score: float
    notes: str
    grader_id: str
    baseline_response: LLMResponse
    candidate_response: LLMResponse

    def __post_init__(self) -> None:
        grader_label = str(self.grader_id).strip()
        if grader_label == "":
            raise ValueError("grader_id must be a non-empty string")

        # The dataclass is frozen, so normalised values have to be written
        # through object.__setattr__ rather than plain attribute assignment.
        assign = object.__setattr__
        assign(self, "quality_score", _validate_score(self.quality_score))
        assign(self, "notes", str(self.notes))
|
||||
|
||||
|
||||
class Judge(Protocol):
    """Structural interface for objects that score a candidate against a baseline.

    Implementations expose a ``grader_id`` attribute and a ``judge`` method.
    """

    grader_id: str

    def judge(
        self,
        baseline_response: LLMResponse,
        candidate_response: LLMResponse,
        *,
        prompt: str,
        run_config: RunConfig,
    ) -> GradingResult:
        """Score ``candidate_response`` relative to ``baseline_response``.

        ``prompt`` and ``run_config`` are keyword-only and describe the
        request that produced both responses.
        """
        ...
|
||||
|
||||
|
||||
class BaselineGrader(Protocol):
    """Structural interface for graders that take two adapters and a prompt.

    An implementation is expected to obtain a response from each adapter and
    return the comparison as a ``GradingResult``.
    """

    def grade(
        self,
        baseline_adapter: LLMAdapter,
        candidate_adapter: LLMAdapter,
        prompt: str,
        run_config: RunConfig,
    ) -> GradingResult:
        """Grade ``candidate_adapter`` against ``baseline_adapter`` for ``prompt``."""
        ...
|
||||
|
||||
|
||||
@dataclass
class ExactMatchJudge:
    """Binary judge: 1.0 when both response texts are equal, otherwise 0.0.

    Before comparison each text is optionally whitespace-normalised
    (``normalize_whitespace``) and, when ``case_sensitive`` is false,
    case-folded.
    """

    normalize_whitespace: bool = True
    case_sensitive: bool = True
    grader_id: str = "exact-match"

    def judge(
        self,
        baseline_response: LLMResponse,
        candidate_response: LLMResponse,
        *,
        prompt: str,
        run_config: RunConfig,
    ) -> GradingResult:
        """Compare the two responses' ``content`` and return a 0.0/1.0 result.

        ``prompt`` and ``run_config`` are accepted to satisfy the judge
        interface but are not consulted.
        """

        def canonical(text: str) -> str:
            # Apply the configured normalisation steps to one side.
            if self.normalize_whitespace:
                text = _normalise_text(text)
            return text if self.case_sensitive else text.casefold()

        identical = canonical(baseline_response.content) == canonical(
            candidate_response.content
        )
        if identical:
            score, remark = 1.0, "exact match"
        else:
            score, remark = 0.0, "candidate content differs from baseline"

        return GradingResult(
            quality_score=score,
            notes=remark,
            grader_id=self.grader_id,
            baseline_response=baseline_response,
            candidate_response=candidate_response,
        )
|
||||
|
||||
|
||||
@dataclass
class EmbeddingSimilarityJudge:
    """Judge that maps cosine similarity between response embeddings to 0..1.

    Both response contents are embedded in a single ``embed`` call on
    ``embedding_adapter``; the cosine similarity of the two vectors is then
    clamped into [0, 1], so negative similarities score 0.0.
    """

    embedding_adapter: EmbeddingAdapter
    grader_id: str = "embedding-similarity"

    def judge(
        self,
        baseline_response: LLMResponse,
        candidate_response: LLMResponse,
        *,
        prompt: str,
        run_config: RunConfig,
    ) -> GradingResult:
        """Score the candidate by embedding similarity to the baseline.

        ``prompt`` and ``run_config`` are accepted to satisfy the judge
        interface but are not consulted.

        Raises:
            ValueError: if the adapter does not return exactly two embeddings,
                or if the computed similarity is NaN or infinite.
        """
        import math  # function-scope so this block does not rely on the module import list

        embeddings = self.embedding_adapter.embed(
            [baseline_response.content, candidate_response.content]
        )
        if len(embeddings) != 2:
            raise ValueError("EmbeddingSimilarityJudge expected exactly two embeddings")

        raw_similarity = cosine_similarity(embeddings[0], embeddings[1])
        # Every comparison with NaN is false, so min(1.0, nan) returns 1.0 and
        # the clamp below would record an undefined similarity as a perfect
        # score. Reject non-finite values explicitly instead.
        if not math.isfinite(raw_similarity):
            raise ValueError(
                f"EmbeddingSimilarityJudge got a non-finite cosine similarity: {raw_similarity}"
            )

        quality_score = max(0.0, min(1.0, raw_similarity))
        return GradingResult(
            quality_score=quality_score,
            notes=f"cosine similarity {raw_similarity:.4f}",
            grader_id=self.grader_id,
            baseline_response=baseline_response,
            candidate_response=candidate_response,
        )
|
||||
|
||||
|
||||
@dataclass
class LLMJudge:
    """LLM-as-judge wrapper using a fixed rubric prompt and JSON response.

    The judge adapter is sent the rubric followed by the original prompt and
    both responses, and is expected to answer with a JSON object holding
    ``quality_score`` (a number in [0, 1]) and optionally ``notes``.
    """

    judge_adapter: LLMAdapter
    rubric: str = (
        "Compare the candidate response to the baseline response. "
        "Return JSON only with keys quality_score and notes. "
        "quality_score must be a number from 0 to 1."
    )
    grader_id: str = "llm-judge"
    seed: int | None = 0

    def judge(
        self,
        baseline_response: LLMResponse,
        candidate_response: LLMResponse,
        *,
        prompt: str,
        run_config: RunConfig,
    ) -> GradingResult:
        """Ask the judge adapter to score the candidate against the baseline.

        Raises:
            ValueError: if the judge's reply contains no parseable JSON
                object or its ``quality_score`` is missing or invalid.
        """
        judge_prompt = self._build_prompt(prompt, baseline_response, candidate_response)
        judge_config = self._judge_config(run_config)
        response = self.judge_adapter.execute_prompt(judge_prompt, judge_config)
        parsed = self._parse_judge_response(response.content)
        return GradingResult(
            quality_score=parsed["quality_score"],
            notes=parsed["notes"],
            grader_id=self.grader_id,
            baseline_response=baseline_response,
            candidate_response=candidate_response,
        )

    def _judge_config(self, run_config: RunConfig) -> RunConfig:
        """Derive the run configuration used for the judging call.

        Returns a copy of ``run_config`` with temperature forced to 0.0 and
        ``budget_tracker`` cleared. ``self.seed`` is added to the model
        params only when the caller has not already supplied a ``seed``.
        """
        # Copy so the caller's model_params mapping is never mutated.
        params: dict[str, Any] = dict(run_config.model_params)
        if self.seed is not None:
            params.setdefault("seed", self.seed)
        return replace(run_config, temperature=0.0, model_params=params, budget_tracker=None)

    def _build_prompt(
        self,
        prompt: str,
        baseline_response: LLMResponse,
        candidate_response: LLMResponse,
    ) -> str:
        """Assemble the rubric, original prompt and both responses into one prompt."""
        return (
            f"{self.rubric}\n\n"
            f"Original prompt:\n{prompt}\n\n"
            f"Baseline response:\n{baseline_response.content}\n\n"
            f"Candidate response:\n{candidate_response.content}\n"
        )

    def _parse_judge_response(self, content: str) -> dict[str, Any]:
        """Extract ``quality_score`` and ``notes`` from the judge's reply.

        The whole reply is tried as JSON first; if that fails, the first JSON
        object embedded in the surrounding text is used instead.

        Raises:
            ValueError: if no JSON object can be decoded, the decoded value
                is not an object, or ``quality_score`` fails validation.
        """
        try:
            data = json.loads(content)
        except json.JSONDecodeError:
            data = self._extract_embedded_object(content)

        if not isinstance(data, dict):
            raise ValueError("LLMJudge response JSON must be an object")
        return {
            "quality_score": _validate_score(data.get("quality_score")),
            "notes": str(data.get("notes", "")),
        }

    @staticmethod
    def _extract_embedded_object(content: str) -> Any:
        """Return the first JSON object that decodes from inside *content*.

        A greedy first-``{``-to-last-``}`` match breaks as soon as the text
        after the object contains another ``}``. Decoding from each ``{`` in
        turn stops at the end of the object itself and ignores what follows.
        """
        start = content.find("{")
        if start == -1 or content.find("}", start) == -1:
            raise ValueError("LLMJudge response did not contain JSON") from None

        decoder = json.JSONDecoder()
        last_error: json.JSONDecodeError | None = None
        while start != -1:
            try:
                candidate, _end = decoder.raw_decode(content, start)
            except json.JSONDecodeError as exc:
                last_error = exc
                start = content.find("{", start + 1)
            else:
                return candidate
        raise ValueError("LLMJudge response JSON could not be parsed") from last_error
|
||||
|
||||
|
||||
@dataclass
class PairedGrader:
    """Grader that queries both adapters and hands the pair to ``judge``.

    ``judge`` defaults to a fresh ``ExactMatchJudge`` per instance.
    """

    judge: Judge = field(default_factory=ExactMatchJudge)

    def grade(
        self,
        baseline_adapter: LLMAdapter,
        candidate_adapter: LLMAdapter,
        prompt: str,
        run_config: RunConfig,
    ) -> GradingResult:
        """Run ``prompt`` on both adapters and return the judge's verdict.

        The baseline adapter is called first, then the candidate adapter,
        both with the same ``prompt`` and ``run_config``.
        """
        reference = baseline_adapter.execute_prompt(prompt, run_config)
        contender = candidate_adapter.execute_prompt(prompt, run_config)
        verdict = self.judge.judge(
            reference,
            contender,
            prompt=prompt,
            run_config=run_config,
        )
        return verdict
|
||||
Reference in New Issue
Block a user