generated from coulomb/repo-seed
Add adaptive cost-quality routing primitives
This commit is contained in:
198
tests/test_grading.py
Normal file
198
tests/test_grading.py
Normal file
@@ -0,0 +1,198 @@
|
||||
"""
|
||||
Tests for baseline grading and built-in judges.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from llm_connect.adapter import MockLLMAdapter
|
||||
from llm_connect.embedding_adapter import EmbeddingAdapter
|
||||
from llm_connect.grading import (
|
||||
EmbeddingSimilarityJudge,
|
||||
ExactMatchJudge,
|
||||
GradingResult,
|
||||
LLMJudge,
|
||||
PairedGrader,
|
||||
)
|
||||
from llm_connect.models import LLMResponse, RunConfig
|
||||
|
||||
|
||||
class StaticEmbeddingAdapter(EmbeddingAdapter):
    """Embedding adapter test double that hands back canned vectors.

    The texts passed to the most recent ``embed`` call are kept on
    ``seen_texts`` so a test can assert on what was sent for embedding.
    """

    def __init__(self, embeddings: list[list[float]]):
        """Keep the canned vectors; ``seen_texts`` starts out as ``None``."""
        # NOTE(review): the base-class initialiser is not invoked here;
        # confirm EmbeddingAdapter does not require it.
        self.seen_texts: list[str] | None = None
        self.embeddings = embeddings

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Record ``texts`` and return the canned vectors unchanged."""
        self.seen_texts = texts
        return self.embeddings

    def validate(self) -> bool:
        """Always report this adapter as valid."""
        return True
|
||||
|
||||
|
||||
def response(content: str, model: str = "m") -> LLMResponse:
    """Build an ``LLMResponse`` carrying the given content and model name."""
    fields = {"content": content, "model": model}
    return LLMResponse(**fields)
|
||||
|
||||
|
||||
class TestGradingResult:
    """Constructor validation checks for ``GradingResult``."""

    def test_score_must_be_between_zero_and_one(self):
        """A ``quality_score`` of 1.5 is rejected with a ``ValueError``."""
        baseline_reply = response("a")
        candidate_reply = response("b")

        # Only the constructor call sits inside the raises block.
        with pytest.raises(ValueError, match="quality_score"):
            GradingResult(
                quality_score=1.5,
                notes="bad",
                grader_id="g",
                baseline_response=baseline_reply,
                candidate_response=candidate_reply,
            )

    def test_grader_id_must_be_non_empty(self):
        """An empty ``grader_id`` is rejected with a ``ValueError``."""
        baseline_reply = response("a")
        candidate_reply = response("a")

        with pytest.raises(ValueError, match="grader_id"):
            GradingResult(
                quality_score=1.0,
                notes="ok",
                grader_id="",
                baseline_response=baseline_reply,
                candidate_response=candidate_reply,
            )
|
||||
|
||||
|
||||
class TestExactMatchJudge:
    """Scoring checks for ``ExactMatchJudge``."""

    def test_scores_one_for_normalised_match(self):
        """Identical contents score 1.0 and both responses are kept."""
        # NOTE(review): both inputs are byte-identical, so no normalisation
        # step is actually exercised by this test.
        text = "hello world"
        exact_judge = ExactMatchJudge()

        verdict = exact_judge.judge(
            response(text),
            response(text),
            prompt="p",
            run_config=RunConfig(),
        )

        assert verdict.quality_score == 1.0
        assert verdict.baseline_response.content == text
        assert verdict.candidate_response.content == text

    def test_scores_zero_for_difference(self):
        """Different contents score 0.0."""
        baseline_reply = response("hello")
        candidate_reply = response("goodbye")

        verdict = ExactMatchJudge().judge(
            baseline_reply,
            candidate_reply,
            prompt="p",
            run_config=RunConfig(),
        )

        assert verdict.quality_score == 0.0

    def test_case_insensitive_mode(self):
        """With ``case_sensitive=False`` contents differing only by case score 1.0."""
        relaxed_judge = ExactMatchJudge(case_sensitive=False)

        verdict = relaxed_judge.judge(
            response("Hello"),
            response("hello"),
            prompt="p",
            run_config=RunConfig(),
        )

        assert verdict.quality_score == 1.0
|
||||
|
||||
|
||||
class TestEmbeddingSimilarityJudge:
    """Scoring checks for ``EmbeddingSimilarityJudge`` using canned vectors."""

    def test_scores_cosine_similarity(self):
        """Parallel vectors of different magnitude score 1.0.

        Also checks that the baseline and candidate contents are sent to the
        embedding adapter, in that order.
        """
        embedding_adapter = StaticEmbeddingAdapter([[1.0, 0.0], [0.5, 0.0]])
        result = EmbeddingSimilarityJudge(embedding_adapter).judge(
            response("baseline"),
            response("candidate"),
            prompt="p",
            run_config=RunConfig(),
        )
        # The score is a computed float (dot product over norms), so compare
        # with a tolerance instead of exact equality to avoid a brittle test.
        assert result.quality_score == pytest.approx(1.0)
        assert embedding_adapter.seen_texts == ["baseline", "candidate"]

    def test_negative_similarity_clamps_to_zero(self):
        """Opposite vectors yield a score of exactly 0.0."""
        embedding_adapter = StaticEmbeddingAdapter([[1.0, 0.0], [-1.0, 0.0]])
        result = EmbeddingSimilarityJudge(embedding_adapter).judge(
            response("baseline"),
            response("candidate"),
            prompt="p",
            run_config=RunConfig(),
        )
        # Kept exact on purpose: a clamped value must be precisely 0.0.
        assert result.quality_score == 0.0

    def test_wrong_embedding_count_raises(self):
        """A single returned embedding raises ``ValueError`` ("two embeddings")."""
        embedding_adapter = StaticEmbeddingAdapter([[1.0, 0.0]])
        with pytest.raises(ValueError, match="two embeddings"):
            EmbeddingSimilarityJudge(embedding_adapter).judge(
                response("baseline"),
                response("candidate"),
                prompt="p",
                run_config=RunConfig(),
            )
|
||||
|
||||
|
||||
class TestLLMJudge:
    """Checks for ``LLMJudge`` driven by a ``MockLLMAdapter``."""

    def test_parses_json_judge_response(self):
        """A pure-JSON reply is parsed, and the judge call is configured.

        Verifies the parsed score and notes, that both answers appear in the
        prompt sent to the judge adapter, and the config the adapter received.
        """
        canned_reply = '{"quality_score": 0.75, "notes": "mostly equivalent"}'
        judge_llm = MockLLMAdapter(mock_response=canned_reply)
        caller_config = RunConfig(model_params={"existing": True})

        verdict = LLMJudge(judge_llm).judge(
            response("baseline answer"),
            response("candidate answer"),
            prompt="original prompt",
            run_config=caller_config,
        )

        assert verdict.quality_score == 0.75
        assert verdict.notes == "mostly equivalent"

        sent_prompt = judge_llm.last_prompt
        assert "baseline answer" in sent_prompt
        assert "candidate answer" in sent_prompt

        sent_config = judge_llm.last_config
        assert sent_config.temperature == 0.0
        assert sent_config.model_params["existing"] is True
        assert sent_config.model_params["seed"] == 0
        assert sent_config.budget_tracker is None

    def test_extracts_json_from_wrapped_response(self):
        """JSON embedded after leading prose is still extracted."""
        canned_reply = 'Here is the result: {"quality_score": 1, "notes": "same"}'
        judge_llm = MockLLMAdapter(mock_response=canned_reply)

        verdict = LLMJudge(judge_llm).judge(
            response("a"),
            response("a"),
            prompt="p",
            run_config=RunConfig(),
        )

        assert verdict.quality_score == 1.0
        assert verdict.notes == "same"

    def test_invalid_judge_response_raises(self):
        """A reply containing no JSON raises ``ValueError`` mentioning JSON."""
        judge_llm = MockLLMAdapter(mock_response="not json")
        llm_judge = LLMJudge(judge_llm)

        with pytest.raises(ValueError, match="JSON"):
            llm_judge.judge(
                response("a"),
                response("b"),
                prompt="p",
                run_config=RunConfig(),
            )
|
||||
|
||||
|
||||
class TestPairedGrader:
    """Checks for ``PairedGrader`` with two ``MockLLMAdapter`` instances."""

    def test_runs_both_adapters_and_preserves_responses(self):
        """Each adapter is called once with the prompt; responses are kept."""
        baseline_llm = MockLLMAdapter(mock_response="same")
        candidate_llm = MockLLMAdapter(mock_response="same")
        grader = PairedGrader(ExactMatchJudge())

        outcome = grader.grade(
            baseline_llm,
            candidate_llm,
            "prompt",
            RunConfig(model_name="mock-model"),
        )

        assert outcome.quality_score == 1.0
        assert outcome.baseline_response.content == "same"
        assert outcome.candidate_response.content == "same"

        assert baseline_llm.call_count == 1
        assert candidate_llm.call_count == 1
        assert baseline_llm.last_prompt == "prompt"
        assert candidate_llm.last_prompt == "prompt"

    def test_uses_custom_judge(self):
        """Differing adapter outputs graded via the supplied judge score 0.0."""
        # NOTE(review): the judge supplied here is the built-in
        # ExactMatchJudge, not a custom one as the test name suggests.
        baseline_llm = MockLLMAdapter(mock_response="a")
        candidate_llm = MockLLMAdapter(mock_response="b")
        grader = PairedGrader(ExactMatchJudge())

        outcome = grader.grade(
            baseline_llm,
            candidate_llm,
            "prompt",
            RunConfig(),
        )

        assert outcome.quality_score == 0.0
|
||||
Reference in New Issue
Block a user