Files
llm-connect/tests/test_grading.py
tegwick c4ad4bb9f2
Some checks failed
CI / test (3.10) (push) Has been cancelled
CI / test (3.11) (push) Has been cancelled
CI / test (3.12) (push) Has been cancelled
Add adaptive cost-quality routing primitives
2026-05-17 21:32:27 +02:00

199 lines
6.5 KiB
Python

"""
Tests for baseline grading and built-in judges.
"""
import pytest
from llm_connect.adapter import MockLLMAdapter
from llm_connect.embedding_adapter import EmbeddingAdapter
from llm_connect.grading import (
EmbeddingSimilarityJudge,
ExactMatchJudge,
GradingResult,
LLMJudge,
PairedGrader,
)
from llm_connect.models import LLMResponse, RunConfig
class StaticEmbeddingAdapter(EmbeddingAdapter):
    """Embedding adapter stub that returns canned vectors and records its input."""

    def __init__(self, embeddings: list[list[float]]):
        # Nothing has been embedded yet; ``embed`` overwrites this on every call.
        self.seen_texts: list[str] | None = None
        self.embeddings = embeddings

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Remember ``texts`` and return the preconfigured embeddings unchanged."""
        self.seen_texts = texts
        return self.embeddings

    def validate(self) -> bool:
        """Always report the adapter as valid."""
        return True
def response(content: str, model: str = "m") -> LLMResponse:
    """Build an ``LLMResponse`` from ``content`` and ``model`` (default ``"m"``)."""
    fields = {"content": content, "model": model}
    return LLMResponse(**fields)
class TestGradingResult:
    """Constructor validation checks for ``GradingResult``."""

    def test_score_must_be_between_zero_and_one(self):
        """A ``quality_score`` of 1.5 raises ``ValueError`` naming the field."""
        out_of_range = {
            "quality_score": 1.5,
            "notes": "bad",
            "grader_id": "g",
            "baseline_response": response("a"),
            "candidate_response": response("b"),
        }
        with pytest.raises(ValueError, match="quality_score"):
            GradingResult(**out_of_range)

    def test_grader_id_must_be_non_empty(self):
        """An empty ``grader_id`` raises ``ValueError`` naming the field."""
        blank_grader = {
            "quality_score": 1.0,
            "notes": "ok",
            "grader_id": "",
            "baseline_response": response("a"),
            "candidate_response": response("a"),
        }
        with pytest.raises(ValueError, match="grader_id"):
            GradingResult(**blank_grader)
class TestExactMatchJudge:
    """Scoring checks for ``ExactMatchJudge``."""

    def test_scores_one_for_normalised_match(self):
        """Identical contents score 1.0 and both responses are carried through."""
        baseline = response("hello world")
        candidate = response("hello world")

        verdict = ExactMatchJudge().judge(
            baseline, candidate, prompt="p", run_config=RunConfig()
        )

        assert verdict.quality_score == 1.0
        assert verdict.baseline_response.content == "hello world"
        assert verdict.candidate_response.content == "hello world"

    def test_scores_zero_for_difference(self):
        """Different contents score 0.0."""
        baseline = response("hello")
        candidate = response("goodbye")

        verdict = ExactMatchJudge().judge(
            baseline, candidate, prompt="p", run_config=RunConfig()
        )

        assert verdict.quality_score == 0.0

    def test_case_insensitive_mode(self):
        """With ``case_sensitive=False``, contents differing only in case score 1.0."""
        relaxed_judge = ExactMatchJudge(case_sensitive=False)
        baseline = response("Hello")
        candidate = response("hello")

        verdict = relaxed_judge.judge(
            baseline, candidate, prompt="p", run_config=RunConfig()
        )

        assert verdict.quality_score == 1.0
class TestEmbeddingSimilarityJudge:
    """Tests for ``EmbeddingSimilarityJudge`` driven by a static embedding stub."""

    def test_scores_cosine_similarity(self):
        """Parallel vectors score 1.0 and both response contents are embedded."""
        embedding_adapter = StaticEmbeddingAdapter([[1.0, 0.0], [0.5, 0.0]])
        result = EmbeddingSimilarityJudge(embedding_adapter).judge(
            response("baseline"),
            response("candidate"),
            prompt="p",
            run_config=RunConfig(),
        )
        # The score comes out of floating-point arithmetic, so compare with a
        # tolerance rather than relying on bit-exact equality.
        assert result.quality_score == pytest.approx(1.0)
        assert embedding_adapter.seen_texts == ["baseline", "candidate"]

    def test_negative_similarity_clamps_to_zero(self):
        """Opposite vectors are expected to yield a score of 0.0, not a negative one."""
        embedding_adapter = StaticEmbeddingAdapter([[1.0, 0.0], [-1.0, 0.0]])
        result = EmbeddingSimilarityJudge(embedding_adapter).judge(
            response("baseline"),
            response("candidate"),
            prompt="p",
            run_config=RunConfig(),
        )
        # Tolerant comparison for the same reason as above.
        assert result.quality_score == pytest.approx(0.0)

    def test_wrong_embedding_count_raises(self):
        """A single returned embedding raises ``ValueError`` mentioning "two embeddings"."""
        embedding_adapter = StaticEmbeddingAdapter([[1.0, 0.0]])
        with pytest.raises(ValueError, match="two embeddings"):
            EmbeddingSimilarityJudge(embedding_adapter).judge(
                response("baseline"),
                response("candidate"),
                prompt="p",
                run_config=RunConfig(),
            )
class TestLLMJudge:
    """Checks for ``LLMJudge`` using a mock adapter as the judging model."""

    def test_parses_json_judge_response(self):
        """A pure-JSON reply is parsed, and the judge call's prompt/config are checked."""
        judge_adapter = MockLLMAdapter(
            mock_response='{"quality_score": 0.75, "notes": "mostly equivalent"}'
        )
        baseline = response("baseline answer")
        candidate = response("candidate answer")

        verdict = LLMJudge(judge_adapter).judge(
            baseline,
            candidate,
            prompt="original prompt",
            run_config=RunConfig(model_params={"existing": True}),
        )

        assert verdict.quality_score == 0.75
        assert verdict.notes == "mostly equivalent"

        # Both answers must appear in the prompt sent to the judge model.
        sent_prompt = judge_adapter.last_prompt
        assert "baseline answer" in sent_prompt
        assert "candidate answer" in sent_prompt

        # The config seen by the judge adapter: temperature 0.0, seed 0, the
        # caller's model_params kept, and no budget tracker attached.
        sent_config = judge_adapter.last_config
        assert sent_config.temperature == 0.0
        assert sent_config.model_params["existing"] is True
        assert sent_config.model_params["seed"] == 0
        assert sent_config.budget_tracker is None

    def test_extracts_json_from_wrapped_response(self):
        """JSON preceded by free text in the reply is still extracted."""
        judge_adapter = MockLLMAdapter(
            mock_response='Here is the result: {"quality_score": 1, "notes": "same"}'
        )
        baseline = response("a")
        candidate = response("a")

        verdict = LLMJudge(judge_adapter).judge(
            baseline, candidate, prompt="p", run_config=RunConfig()
        )

        assert verdict.quality_score == 1.0
        assert verdict.notes == "same"

    def test_invalid_judge_response_raises(self):
        """A reply containing no JSON raises ``ValueError`` mentioning "JSON"."""
        judge = LLMJudge(MockLLMAdapter(mock_response="not json"))
        baseline = response("a")
        candidate = response("b")

        with pytest.raises(ValueError, match="JSON"):
            judge.judge(baseline, candidate, prompt="p", run_config=RunConfig())
class TestPairedGrader:
    """Checks for ``PairedGrader.grade`` with two mock adapters."""

    def test_runs_both_adapters_and_preserves_responses(self):
        """Each adapter is called once with the prompt and its response is kept."""
        baseline = MockLLMAdapter(mock_response="same")
        candidate = MockLLMAdapter(mock_response="same")
        grader = PairedGrader(ExactMatchJudge())

        outcome = grader.grade(
            baseline, candidate, "prompt", RunConfig(model_name="mock-model")
        )

        assert outcome.quality_score == 1.0
        assert outcome.baseline_response.content == "same"
        assert outcome.candidate_response.content == "same"

        # Each adapter is invoked exactly once, with the original prompt.
        for adapter in (baseline, candidate):
            assert adapter.call_count == 1
        for adapter in (baseline, candidate):
            assert adapter.last_prompt == "prompt"

    def test_uses_custom_judge(self):
        """Differing adapter outputs score 0.0 through the supplied judge.

        The judge passed here is the built-in ``ExactMatchJudge``.
        """
        baseline = MockLLMAdapter(mock_response="a")
        candidate = MockLLMAdapter(mock_response="b")
        grader = PairedGrader(ExactMatchJudge())

        outcome = grader.grade(baseline, candidate, "prompt", RunConfig())

        assert outcome.quality_score == 0.0