# Generated from coulomb/repo-seed
# 199 lines, 6.5 KiB, Python
"""
|
|
Tests for baseline grading and built-in judges.
|
|
"""
|
|
|
|
import pytest
|
|
|
|
from llm_connect.adapter import MockLLMAdapter
|
|
from llm_connect.embedding_adapter import EmbeddingAdapter
|
|
from llm_connect.grading import (
|
|
EmbeddingSimilarityJudge,
|
|
ExactMatchJudge,
|
|
GradingResult,
|
|
LLMJudge,
|
|
PairedGrader,
|
|
)
|
|
from llm_connect.models import LLMResponse, RunConfig
|
|
|
|
|
|
class StaticEmbeddingAdapter(EmbeddingAdapter):
    """Embedding test double that replays canned vectors and records its input."""

    def __init__(self, embeddings: list[list[float]]):
        # Nothing has been embedded yet; ``embed`` overwrites this on each call.
        self.seen_texts: list[str] | None = None
        self.embeddings = embeddings

    def validate(self) -> bool:
        """Always report the adapter as valid."""
        return True

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Remember *texts* and return the canned embeddings, ignoring the input."""
        self.seen_texts = texts
        return self.embeddings
|
|
|
|
|
|
def response(content: str, model: str = "m") -> LLMResponse:
    """Build an ``LLMResponse`` fixture from *content* and *model*."""
    fields = {"content": content, "model": model}
    return LLMResponse(**fields)
|
|
|
|
|
|
class TestGradingResult:
    """Constructor validation checks for ``GradingResult``."""

    def test_score_must_be_between_zero_and_one(self):
        """A ``quality_score`` of 1.5 raises ``ValueError`` naming the field."""
        fields = {
            "quality_score": 1.5,
            "notes": "bad",
            "grader_id": "g",
            "baseline_response": response("a"),
            "candidate_response": response("b"),
        }
        with pytest.raises(ValueError, match="quality_score"):
            GradingResult(**fields)

    def test_grader_id_must_be_non_empty(self):
        """An empty ``grader_id`` raises ``ValueError`` naming the field."""
        fields = {
            "quality_score": 1.0,
            "notes": "ok",
            "grader_id": "",
            "baseline_response": response("a"),
            "candidate_response": response("a"),
        }
        with pytest.raises(ValueError, match="grader_id"):
            GradingResult(**fields)
|
|
|
|
|
|
class TestExactMatchJudge:
    """Scoring behaviour of ``ExactMatchJudge`` on pairs of responses."""

    def test_scores_one_for_normalised_match(self):
        """Matching contents score 1.0 and both responses are kept on the result."""
        # NOTE(review): both inputs are byte-identical here, so this does not
        # exercise any normalisation step on its own -- confirm intended inputs.
        baseline = response("hello world")
        candidate = response("hello world")
        judge = ExactMatchJudge()

        result = judge.judge(baseline, candidate, prompt="p", run_config=RunConfig())

        assert result.quality_score == 1.0
        assert result.baseline_response.content == "hello world"
        assert result.candidate_response.content == "hello world"

    def test_scores_zero_for_difference(self):
        """Different contents score 0.0."""
        baseline = response("hello")
        candidate = response("goodbye")

        result = ExactMatchJudge().judge(
            baseline, candidate, prompt="p", run_config=RunConfig()
        )

        assert result.quality_score == 0.0

    def test_case_insensitive_mode(self):
        """With ``case_sensitive=False``, contents differing only in case score 1.0."""
        judge = ExactMatchJudge(case_sensitive=False)
        baseline = response("Hello")
        candidate = response("hello")

        result = judge.judge(baseline, candidate, prompt="p", run_config=RunConfig())

        assert result.quality_score == 1.0
|
|
|
|
|
|
class TestEmbeddingSimilarityJudge:
    """Tests for ``EmbeddingSimilarityJudge`` driven by canned embeddings."""

    def test_scores_cosine_similarity(self):
        """Parallel embeddings of different magnitude score (approximately) 1.0.

        Also checks that the adapter was asked to embed the baseline text
        followed by the candidate text.
        """
        embedding_adapter = StaticEmbeddingAdapter([[1.0, 0.0], [0.5, 0.0]])
        result = EmbeddingSimilarityJudge(embedding_adapter).judge(
            response("baseline"),
            response("candidate"),
            prompt="p",
            run_config=RunConfig(),
        )
        # The score is a computed floating-point ratio; compare with a
        # tolerance so the test does not depend on exact rounding.
        assert result.quality_score == pytest.approx(1.0)
        assert embedding_adapter.seen_texts == ["baseline", "candidate"]

    def test_negative_similarity_clamps_to_zero(self):
        """Opposite-direction embeddings yield a score of exactly 0.0."""
        embedding_adapter = StaticEmbeddingAdapter([[1.0, 0.0], [-1.0, 0.0]])
        result = EmbeddingSimilarityJudge(embedding_adapter).judge(
            response("baseline"),
            response("candidate"),
            prompt="p",
            run_config=RunConfig(),
        )
        assert result.quality_score == 0.0

    def test_wrong_embedding_count_raises(self):
        """A single returned embedding raises ``ValueError`` ("two embeddings")."""
        embedding_adapter = StaticEmbeddingAdapter([[1.0, 0.0]])
        with pytest.raises(ValueError, match="two embeddings"):
            EmbeddingSimilarityJudge(embedding_adapter).judge(
                response("baseline"),
                response("candidate"),
                prompt="p",
                run_config=RunConfig(),
            )
|
|
|
|
|
|
class TestLLMJudge:
    """Tests for ``LLMJudge`` reply parsing and the request it sends to the judge."""

    def test_parses_json_judge_response(self):
        """A bare JSON reply supplies score and notes; the judge request is checked."""
        reply = '{"quality_score": 0.75, "notes": "mostly equivalent"}'
        judge_adapter = MockLLMAdapter(mock_response=reply)
        run_config = RunConfig(model_params={"existing": True})
        baseline = response("baseline answer")
        candidate = response("candidate answer")

        result = LLMJudge(judge_adapter).judge(
            baseline, candidate, prompt="original prompt", run_config=run_config
        )

        assert result.quality_score == 0.75
        assert result.notes == "mostly equivalent"

        # Both answers must appear in the prompt handed to the judge model.
        for answer in ("baseline answer", "candidate answer"):
            assert answer in judge_adapter.last_prompt

        # Config seen by the judge adapter: temperature 0.0, seed 0 added next
        # to the caller's existing model params, and no budget tracker.
        sent_config = judge_adapter.last_config
        assert sent_config.temperature == 0.0
        assert sent_config.model_params["existing"] is True
        assert sent_config.model_params["seed"] == 0
        assert sent_config.budget_tracker is None

    def test_extracts_json_from_wrapped_response(self):
        """JSON preceded by free text in the reply is still parsed."""
        reply = 'Here is the result: {"quality_score": 1, "notes": "same"}'
        judge_adapter = MockLLMAdapter(mock_response=reply)
        judge = LLMJudge(judge_adapter)

        result = judge.judge(
            response("a"), response("a"), prompt="p", run_config=RunConfig()
        )

        assert result.quality_score == 1.0
        assert result.notes == "same"

    def test_invalid_judge_response_raises(self):
        """A reply containing no JSON raises ``ValueError`` mentioning "JSON"."""
        judge = LLMJudge(MockLLMAdapter(mock_response="not json"))
        with pytest.raises(ValueError, match="JSON"):
            judge.judge(
                response("a"), response("b"), prompt="p", run_config=RunConfig()
            )
|
|
|
|
|
|
class TestPairedGrader:
    """Tests for ``PairedGrader.grade`` using mock baseline/candidate adapters."""

    def test_runs_both_adapters_and_preserves_responses(self):
        """Each adapter is called once with the prompt; outputs land on the result."""
        baseline = MockLLMAdapter(mock_response="same")
        candidate = MockLLMAdapter(mock_response="same")
        grader = PairedGrader(ExactMatchJudge())

        result = grader.grade(
            baseline, candidate, "prompt", RunConfig(model_name="mock-model")
        )

        assert result.quality_score == 1.0
        assert result.baseline_response.content == "same"
        assert result.candidate_response.content == "same"
        for adapter in (baseline, candidate):
            assert adapter.call_count == 1
        for adapter in (baseline, candidate):
            assert adapter.last_prompt == "prompt"

    def test_uses_custom_judge(self):
        """Differing adapter outputs are scored 0.0 by the supplied judge."""
        baseline = MockLLMAdapter(mock_response="a")
        candidate = MockLLMAdapter(mock_response="b")
        grader = PairedGrader(ExactMatchJudge())

        result = grader.grade(baseline, candidate, "prompt", RunConfig())

        assert result.quality_score == 0.0
|