""" Tests for baseline grading and built-in judges. """ import pytest from llm_connect.adapter import MockLLMAdapter from llm_connect.embedding_adapter import EmbeddingAdapter from llm_connect.grading import ( EmbeddingSimilarityJudge, ExactMatchJudge, GradingResult, LLMJudge, PairedGrader, ) from llm_connect.models import LLMResponse, RunConfig class StaticEmbeddingAdapter(EmbeddingAdapter): def __init__(self, embeddings: list[list[float]]): self.embeddings = embeddings self.seen_texts: list[str] | None = None def embed(self, texts: list[str]) -> list[list[float]]: self.seen_texts = texts return self.embeddings def validate(self) -> bool: return True def response(content: str, model: str = "m") -> LLMResponse: return LLMResponse(content=content, model=model) class TestGradingResult: def test_score_must_be_between_zero_and_one(self): with pytest.raises(ValueError, match="quality_score"): GradingResult( quality_score=1.5, notes="bad", grader_id="g", baseline_response=response("a"), candidate_response=response("b"), ) def test_grader_id_must_be_non_empty(self): with pytest.raises(ValueError, match="grader_id"): GradingResult( quality_score=1.0, notes="ok", grader_id="", baseline_response=response("a"), candidate_response=response("a"), ) class TestExactMatchJudge: def test_scores_one_for_normalised_match(self): judge = ExactMatchJudge() result = judge.judge( response("hello world"), response("hello world"), prompt="p", run_config=RunConfig(), ) assert result.quality_score == 1.0 assert result.baseline_response.content == "hello world" assert result.candidate_response.content == "hello world" def test_scores_zero_for_difference(self): result = ExactMatchJudge().judge( response("hello"), response("goodbye"), prompt="p", run_config=RunConfig(), ) assert result.quality_score == 0.0 def test_case_insensitive_mode(self): result = ExactMatchJudge(case_sensitive=False).judge( response("Hello"), response("hello"), prompt="p", run_config=RunConfig(), ) assert result.quality_score == 1.0 class TestEmbeddingSimilarityJudge: def test_scores_cosine_similarity(self): embedding_adapter = StaticEmbeddingAdapter([[1.0, 0.0], [0.5, 0.0]]) result = EmbeddingSimilarityJudge(embedding_adapter).judge( response("baseline"), response("candidate"), prompt="p", run_config=RunConfig(), ) assert result.quality_score == 1.0 assert embedding_adapter.seen_texts == ["baseline", "candidate"] def test_negative_similarity_clamps_to_zero(self): embedding_adapter = StaticEmbeddingAdapter([[1.0, 0.0], [-1.0, 0.0]]) result = EmbeddingSimilarityJudge(embedding_adapter).judge( response("baseline"), response("candidate"), prompt="p", run_config=RunConfig(), ) assert result.quality_score == 0.0 def test_wrong_embedding_count_raises(self): embedding_adapter = StaticEmbeddingAdapter([[1.0, 0.0]]) with pytest.raises(ValueError, match="two embeddings"): EmbeddingSimilarityJudge(embedding_adapter).judge( response("baseline"), response("candidate"), prompt="p", run_config=RunConfig(), ) class TestLLMJudge: def test_parses_json_judge_response(self): judge_adapter = MockLLMAdapter( mock_response='{"quality_score": 0.75, "notes": "mostly equivalent"}' ) run_config = RunConfig(model_params={"existing": True}) result = LLMJudge(judge_adapter).judge( response("baseline answer"), response("candidate answer"), prompt="original prompt", run_config=run_config, ) assert result.quality_score == 0.75 assert result.notes == "mostly equivalent" assert "baseline answer" in judge_adapter.last_prompt assert "candidate answer" in judge_adapter.last_prompt assert judge_adapter.last_config.temperature == 0.0 assert judge_adapter.last_config.model_params["existing"] is True assert judge_adapter.last_config.model_params["seed"] == 0 assert judge_adapter.last_config.budget_tracker is None def test_extracts_json_from_wrapped_response(self): judge_adapter = MockLLMAdapter( mock_response='Here is the result: {"quality_score": 1, "notes": "same"}' ) result = LLMJudge(judge_adapter).judge( response("a"), response("a"), prompt="p", run_config=RunConfig(), ) assert result.quality_score == 1.0 assert result.notes == "same" def test_invalid_judge_response_raises(self): judge_adapter = MockLLMAdapter(mock_response="not json") with pytest.raises(ValueError, match="JSON"): LLMJudge(judge_adapter).judge( response("a"), response("b"), prompt="p", run_config=RunConfig(), ) class TestPairedGrader: def test_runs_both_adapters_and_preserves_responses(self): baseline = MockLLMAdapter(mock_response="same") candidate = MockLLMAdapter(mock_response="same") result = PairedGrader(ExactMatchJudge()).grade( baseline, candidate, "prompt", RunConfig(model_name="mock-model"), ) assert result.quality_score == 1.0 assert result.baseline_response.content == "same" assert result.candidate_response.content == "same" assert baseline.call_count == 1 assert candidate.call_count == 1 assert baseline.last_prompt == "prompt" assert candidate.last_prompt == "prompt" def test_uses_custom_judge(self): baseline = MockLLMAdapter(mock_response="a") candidate = MockLLMAdapter(mock_response="b") result = PairedGrader(ExactMatchJudge()).grade( baseline, candidate, "prompt", RunConfig(), ) assert result.quality_score == 0.0