Evaluation pipeline builds prompts from entity metadata, delegates to BatchEvaluator, parses structured LLM responses into ScoreEntry objects, and writes evaluation files. CLI: 'markitect infospace evaluate' with --provider, --entity, --chapter filters. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
225 lines
7.3 KiB
Python
225 lines
7.3 KiB
Python
"""Tests for markitect.infospace.evaluate."""
|
|
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from markitect.infospace.config import InfospaceConfig, TopicConfig
|
|
from markitect.infospace.evaluate import (
|
|
build_evaluation_prompt,
|
|
content_digest,
|
|
parse_evaluation_response,
|
|
run_entity_evaluation,
|
|
)
|
|
from markitect.infospace.evaluation import ScoreEntry
|
|
from markitect.infospace.models import EntityMeta
|
|
from markitect.prompts.execution.llm_adapter import MockLLMAdapter
|
|
from markitect.prompts.execution.models import RunConfig
|
|
|
|
|
|
# ── Helpers ──────────────────────────────────────────────────────────
|
|
|
|
|
|
def _entity(**overrides) -> EntityMeta:
    """Return a sample EntityMeta; keyword arguments replace the sample values."""
    sample = {
        "slug": "division-of-labour",
        "title": "Division Of Labour",
        "h1_raw": "Division Of Labour",
        "definition": "Splitting work into specialised tasks.",
        "source_chapter": "Book I Chapter 1",
        "context": "Smith introduces the concept early.",
        "domain": "Production",
        "source_path": "entities/division-of-labour.md",
    }
    # Later entries win, so caller-supplied overrides take precedence.
    return EntityMeta(**{**sample, **overrides})
|
|
|
|
|
|
def _config() -> InfospaceConfig:
    """Return an InfospaceConfig whose topic is named "The Wealth of Nations"."""
    topic = TopicConfig(name="The Wealth of Nations")
    return InfospaceConfig(topic=topic)
|
|
|
|
|
|
_MOCK_RESPONSE = """\
|
|
DIMENSION: definition_precision
|
|
SCORE: 4.5
|
|
RATIONALE: Clear and specific definition of the concept.
|
|
|
|
DIMENSION: source_grounding
|
|
SCORE: 4.0
|
|
RATIONALE: Well grounded in Smith's text.
|
|
|
|
DIMENSION: domain_relevance
|
|
SCORE: 5.0
|
|
RATIONALE: Directly relevant to production economics.
|
|
"""
|
|
|
|
|
|
# ── build_evaluation_prompt ──────────────────────────────────────────
|
|
|
|
|
|
class TestBuildPrompt:
    """Checks on the text returned by build_evaluation_prompt."""

    def test_contains_entity_fields(self):
        """Slug, title, domain and definition text all appear in the prompt."""
        rendered = build_evaluation_prompt(_entity(), "Test Topic")
        expected_fragments = (
            "division-of-labour",
            "Division Of Labour",
            "Production",
            "Splitting work",
        )
        for fragment in expected_fragments:
            assert fragment in rendered

    def test_contains_topic(self):
        """The topic string passed in is embedded in the prompt."""
        rendered = build_evaluation_prompt(_entity(), "WoN")
        assert "WoN" in rendered

    def test_contains_dimensions(self):
        """Without a dimensions argument, these two dimension names appear."""
        rendered = build_evaluation_prompt(_entity(), "T")
        for dimension in ("definition_precision", "source_grounding"):
            assert dimension in rendered

    def test_custom_dimensions(self):
        """Explicit dimensions appear and definition_precision does not."""
        rendered = build_evaluation_prompt(
            _entity(),
            "T",
            dimensions=["novelty", "coherence"],
        )
        for dimension in ("novelty", "coherence"):
            assert dimension in rendered
        assert "definition_precision" not in rendered

    def test_handles_missing_fields(self):
        """Empty definition, context and domain are rendered as placeholders."""
        blank = _entity(definition="", context="", domain="")
        rendered = build_evaluation_prompt(blank, "T")
        for placeholder in ("(no definition)", "(no context)", "(unspecified)"):
            assert placeholder in rendered
|
|
|
|
|
|
# ── content_digest ───────────────────────────────────────────────────
|
|
|
|
|
|
class TestContentDigest:
    """Properties expected of content_digest."""

    def test_deterministic(self):
        """Digesting the same entity twice gives equal results."""
        entity = _entity()
        first = content_digest(entity)
        second = content_digest(entity)
        assert first == second

    def test_changes_with_content(self):
        """Entities that differ only in definition get different digests."""
        entity_a = _entity(definition="A")
        entity_b = _entity(definition="B")
        digest_a = content_digest(entity_a)
        digest_b = content_digest(entity_b)
        assert digest_a != digest_b
|
|
|
|
|
|
# ── parse_evaluation_response ────────────────────────────────────────
|
|
|
|
|
|
class TestParseResponse:
    """Checks on parse_evaluation_response for well-formed and bad input."""

    def test_parses_three_dimensions(self):
        """The canned response yields three parsed entries."""
        parsed = parse_evaluation_response(_MOCK_RESPONSE)
        assert len(parsed) == 3

    def test_correct_names(self):
        """Every dimension name from the canned response is present."""
        parsed_names = [
            entry.name for entry in parse_evaluation_response(_MOCK_RESPONSE)
        ]
        for expected in (
            "definition_precision",
            "source_grounding",
            "domain_relevance",
        ):
            assert expected in parsed_names

    def test_correct_scores(self):
        """Each entry's value matches the SCORE line of its dimension."""
        entries = {
            entry.name: entry
            for entry in parse_evaluation_response(_MOCK_RESPONSE)
        }
        expected_values = {
            "definition_precision": 4.5,
            "source_grounding": 4.0,
            "domain_relevance": 5.0,
        }
        for name, value in expected_values.items():
            assert entries[name].value == value

    def test_correct_rationales(self):
        """The rationale text is carried over to the parsed entry."""
        entries = {
            entry.name: entry
            for entry in parse_evaluation_response(_MOCK_RESPONSE)
        }
        assert "Clear" in entries["definition_precision"].rationale

    def test_empty_response(self):
        """An empty string parses to an empty list."""
        parsed = parse_evaluation_response("")
        assert parsed == []

    def test_malformed_score_skipped(self):
        """An entry whose SCORE is not numeric produces no result."""
        malformed = "DIMENSION: x\nSCORE: not-a-number\nRATIONALE: oops"
        parsed = parse_evaluation_response(malformed)
        assert len(parsed) == 0
|
|
|
|
|
|
# ── run_entity_evaluation ────────────────────────────────────────────
|
|
|
|
|
|
class TestRunEntityEvaluation:
    """Checks on run_entity_evaluation driven by a MockLLMAdapter."""

    def test_evaluates_entities(self, tmp_path):
        """Two entities give total == succeeded == 2 and two adapter calls."""
        mock_llm = MockLLMAdapter(_MOCK_RESPONSE)
        settings = _config()
        pair = [_entity(), _entity(slug="pin-factory", title="Pin Factory")]

        outcome = run_entity_evaluation(
            config=settings,
            entities=pair,
            adapter=mock_llm,
            output_dir=tmp_path / "evals",
        )

        assert outcome.total == 2
        assert outcome.succeeded == 2
        assert mock_llm.call_count == 2

    def test_writes_evaluation_files(self, tmp_path):
        """A <slug>.md file naming a scored dimension lands in output_dir."""
        mock_llm = MockLLMAdapter(_MOCK_RESPONSE)
        settings = _config()
        out_dir = tmp_path / "evals"

        run_entity_evaluation(
            config=settings,
            entities=[_entity()],
            adapter=mock_llm,
            output_dir=out_dir,
        )

        written = out_dir / "division-of-labour.md"
        assert written.exists()
        assert "definition_precision" in written.read_text()

    def test_incremental_skip(self, tmp_path):
        """A matching previous digest skips the entity without an LLM call."""
        mock_llm = MockLLMAdapter(_MOCK_RESPONSE)
        settings = _config()
        subject = _entity()
        known_digests = {subject.slug: content_digest(subject)}

        outcome = run_entity_evaluation(
            config=settings,
            entities=[subject],
            adapter=mock_llm,
            output_dir=tmp_path,
            previous_digests=known_digests,
        )

        assert outcome.skipped == 1
        assert mock_llm.call_count == 0

    def test_progress_callback_called(self, tmp_path):
        """The callback fires once with (1, 1, result) for a single entity."""
        mock_llm = MockLLMAdapter(_MOCK_RESPONSE)
        settings = _config()
        seen = []

        def record(d, t, r):
            # Keep only the result's key so the tuple is easy to compare.
            seen.append((d, t, r.key))

        run_entity_evaluation(
            config=settings,
            entities=[_entity()],
            adapter=mock_llm,
            output_dir=tmp_path,
            progress_callback=record,
        )

        assert len(seen) == 1
        assert seen[0] == (1, 1, "division-of-labour")

    def test_passes_run_config(self, tmp_path):
        """The supplied RunConfig is the one the adapter last received."""
        mock_llm = MockLLMAdapter(_MOCK_RESPONSE)
        settings = _config()
        run_settings = RunConfig(temperature=0.1, max_tokens=500)

        run_entity_evaluation(
            config=settings,
            entities=[_entity()],
            adapter=mock_llm,
            run_config=run_settings,
            output_dir=tmp_path,
        )

        assert mock_llm.last_config.temperature == 0.1
|