"""Tests for markitect.infospace.evaluate.""" from datetime import datetime from pathlib import Path import pytest from markitect.infospace.config import InfospaceConfig, TopicConfig from markitect.infospace.evaluate import ( build_evaluation_prompt, content_digest, parse_evaluation_response, run_entity_evaluation, ) from markitect.infospace.evaluation import ScoreEntry from markitect.infospace.models import EntityMeta from markitect.prompts.execution.llm_adapter import MockLLMAdapter from markitect.prompts.execution.models import RunConfig # ── Helpers ────────────────────────────────────────────────────────── def _entity(**overrides) -> EntityMeta: defaults = dict( slug="division-of-labour", title="Division Of Labour", h1_raw="Division Of Labour", definition="Splitting work into specialised tasks.", source_chapter="Book I Chapter 1", context="Smith introduces the concept early.", domain="Production", source_path="entities/division-of-labour.md", ) defaults.update(overrides) return EntityMeta(**defaults) def _config() -> InfospaceConfig: return InfospaceConfig(topic=TopicConfig(name="The Wealth of Nations")) _MOCK_RESPONSE = """\ DIMENSION: definition_precision SCORE: 4.5 RATIONALE: Clear and specific definition of the concept. DIMENSION: source_grounding SCORE: 4.0 RATIONALE: Well grounded in Smith's text. DIMENSION: domain_relevance SCORE: 5.0 RATIONALE: Directly relevant to production economics. """ # ── build_evaluation_prompt ────────────────────────────────────────── class TestBuildPrompt: def test_contains_entity_fields(self): entity = _entity() prompt = build_evaluation_prompt(entity, "Test Topic") assert "division-of-labour" in prompt assert "Division Of Labour" in prompt assert "Production" in prompt assert "Splitting work" in prompt def test_contains_topic(self): prompt = build_evaluation_prompt(_entity(), "WoN") assert "WoN" in prompt def test_contains_dimensions(self): prompt = build_evaluation_prompt(_entity(), "T") assert "definition_precision" in prompt assert "source_grounding" in prompt def test_custom_dimensions(self): prompt = build_evaluation_prompt( _entity(), "T", dimensions=["novelty", "coherence"] ) assert "novelty" in prompt assert "coherence" in prompt assert "definition_precision" not in prompt def test_handles_missing_fields(self): entity = _entity(definition="", context="", domain="") prompt = build_evaluation_prompt(entity, "T") assert "(no definition)" in prompt assert "(no context)" in prompt assert "(unspecified)" in prompt # ── content_digest ─────────────────────────────────────────────────── class TestContentDigest: def test_deterministic(self): e = _entity() assert content_digest(e) == content_digest(e) def test_changes_with_content(self): e1 = _entity(definition="A") e2 = _entity(definition="B") assert content_digest(e1) != content_digest(e2) # ── parse_evaluation_response ──────────────────────────────────────── class TestParseResponse: def test_parses_three_dimensions(self): scores = parse_evaluation_response(_MOCK_RESPONSE) assert len(scores) == 3 def test_correct_names(self): scores = parse_evaluation_response(_MOCK_RESPONSE) names = [s.name for s in scores] assert "definition_precision" in names assert "source_grounding" in names assert "domain_relevance" in names def test_correct_scores(self): scores = parse_evaluation_response(_MOCK_RESPONSE) by_name = {s.name: s for s in scores} assert by_name["definition_precision"].value == 4.5 assert by_name["source_grounding"].value == 4.0 assert by_name["domain_relevance"].value == 5.0 def test_correct_rationales(self): scores = parse_evaluation_response(_MOCK_RESPONSE) by_name = {s.name: s for s in scores} assert "Clear" in by_name["definition_precision"].rationale def test_empty_response(self): scores = parse_evaluation_response("") assert scores == [] def test_malformed_score_skipped(self): text = "DIMENSION: x\nSCORE: not-a-number\nRATIONALE: oops" scores = parse_evaluation_response(text) assert len(scores) == 0 # ── run_entity_evaluation ──────────────────────────────────────────── class TestRunEntityEvaluation: def test_evaluates_entities(self, tmp_path): adapter = MockLLMAdapter(_MOCK_RESPONSE) cfg = _config() entities = [_entity(), _entity(slug="pin-factory", title="Pin Factory")] summary = run_entity_evaluation( config=cfg, entities=entities, adapter=adapter, output_dir=tmp_path / "evals", ) assert summary.total == 2 assert summary.succeeded == 2 assert adapter.call_count == 2 def test_writes_evaluation_files(self, tmp_path): adapter = MockLLMAdapter(_MOCK_RESPONSE) cfg = _config() entities = [_entity()] run_entity_evaluation( config=cfg, entities=entities, adapter=adapter, output_dir=tmp_path / "evals", ) eval_file = tmp_path / "evals" / "division-of-labour.md" assert eval_file.exists() text = eval_file.read_text() assert "definition_precision" in text def test_incremental_skip(self, tmp_path): adapter = MockLLMAdapter(_MOCK_RESPONSE) cfg = _config() entity = _entity() digest = content_digest(entity) summary = run_entity_evaluation( config=cfg, entities=[entity], adapter=adapter, output_dir=tmp_path, previous_digests={entity.slug: digest}, ) assert summary.skipped == 1 assert adapter.call_count == 0 def test_progress_callback_called(self, tmp_path): adapter = MockLLMAdapter(_MOCK_RESPONSE) cfg = _config() calls = [] run_entity_evaluation( config=cfg, entities=[_entity()], adapter=adapter, output_dir=tmp_path, progress_callback=lambda d, t, r: calls.append((d, t, r.key)), ) assert len(calls) == 1 assert calls[0] == (1, 1, "division-of-labour") def test_passes_run_config(self, tmp_path): adapter = MockLLMAdapter(_MOCK_RESPONSE) cfg = _config() rc = RunConfig(temperature=0.1, max_tokens=500) run_entity_evaluation( config=cfg, entities=[_entity()], adapter=adapter, run_config=rc, output_dir=tmp_path, ) assert adapter.last_config.temperature == 0.1