feat(infospace): add per-entity evaluation pipeline and CLI command (S2.3)

The evaluation pipeline builds prompts from entity metadata, delegates to BatchEvaluator, parses structured LLM responses into ScoreEntry objects, and writes evaluation files. CLI: 'markitect infospace evaluate' with --provider, --entity, and --chapter filters.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-19 01:48:34 +01:00
parent 3726503adb
commit 3461d2f354
3 changed files with 504 additions and 0 deletions
--- a/tests/unit/infospace/test_evaluate.py
+++ b/tests/unit/infospace/test_evaluate.py
@@ -0,0 +1,224 @@
+"""Tests for markitect.infospace.evaluate."""
+
+from datetime import datetime
+from pathlib import Path
+
+import pytest
+
+from markitect.infospace.config import InfospaceConfig, TopicConfig
+from markitect.infospace.evaluate import (
+    build_evaluation_prompt,
+    content_digest,
+    parse_evaluation_response,
+    run_entity_evaluation,
+)
+from markitect.infospace.evaluation import ScoreEntry
+from markitect.infospace.models import EntityMeta
+from markitect.prompts.execution.llm_adapter import MockLLMAdapter
+from markitect.prompts.execution.models import RunConfig
+
+
+# ── Helpers ──────────────────────────────────────────────────────────
+
+
+def _entity(**overrides) -> EntityMeta:  # Test factory: sample EntityMeta, fields overridable by keyword.
+    defaults = dict(
+        slug="division-of-labour",
+        title="Division Of Labour",
+        h1_raw="Division Of Labour",
+        definition="Splitting work into specialised tasks.",
+        source_chapter="Book I Chapter 1",
+        context="Smith introduces the concept early.",
+        domain="Production",
+        source_path="entities/division-of-labour.md",
+    )
+    defaults.update(overrides)  # caller-supplied values replace the sample defaults
+    return EntityMeta(**defaults)
+
+
+def _config() -> InfospaceConfig:  # Test config: only the topic name is set explicitly.
+    return InfospaceConfig(topic=TopicConfig(name="The Wealth of Nations"))
+
+
+_MOCK_RESPONSE = """\
+DIMENSION: definition_precision
+SCORE: 4.5
+RATIONALE: Clear and specific definition of the concept.
+
+DIMENSION: source_grounding
+SCORE: 4.0
+RATIONALE: Well grounded in Smith's text.
+
+DIMENSION: domain_relevance
+SCORE: 5.0
+RATIONALE: Directly relevant to production economics.
+"""  # Canned reply: three DIMENSION/SCORE/RATIONALE records, used by the parser tests and MockLLMAdapter.
+
+
+# ── build_evaluation_prompt ──────────────────────────────────────────
+
+
+class TestBuildPrompt:  # build_evaluation_prompt: checks on the text of the generated prompt.
+    def test_contains_entity_fields(self):  # slug, title, domain and definition text all appear
+        entity = _entity()
+        prompt = build_evaluation_prompt(entity, "Test Topic")
+        assert "division-of-labour" in prompt
+        assert "Division Of Labour" in prompt
+        assert "Production" in prompt
+        assert "Splitting work" in prompt
+
+    def test_contains_topic(self):  # the topic argument is echoed into the prompt
+        prompt = build_evaluation_prompt(_entity(), "WoN")
+        assert "WoN" in prompt
+
+    def test_contains_dimensions(self):  # with no dimensions argument, these two names are present
+        prompt = build_evaluation_prompt(_entity(), "T")
+        assert "definition_precision" in prompt
+        assert "source_grounding" in prompt
+
+    def test_custom_dimensions(self):  # explicit dimensions are listed; definition_precision is then absent
+        prompt = build_evaluation_prompt(
+            _entity(), "T", dimensions=["novelty", "coherence"]
+        )
+        assert "novelty" in prompt
+        assert "coherence" in prompt
+        assert "definition_precision" not in prompt
+
+    def test_handles_missing_fields(self):  # empty definition/context/domain yield placeholder text
+        entity = _entity(definition="", context="", domain="")
+        prompt = build_evaluation_prompt(entity, "T")
+        assert "(no definition)" in prompt
+        assert "(no context)" in prompt
+        assert "(unspecified)" in prompt
+
+
+# ── content_digest ───────────────────────────────────────────────────
+
+
+class TestContentDigest:  # content_digest: stability and sensitivity checks.
+    def test_deterministic(self):  # the same entity digested twice gives equal results
+        e = _entity()
+        assert content_digest(e) == content_digest(e)
+
+    def test_changes_with_content(self):  # entities differing only in definition get different digests
+        e1 = _entity(definition="A")
+        e2 = _entity(definition="B")
+        assert content_digest(e1) != content_digest(e2)
+
+
+# ── parse_evaluation_response ────────────────────────────────────────
+
+
+class TestParseResponse:  # parse_evaluation_response: turning response text into score objects.
+    def test_parses_three_dimensions(self):  # one score per DIMENSION record in the canned reply
+        scores = parse_evaluation_response(_MOCK_RESPONSE)
+        assert len(scores) == 3
+
+    def test_correct_names(self):  # each score's .name is the DIMENSION value
+        scores = parse_evaluation_response(_MOCK_RESPONSE)
+        names = [s.name for s in scores]
+        assert "definition_precision" in names
+        assert "source_grounding" in names
+        assert "domain_relevance" in names
+
+    def test_correct_scores(self):  # .value holds the SCORE figure, compared against float literals
+        scores = parse_evaluation_response(_MOCK_RESPONSE)
+        by_name = {s.name: s for s in scores}
+        assert by_name["definition_precision"].value == 4.5
+        assert by_name["source_grounding"].value == 4.0
+        assert by_name["domain_relevance"].value == 5.0
+
+    def test_correct_rationales(self):  # .rationale carries the RATIONALE text
+        scores = parse_evaluation_response(_MOCK_RESPONSE)
+        by_name = {s.name: s for s in scores}
+        assert "Clear" in by_name["definition_precision"].rationale
+
+    def test_empty_response(self):  # empty input gives an empty list
+        scores = parse_evaluation_response("")
+        assert scores == []
+
+    def test_malformed_score_skipped(self):  # a record with a non-numeric SCORE produces no entry
+        text = "DIMENSION: x\nSCORE: not-a-number\nRATIONALE: oops"
+        scores = parse_evaluation_response(text)
+        assert len(scores) == 0
+
+
+# ── run_entity_evaluation ────────────────────────────────────────────
+
+
+class TestRunEntityEvaluation:  # run_entity_evaluation driven by a MockLLMAdapter, writing under tmp_path.
+    def test_evaluates_entities(self, tmp_path):  # two entities -> total 2, succeeded 2, two adapter calls
+        adapter = MockLLMAdapter(_MOCK_RESPONSE)
+        cfg = _config()
+        entities = [_entity(), _entity(slug="pin-factory", title="Pin Factory")]
+
+        summary = run_entity_evaluation(
+            config=cfg,
+            entities=entities,
+            adapter=adapter,
+            output_dir=tmp_path / "evals",
+        )
+        assert summary.total == 2
+        assert summary.succeeded == 2
+        assert adapter.call_count == 2
+
+    def test_writes_evaluation_files(self, tmp_path):  # expects <output_dir>/<slug>.md containing a dimension name
+        adapter = MockLLMAdapter(_MOCK_RESPONSE)
+        cfg = _config()
+        entities = [_entity()]
+
+        run_entity_evaluation(
+            config=cfg,
+            entities=entities,
+            adapter=adapter,
+            output_dir=tmp_path / "evals",
+        )
+        eval_file = tmp_path / "evals" / "division-of-labour.md"
+        assert eval_file.exists()
+        text = eval_file.read_text()
+        assert "definition_precision" in text
+
+    def test_incremental_skip(self, tmp_path):  # digest already in previous_digests -> skipped, adapter not called
+        adapter = MockLLMAdapter(_MOCK_RESPONSE)
+        cfg = _config()
+        entity = _entity()
+        digest = content_digest(entity)
+
+        summary = run_entity_evaluation(
+            config=cfg,
+            entities=[entity],
+            adapter=adapter,
+            output_dir=tmp_path,
+            previous_digests={entity.slug: digest},
+        )
+        assert summary.skipped == 1
+        assert adapter.call_count == 0
+
+    def test_progress_callback_called(self, tmp_path):  # callback fires once; its three args are recorded below
+        adapter = MockLLMAdapter(_MOCK_RESPONSE)
+        cfg = _config()
+        calls = []
+
+        run_entity_evaluation(
+            config=cfg,
+            entities=[_entity()],
+            adapter=adapter,
+            output_dir=tmp_path,
+            progress_callback=lambda d, t, r: calls.append((d, t, r.key)),
+        )
+        assert len(calls) == 1
+        assert calls[0] == (1, 1, "division-of-labour")
+
+    def test_passes_run_config(self, tmp_path):  # the supplied RunConfig reaches the adapter (temperature checked)
+        adapter = MockLLMAdapter(_MOCK_RESPONSE)
+        cfg = _config()
+        rc = RunConfig(temperature=0.1, max_tokens=500)
+
+        run_entity_evaluation(
+            config=cfg,
+            entities=[_entity()],
+            adapter=adapter,
+            run_config=rc,
+            output_dir=tmp_path,
+        )
+        assert adapter.last_config.temperature == 0.1