Files
markitect-main/tests/unit/infospace/test_evaluation.py
tegwick f8c9ab33f0 feat(infospace): add structured evaluation output with history and diffing (S1.5)
Add data models (ScoreEntry, EntityEvaluation, EvaluationSnapshot,
SnapshotDiff) and I/O utilities for YAML frontmatter evaluation files,
snapshot persistence, history append, and snapshot diffing.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-19 01:35:22 +01:00

399 lines
14 KiB
Python

"""Tests for markitect.infospace evaluation models and I/O."""
from datetime import datetime
from pathlib import Path
import pytest
from markitect.infospace import (
EntityEvaluation,
EvaluationSnapshot,
MetricChange,
MetricValue,
ScoreChange,
ScoreEntry,
SnapshotDiff,
append_to_history,
diff_snapshots,
read_entity_evaluation,
read_history,
read_snapshot,
write_entity_evaluation,
write_snapshot,
)
# ── Helpers ──────────────────────────────────────────────────────────
# Fixed, timezone-naive timestamp reused by the fixtures below so that
# round-trip comparisons are deterministic.
_NOW = datetime(year=2026, month=2, day=19, hour=12)
def _sample_scores() -> list:
    """Build the three ``ScoreEntry`` objects used by the evaluation fixtures.

    The first two entries carry a rationale; the third is created without one.
    """
    precision = ScoreEntry(
        "definition_precision", 4.5, rationale="Clear and specific."
    )
    grounding = ScoreEntry("source_grounding", 4.0, rationale="Well grounded.")
    relevance = ScoreEntry("domain_relevance", 4.5)
    return [precision, grounding, relevance]
def _sample_evaluation(**overrides) -> EntityEvaluation:
    """Return an ``EntityEvaluation`` fixture.

    Any keyword argument replaces the default value of the same name before
    the object is constructed.
    """
    fields = {
        "entity_slug": "division-of-labour",
        "evaluator": "openrouter/anthropic/claude-3.5-sonnet",
        "scores": _sample_scores(),
        "evaluated_at": _NOW,
        "notes": ["Strong entity with clear provenance"],
    }
    fields.update(overrides)
    return EntityEvaluation(**fields)
def _sample_metric() -> MetricValue:
    """Return the ``coverage_ratio`` metric fixture (value 0.85, concern C2)."""
    metric = MetricValue(
        "coverage_ratio",
        0.85,
        concern="C2",
        details={"checked": 85},
    )
    return metric
def _sample_snapshot(**overrides) -> EvaluationSnapshot:
    """Return an ``EvaluationSnapshot`` fixture with one evaluation and one metric.

    Any keyword argument replaces the default value of the same name before
    the object is constructed.
    """
    fields = {
        "snapshot_id": "2026-02-19",
        "created_at": _NOW,
        "schema_name": "Economic Entity",
        "entity_count": 1,
        "entity_evaluations": [_sample_evaluation()],
        "collection_metrics": [_sample_metric()],
        "metadata": {"version": "1.0"},
    }
    fields.update(overrides)
    return EvaluationSnapshot(**fields)
# ── Model tests ──────────────────────────────────────────────────────
class TestScoreEntry:
    """Dictionary serialization of ``ScoreEntry``."""

    def test_to_dict_from_dict_round_trip(self):
        """All four fields survive ``to_dict`` followed by ``from_dict``."""
        original = ScoreEntry("precision", 4.5, 5.0, "Good definition.")
        clone = ScoreEntry.from_dict(original.to_dict())
        for field in ("name", "value", "max_value", "rationale"):
            assert getattr(clone, field) == getattr(original, field)

    def test_to_dict_omits_empty_rationale(self):
        """An entry built without a rationale serializes without that key."""
        payload = ScoreEntry("precision", 4.5).to_dict()
        assert "rationale" not in payload

    def test_from_dict_defaults(self):
        """Missing ``max_value`` and ``rationale`` fall back to 5.0 and ''."""
        entry = ScoreEntry.from_dict({"name": "x", "value": 3.0})
        assert entry.max_value == 5.0
        assert entry.rationale == ""
class TestEntityEvaluation:
    """Overall-score computation and dict serialization of ``EntityEvaluation``."""

    def test_overall_score_is_mean(self):
        """The sample scores average to (4.5 + 4.0 + 4.5) / 3 ≈ 4.333."""
        evaluation = _sample_evaluation()
        deviation = abs(evaluation.overall_score - 4.333333)
        assert deviation < 0.001

    def test_overall_score_zero_scores(self):
        """With an empty score list the overall score is 0.0."""
        evaluation = _sample_evaluation(scores=[])
        assert evaluation.overall_score == 0.0

    def test_to_dict_from_dict_round_trip(self):
        """Scalar fields, notes and the number of scores survive a dict round trip."""
        original = _sample_evaluation()
        clone = EntityEvaluation.from_dict(original.to_dict())
        assert clone.entity_slug == original.entity_slug
        assert clone.evaluator == original.evaluator
        assert len(clone.scores) == len(original.scores)
        assert clone.evaluated_at == original.evaluated_at
        assert clone.notes == original.notes

    def test_to_dict_includes_overall_score(self):
        """The serialized dict carries ``overall_score`` close to 4.3333."""
        payload = _sample_evaluation().to_dict()
        assert "overall_score" in payload
        deviation = abs(payload["overall_score"] - 4.3333)
        assert deviation < 0.01
class TestMetricValue:
    """Dictionary serialization of ``MetricValue``."""

    def test_to_dict_from_dict_round_trip(self):
        """Name, value, concern and details survive a dict round trip."""
        original = _sample_metric()
        clone = MetricValue.from_dict(original.to_dict())
        for field in ("name", "value", "concern", "details"):
            assert getattr(clone, field) == getattr(original, field)

    def test_to_dict_omits_empty_concern(self):
        """A metric built with only name and value has no concern/details keys."""
        payload = MetricValue("x", 1.0).to_dict()
        assert "concern" not in payload
        assert "details" not in payload
class TestEvaluationSnapshot:
    """Dictionary serialization of ``EvaluationSnapshot``."""

    def test_to_dict_from_dict_round_trip(self):
        """Scalar fields, metadata and collection sizes survive a dict round trip."""
        original = _sample_snapshot()
        clone = EvaluationSnapshot.from_dict(original.to_dict())
        assert clone.snapshot_id == original.snapshot_id
        assert clone.created_at == original.created_at
        assert clone.schema_name == original.schema_name
        assert clone.entity_count == original.entity_count
        assert len(clone.entity_evaluations) == 1
        assert len(clone.collection_metrics) == 1
        assert clone.metadata == original.metadata

    def test_from_dict_empty_lists(self):
        """Omitted collections and metadata come back as empty containers."""
        minimal = {
            "snapshot_id": "test",
            "created_at": _NOW.isoformat(),
            "schema_name": "Test",
            "entity_count": 0,
        }
        snapshot = EvaluationSnapshot.from_dict(minimal)
        assert snapshot.entity_evaluations == []
        assert snapshot.collection_metrics == []
        assert snapshot.metadata == {}
# ── Per-entity file I/O ──────────────────────────────────────────────
class TestEntityEvaluationIO:
    """Write/read tests for per-entity evaluation files.

    Every test writes the sample evaluation below pytest's ``tmp_path`` and
    inspects either the raw file text or the object read back from it.
    """

    def test_write_creates_file(self, tmp_path):
        """Writing an evaluation leaves a file at the target path."""
        ev = _sample_evaluation()
        p = tmp_path / "eval.md"
        write_entity_evaluation(ev, p)
        assert p.exists()

    def test_file_has_yaml_frontmatter(self, tmp_path):
        """The file opens with a ``---`` line and contains a closing ``---`` line."""
        ev = _sample_evaluation()
        p = tmp_path / "eval.md"
        write_entity_evaluation(ev, p)
        text = p.read_text()
        assert text.startswith("---\n")
        assert "\n---\n" in text

    def test_frontmatter_contains_expected_keys(self, tmp_path):
        """Each expected key name appears somewhere in the written text."""
        ev = _sample_evaluation()
        p = tmp_path / "eval.md"
        write_entity_evaluation(ev, p)
        text = p.read_text()
        for key in ["entity_slug", "evaluator", "evaluated_at", "overall_score", "scores"]:
            assert key in text

    def test_markdown_body_contains_rationales(self, tmp_path):
        """Rationale texts and a ``## <dimension>`` heading appear in the file."""
        ev = _sample_evaluation()
        p = tmp_path / "eval.md"
        write_entity_evaluation(ev, p)
        text = p.read_text()
        assert "Clear and specific." in text
        assert "Well grounded." in text
        assert "## definition_precision" in text

    def test_read_back_matches_original(self, tmp_path):
        """Scalar fields, notes and the score count survive a file round trip."""
        ev = _sample_evaluation()
        p = tmp_path / "eval.md"
        write_entity_evaluation(ev, p)
        restored = read_entity_evaluation(p)
        assert restored.entity_slug == ev.entity_slug
        assert restored.evaluator == ev.evaluator
        assert restored.evaluated_at == ev.evaluated_at
        assert restored.notes == ev.notes
        assert len(restored.scores) == len(ev.scores)

    def test_round_trip_preserves_scores(self, tmp_path):
        """Every score keeps its name, value and max_value after a file round trip."""
        ev = _sample_evaluation()
        p = tmp_path / "eval.md"
        write_entity_evaluation(ev, p)
        restored = read_entity_evaluation(p)
        # zip() stops at the shorter input, so without this guard an empty or
        # truncated ``restored.scores`` would let the loop below pass vacuously.
        assert len(restored.scores) == len(ev.scores)
        for orig, rest in zip(ev.scores, restored.scores):
            assert rest.name == orig.name
            assert rest.value == orig.value
            assert rest.max_value == orig.max_value

    def test_round_trip_preserves_rationales(self, tmp_path):
        """Rationales are read back per score; a missing one comes back as ''."""
        ev = _sample_evaluation()
        p = tmp_path / "eval.md"
        write_entity_evaluation(ev, p)
        restored = read_entity_evaluation(p)
        assert restored.scores[0].rationale == "Clear and specific."
        assert restored.scores[1].rationale == "Well grounded."
        # Third score has no rationale
        assert restored.scores[2].rationale == ""

    def test_write_creates_parent_dirs(self, tmp_path):
        """Writing to a path whose parent directories do not exist still succeeds."""
        ev = _sample_evaluation()
        p = tmp_path / "deep" / "nested" / "eval.md"
        write_entity_evaluation(ev, p)
        assert p.exists()
# ── Snapshot I/O ─────────────────────────────────────────────────────
class TestSnapshotIO:
    """Write/read tests for snapshot files."""

    @staticmethod
    def _round_trip(tmp_path):
        """Write the sample snapshot under ``tmp_path`` and read it back.

        Returns the ``(original, restored)`` pair.
        """
        original = _sample_snapshot()
        target = tmp_path / "snapshot.yaml"
        write_snapshot(original, target)
        return original, read_snapshot(target)

    def test_write_creates_file(self, tmp_path):
        """Writing a snapshot leaves a file at the target path."""
        target = tmp_path / "snapshot.yaml"
        write_snapshot(_sample_snapshot(), target)
        assert target.exists()

    def test_read_back_matches_original(self, tmp_path):
        """The four scalar snapshot fields survive a file round trip."""
        original, restored = self._round_trip(tmp_path)
        assert restored.snapshot_id == original.snapshot_id
        assert restored.created_at == original.created_at
        assert restored.schema_name == original.schema_name
        assert restored.entity_count == original.entity_count

    def test_round_trip_preserves_entity_evaluations(self, tmp_path):
        """The single entity evaluation comes back with its slug and three scores."""
        _, restored = self._round_trip(tmp_path)
        assert len(restored.entity_evaluations) == 1
        evaluation = restored.entity_evaluations[0]
        assert evaluation.entity_slug == "division-of-labour"
        assert len(evaluation.scores) == 3

    def test_round_trip_preserves_collection_metrics(self, tmp_path):
        """The single collection metric comes back with name, value and concern."""
        _, restored = self._round_trip(tmp_path)
        assert len(restored.collection_metrics) == 1
        metric = restored.collection_metrics[0]
        assert metric.name == "coverage_ratio"
        assert metric.value == 0.85
        assert metric.concern == "C2"
# ── History ──────────────────────────────────────────────────────────
class TestHistory:
    """Tests for appending snapshots to a history file and reading them back."""

    def test_append_creates_new_file(self, tmp_path):
        """Appending to a missing history file creates it with one entry."""
        history_path = tmp_path / "history.yaml"
        append_to_history(_sample_snapshot(), history_path)
        assert history_path.exists()
        entries = read_history(history_path)
        assert len(entries) == 1

    def test_append_adds_to_existing(self, tmp_path):
        """A second append keeps the first entry and adds the new one after it."""
        history_path = tmp_path / "history.yaml"
        for snapshot_id in ("snap-1", "snap-2"):
            append_to_history(_sample_snapshot(snapshot_id=snapshot_id), history_path)
        entries = read_history(history_path)
        assert len(entries) == 2
        assert entries[0].snapshot_id == "snap-1"
        assert entries[1].snapshot_id == "snap-2"

    def test_multiple_appends_all_preserved(self, tmp_path):
        """Five consecutive appends are all read back, in insertion order."""
        history_path = tmp_path / "history.yaml"
        expected_ids = [f"snap-{i}" for i in range(5)]
        for snapshot_id in expected_ids:
            append_to_history(_sample_snapshot(snapshot_id=snapshot_id), history_path)
        entries = read_history(history_path)
        assert len(entries) == 5
        assert [entry.snapshot_id for entry in entries] == expected_ids

    def test_read_history_returns_list_in_order(self, tmp_path):
        """Entries are returned in the order they were appended."""
        history_path = tmp_path / "history.yaml"
        first = _sample_snapshot(snapshot_id="a")
        second = _sample_snapshot(snapshot_id="b")
        for snapshot in (first, second):
            append_to_history(snapshot, history_path)
        entries = read_history(history_path)
        assert entries[0].snapshot_id == "a"
        assert entries[1].snapshot_id == "b"
# ── Diffing ──────────────────────────────────────────────────────────
class TestDiffSnapshots:
    """Tests for ``diff_snapshots`` and the summary text of its result."""

    def test_identical_snapshots_empty_diff(self):
        """Two equal-valued snapshots produce a diff with no entries at all."""
        # Build the two sides independently: passing one object as both
        # arguments would only exercise self-comparison, which an
        # identity-based diff would also satisfy.
        before = _sample_snapshot()
        after = _sample_snapshot()
        diff = diff_snapshots(before, after)
        assert diff.added_entities == []
        assert diff.removed_entities == []
        assert diff.score_changes == []
        assert diff.metric_changes == []

    def test_added_entity(self):
        """An entity present only in the later snapshot is reported as added."""
        before = _sample_snapshot(entity_evaluations=[])
        after = _sample_snapshot()
        diff = diff_snapshots(before, after)
        assert "division-of-labour" in diff.added_entities
        assert diff.removed_entities == []

    def test_removed_entity(self):
        """An entity present only in the earlier snapshot is reported as removed."""
        before = _sample_snapshot()
        after = _sample_snapshot(entity_evaluations=[])
        diff = diff_snapshots(before, after)
        assert "division-of-labour" in diff.removed_entities
        assert diff.added_entities == []

    def test_changed_score(self):
        """A changed score yields one change with slug, dimension and both values."""
        ev_before = _sample_evaluation(scores=[ScoreEntry("precision", 4.0)])
        ev_after = _sample_evaluation(scores=[ScoreEntry("precision", 4.8)])
        before = _sample_snapshot(entity_evaluations=[ev_before])
        after = _sample_snapshot(entity_evaluations=[ev_after])
        diff = diff_snapshots(before, after)
        assert len(diff.score_changes) == 1
        sc = diff.score_changes[0]
        assert sc.entity_slug == "division-of-labour"
        assert sc.dimension == "precision"
        assert sc.before == 4.0
        assert sc.after == 4.8

    def test_changed_metric(self):
        """A changed collection metric yields one change with name and both values."""
        before = _sample_snapshot(
            collection_metrics=[MetricValue("coverage_ratio", 0.80)]
        )
        after = _sample_snapshot(
            collection_metrics=[MetricValue("coverage_ratio", 0.90)]
        )
        diff = diff_snapshots(before, after)
        assert len(diff.metric_changes) == 1
        mc = diff.metric_changes[0]
        assert mc.name == "coverage_ratio"
        assert mc.before == 0.80
        assert mc.after == 0.90

    def test_summary_readable(self):
        """The summary names both snapshot ids, the score dimension and the metric."""
        ev_before = _sample_evaluation(scores=[ScoreEntry("precision", 4.0)])
        ev_after = _sample_evaluation(scores=[ScoreEntry("precision", 4.8)])
        before = _sample_snapshot(
            snapshot_id="snap-1",
            entity_evaluations=[ev_before],
            collection_metrics=[MetricValue("coverage", 0.80)],
        )
        after = _sample_snapshot(
            snapshot_id="snap-2",
            entity_evaluations=[ev_after],
            collection_metrics=[MetricValue("coverage", 0.90)],
        )
        diff = diff_snapshots(before, after)
        text = diff.summary()
        assert "snap-1" in text
        assert "snap-2" in text
        assert "precision" in text
        assert "coverage" in text

    def test_summary_no_changes(self):
        """The summary of an empty diff contains the phrase 'No changes'."""
        # Independently built, equal-valued snapshots rather than one
        # aliased object, so value comparison is what gets exercised.
        before = _sample_snapshot()
        after = _sample_snapshot()
        diff = diff_snapshots(before, after)
        text = diff.summary()
        assert "No changes" in text