Add data models (ScoreEntry, EntityEvaluation, EvaluationSnapshot, SnapshotDiff) and I/O utilities for YAML frontmatter evaluation files, snapshot persistence, history append, and snapshot diffing.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
399 lines
14 KiB
Python
399 lines
14 KiB
Python
"""Tests for markitect.infospace evaluation models and I/O."""
|
|
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
|
|
from markitect.infospace import (
|
|
EntityEvaluation,
|
|
EvaluationSnapshot,
|
|
MetricChange,
|
|
MetricValue,
|
|
ScoreChange,
|
|
ScoreEntry,
|
|
SnapshotDiff,
|
|
append_to_history,
|
|
diff_snapshots,
|
|
read_entity_evaluation,
|
|
read_history,
|
|
read_snapshot,
|
|
write_entity_evaluation,
|
|
write_snapshot,
|
|
)
|
|
|
|
|
|
# ── Helpers ──────────────────────────────────────────────────────────
|
|
|
|
_NOW = datetime(2026, 2, 19, 12, 0, 0)
|
|
|
|
|
|
def _sample_scores() -> list:
    """Return the three ``ScoreEntry`` fixtures used by the sample evaluation.

    The first two entries carry a rationale; the third is created without
    one, which the rationale round-trip test relies on.
    """
    precision = ScoreEntry("definition_precision", 4.5, rationale="Clear and specific.")
    grounding = ScoreEntry("source_grounding", 4.0, rationale="Well grounded.")
    relevance = ScoreEntry("domain_relevance", 4.5)
    return [precision, grounding, relevance]
|
|
|
|
|
|
def _sample_evaluation(**overrides) -> EntityEvaluation:
    """Build a fixture ``EntityEvaluation`` for the "division-of-labour" entity.

    Args:
        **overrides: Constructor keyword arguments that replace (or extend)
            the fixture defaults.

    Returns:
        An ``EntityEvaluation`` built from the merged keyword arguments.
    """
    kwargs = {
        "entity_slug": "division-of-labour",
        "evaluator": "openrouter/anthropic/claude-3.5-sonnet",
        "scores": _sample_scores(),
        "evaluated_at": _NOW,
        "notes": ["Strong entity with clear provenance"],
        # Caller-supplied values take precedence over the defaults above.
        **overrides,
    }
    return EntityEvaluation(**kwargs)
|
|
|
|
|
|
def _sample_metric() -> MetricValue:
    """Return a fixture ``MetricValue`` with both a concern and details set."""
    return MetricValue(
        "coverage_ratio",
        0.85,
        concern="C2",
        details={"checked": 85},
    )
|
|
|
|
|
|
def _sample_snapshot(**overrides) -> EvaluationSnapshot:
    """Build a fixture ``EvaluationSnapshot`` holding one evaluation and one metric.

    Args:
        **overrides: Constructor keyword arguments that replace (or extend)
            the fixture defaults.

    Returns:
        An ``EvaluationSnapshot`` built from the merged keyword arguments.
    """
    kwargs = {
        "snapshot_id": "2026-02-19",
        "created_at": _NOW,
        "schema_name": "Economic Entity",
        "entity_count": 1,
        "entity_evaluations": [_sample_evaluation()],
        "collection_metrics": [_sample_metric()],
        "metadata": {"version": "1.0"},
        # Caller-supplied values take precedence over the defaults above.
        **overrides,
    }
    return EvaluationSnapshot(**kwargs)
|
|
|
|
|
|
# ── Model tests ──────────────────────────────────────────────────────
|
|
|
|
|
|
class TestScoreEntry:
    """Dictionary serialisation of ``ScoreEntry``."""

    def test_to_dict_from_dict_round_trip(self):
        """A fully populated entry survives ``to_dict`` followed by ``from_dict``."""
        original = ScoreEntry("precision", 4.5, 5.0, "Good definition.")
        rebuilt = ScoreEntry.from_dict(original.to_dict())
        for field in ("name", "value", "max_value", "rationale"):
            assert getattr(rebuilt, field) == getattr(original, field)

    def test_to_dict_omits_empty_rationale(self):
        """An entry created without a rationale has no ``rationale`` key."""
        payload = ScoreEntry("precision", 4.5).to_dict()
        assert "rationale" not in payload

    def test_from_dict_defaults(self):
        """Missing keys fall back to ``max_value == 5.0`` and an empty rationale."""
        entry = ScoreEntry.from_dict({"name": "x", "value": 3.0})
        assert entry.max_value == 5.0
        assert entry.rationale == ""
|
|
|
|
|
|
class TestEntityEvaluation:
    """Behaviour of ``EntityEvaluation``: overall score and dict round trip."""

    def test_overall_score_is_mean(self):
        """``overall_score`` is the arithmetic mean of the score values."""
        ev = _sample_evaluation()
        # (4.5 + 4.0 + 4.5) / 3 ≈ 4.333
        assert ev.overall_score == pytest.approx(4.333333, abs=0.001)

    def test_overall_score_zero_scores(self):
        """With no scores at all, ``overall_score`` is 0.0."""
        ev = _sample_evaluation(scores=[])
        assert ev.overall_score == 0.0

    def test_to_dict_from_dict_round_trip(self):
        """Identity fields, timestamp, notes and score count survive a round trip."""
        ev = _sample_evaluation()
        d = ev.to_dict()
        restored = EntityEvaluation.from_dict(d)
        assert restored.entity_slug == ev.entity_slug
        assert restored.evaluator == ev.evaluator
        assert len(restored.scores) == len(ev.scores)
        assert restored.evaluated_at == ev.evaluated_at
        assert restored.notes == ev.notes

    def test_to_dict_includes_overall_score(self):
        """``to_dict`` exposes ``overall_score`` next to the raw scores."""
        ev = _sample_evaluation()
        d = ev.to_dict()
        assert "overall_score" in d
        # Looser tolerance than the attribute check above, as in the
        # original assertion.
        assert d["overall_score"] == pytest.approx(4.3333, abs=0.01)
|
|
|
|
|
|
class TestMetricValue:
    """Dictionary serialisation of ``MetricValue``."""

    def test_to_dict_from_dict_round_trip(self):
        """Name, value, concern and details survive ``to_dict`` -> ``from_dict``."""
        original = _sample_metric()
        rebuilt = MetricValue.from_dict(original.to_dict())
        for field in ("name", "value", "concern", "details"):
            assert getattr(rebuilt, field) == getattr(original, field)

    def test_to_dict_omits_empty_concern(self):
        """A metric created with only name and value omits the optional keys."""
        payload = MetricValue("x", 1.0).to_dict()
        assert "concern" not in payload
        assert "details" not in payload
|
|
|
|
|
|
class TestEvaluationSnapshot:
    """Dictionary serialisation of ``EvaluationSnapshot``."""

    def test_to_dict_from_dict_round_trip(self):
        """Scalar fields, metadata and list lengths survive a dict round trip."""
        original = _sample_snapshot()
        rebuilt = EvaluationSnapshot.from_dict(original.to_dict())
        for field in ("snapshot_id", "created_at", "schema_name", "entity_count"):
            assert getattr(rebuilt, field) == getattr(original, field)
        assert len(rebuilt.entity_evaluations) == 1
        assert len(rebuilt.collection_metrics) == 1
        assert rebuilt.metadata == original.metadata

    def test_from_dict_empty_lists(self):
        """Absent collection keys become empty containers."""
        minimal = {
            "snapshot_id": "test",
            "created_at": _NOW.isoformat(),
            "schema_name": "Test",
            "entity_count": 0,
        }
        snapshot = EvaluationSnapshot.from_dict(minimal)
        assert snapshot.entity_evaluations == []
        assert snapshot.collection_metrics == []
        assert snapshot.metadata == {}
|
|
|
|
|
|
# ── Per-entity file I/O ──────────────────────────────────────────────
|
|
|
|
|
|
class TestEntityEvaluationIO:
    """Write/read tests for per-entity evaluation files.

    The file format checked here is YAML frontmatter followed by a Markdown
    body that carries the score rationales.
    """

    @staticmethod
    def _write_sample(path):
        """Write the sample evaluation to *path* and return that evaluation."""
        ev = _sample_evaluation()
        write_entity_evaluation(ev, path)
        return ev

    def test_write_creates_file(self, tmp_path):
        """Writing an evaluation produces a file at the target path."""
        p = tmp_path / "eval.md"
        self._write_sample(p)
        assert p.exists()

    def test_file_has_yaml_frontmatter(self, tmp_path):
        """The file opens with a ``---`` fence and contains a closing fence."""
        p = tmp_path / "eval.md"
        self._write_sample(p)
        text = p.read_text()
        assert text.startswith("---\n")
        assert "\n---\n" in text

    def test_frontmatter_contains_expected_keys(self, tmp_path):
        """Every expected key name appears in the written file."""
        p = tmp_path / "eval.md"
        self._write_sample(p)
        text = p.read_text()
        for key in ["entity_slug", "evaluator", "evaluated_at", "overall_score", "scores"]:
            assert key in text

    def test_markdown_body_contains_rationales(self, tmp_path):
        """Rationales and a per-dimension heading are present in the file."""
        p = tmp_path / "eval.md"
        self._write_sample(p)
        text = p.read_text()
        assert "Clear and specific." in text
        assert "Well grounded." in text
        assert "## definition_precision" in text

    def test_read_back_matches_original(self, tmp_path):
        """Identity fields, timestamp, notes and score count survive write/read."""
        p = tmp_path / "eval.md"
        ev = self._write_sample(p)
        restored = read_entity_evaluation(p)
        assert restored.entity_slug == ev.entity_slug
        assert restored.evaluator == ev.evaluator
        assert restored.evaluated_at == ev.evaluated_at
        assert restored.notes == ev.notes
        assert len(restored.scores) == len(ev.scores)

    def test_round_trip_preserves_scores(self, tmp_path):
        """Each score keeps its name, value and max_value after write/read."""
        p = tmp_path / "eval.md"
        ev = self._write_sample(p)
        restored = read_entity_evaluation(p)
        # zip() stops at the shorter input, so without this check a reader
        # that dropped scores would let the loop below pass vacuously.
        assert len(restored.scores) == len(ev.scores)
        for orig, rest in zip(ev.scores, restored.scores):
            assert rest.name == orig.name
            assert rest.value == orig.value
            assert rest.max_value == orig.max_value

    def test_round_trip_preserves_rationales(self, tmp_path):
        """Rationales are restored per score, including the empty one."""
        p = tmp_path / "eval.md"
        self._write_sample(p)
        restored = read_entity_evaluation(p)
        assert restored.scores[0].rationale == "Clear and specific."
        assert restored.scores[1].rationale == "Well grounded."
        # Third score has no rationale
        assert restored.scores[2].rationale == ""

    def test_write_creates_parent_dirs(self, tmp_path):
        """Missing parent directories are created by the writer."""
        p = tmp_path / "deep" / "nested" / "eval.md"
        self._write_sample(p)
        assert p.exists()
|
|
|
|
|
|
# ── Snapshot I/O ─────────────────────────────────────────────────────
|
|
|
|
|
|
class TestSnapshotIO:
    """Write/read tests for snapshot files."""

    def test_write_creates_file(self, tmp_path):
        """Writing a snapshot produces a file at the target path."""
        target = tmp_path / "snapshot.yaml"
        write_snapshot(_sample_snapshot(), target)
        assert target.exists()

    def test_read_back_matches_original(self, tmp_path):
        """Scalar snapshot fields survive write/read."""
        target = tmp_path / "snapshot.yaml"
        original = _sample_snapshot()
        write_snapshot(original, target)
        loaded = read_snapshot(target)
        for field in ("snapshot_id", "created_at", "schema_name", "entity_count"):
            assert getattr(loaded, field) == getattr(original, field)

    def test_round_trip_preserves_entity_evaluations(self, tmp_path):
        """The single entity evaluation comes back with its slug and three scores."""
        target = tmp_path / "snapshot.yaml"
        write_snapshot(_sample_snapshot(), target)
        loaded = read_snapshot(target)
        assert len(loaded.entity_evaluations) == 1
        evaluation = loaded.entity_evaluations[0]
        assert evaluation.entity_slug == "division-of-labour"
        assert len(evaluation.scores) == 3

    def test_round_trip_preserves_collection_metrics(self, tmp_path):
        """The single collection metric comes back with name, value and concern."""
        target = tmp_path / "snapshot.yaml"
        write_snapshot(_sample_snapshot(), target)
        loaded = read_snapshot(target)
        assert len(loaded.collection_metrics) == 1
        metric = loaded.collection_metrics[0]
        assert metric.name == "coverage_ratio"
        assert metric.value == 0.85
        assert metric.concern == "C2"
|
|
|
|
|
|
# ── History ──────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestHistory:
    """Tests for appending snapshots to a history file and reading them back."""

    def test_append_creates_new_file(self, tmp_path):
        """Appending to a missing history file creates it with one entry."""
        history_file = tmp_path / "history.yaml"
        append_to_history(_sample_snapshot(), history_file)
        assert history_file.exists()
        entries = read_history(history_file)
        assert len(entries) == 1

    def test_append_adds_to_existing(self, tmp_path):
        """A second append keeps the first entry and adds the new one after it."""
        history_file = tmp_path / "history.yaml"
        first = _sample_snapshot(snapshot_id="snap-1")
        second = _sample_snapshot(snapshot_id="snap-2")
        for snapshot in (first, second):
            append_to_history(snapshot, history_file)
        entries = read_history(history_file)
        assert len(entries) == 2
        assert entries[0].snapshot_id == "snap-1"
        assert entries[1].snapshot_id == "snap-2"

    def test_multiple_appends_all_preserved(self, tmp_path):
        """Five consecutive appends are all kept, in append order."""
        history_file = tmp_path / "history.yaml"
        expected_ids = [f"snap-{i}" for i in range(5)]
        for snapshot_id in expected_ids:
            append_to_history(_sample_snapshot(snapshot_id=snapshot_id), history_file)
        entries = read_history(history_file)
        assert len(entries) == 5
        assert [entry.snapshot_id for entry in entries] == expected_ids

    def test_read_history_returns_list_in_order(self, tmp_path):
        """``read_history`` yields entries in the order they were appended."""
        history_file = tmp_path / "history.yaml"
        first = _sample_snapshot(snapshot_id="a")
        second = _sample_snapshot(snapshot_id="b")
        append_to_history(first, history_file)
        append_to_history(second, history_file)
        entries = read_history(history_file)
        assert entries[0].snapshot_id == "a"
        assert entries[1].snapshot_id == "b"
|
|
|
|
|
|
# ── Diffing ──────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestDiffSnapshots:
    """Tests for ``diff_snapshots`` and the ``summary()`` of its result."""

    def test_identical_snapshots_empty_diff(self):
        """Diffing a snapshot against itself reports nothing."""
        snapshot = _sample_snapshot()
        result = diff_snapshots(snapshot, snapshot)
        assert result.added_entities == []
        assert result.removed_entities == []
        assert result.score_changes == []
        assert result.metric_changes == []

    def test_added_entity(self):
        """An entity present only in the newer snapshot is reported as added."""
        older = _sample_snapshot(entity_evaluations=[])
        newer = _sample_snapshot()
        result = diff_snapshots(older, newer)
        assert "division-of-labour" in result.added_entities
        assert result.removed_entities == []

    def test_removed_entity(self):
        """An entity present only in the older snapshot is reported as removed."""
        older = _sample_snapshot()
        newer = _sample_snapshot(entity_evaluations=[])
        result = diff_snapshots(older, newer)
        assert "division-of-labour" in result.removed_entities
        assert result.added_entities == []

    def test_changed_score(self):
        """A changed score value yields one score change with before/after."""
        older = _sample_snapshot(
            entity_evaluations=[_sample_evaluation(scores=[ScoreEntry("precision", 4.0)])]
        )
        newer = _sample_snapshot(
            entity_evaluations=[_sample_evaluation(scores=[ScoreEntry("precision", 4.8)])]
        )
        result = diff_snapshots(older, newer)
        assert len(result.score_changes) == 1
        change = result.score_changes[0]
        assert change.entity_slug == "division-of-labour"
        assert change.dimension == "precision"
        assert change.before == 4.0
        assert change.after == 4.8

    def test_changed_metric(self):
        """A changed collection metric yields one metric change with before/after."""
        older = _sample_snapshot(collection_metrics=[MetricValue("coverage_ratio", 0.80)])
        newer = _sample_snapshot(collection_metrics=[MetricValue("coverage_ratio", 0.90)])
        result = diff_snapshots(older, newer)
        assert len(result.metric_changes) == 1
        change = result.metric_changes[0]
        assert change.name == "coverage_ratio"
        assert change.before == 0.80
        assert change.after == 0.90

    def test_summary_readable(self):
        """The summary names both snapshot ids, the changed score and the metric."""
        older_eval = _sample_evaluation(scores=[ScoreEntry("precision", 4.0)])
        newer_eval = _sample_evaluation(scores=[ScoreEntry("precision", 4.8)])
        older = _sample_snapshot(
            snapshot_id="snap-1",
            entity_evaluations=[older_eval],
            collection_metrics=[MetricValue("coverage", 0.80)],
        )
        newer = _sample_snapshot(
            snapshot_id="snap-2",
            entity_evaluations=[newer_eval],
            collection_metrics=[MetricValue("coverage", 0.90)],
        )
        report = diff_snapshots(older, newer).summary()
        for fragment in ("snap-1", "snap-2", "precision", "coverage"):
            assert fragment in report

    def test_summary_no_changes(self):
        """The summary of an empty diff contains "No changes"."""
        snapshot = _sample_snapshot()
        report = diff_snapshots(snapshot, snapshot).summary()
        assert "No changes" in report
|