"""Tests for markitect.infospace evaluation models and I/O.""" from datetime import datetime from pathlib import Path import pytest from markitect.infospace import ( EntityEvaluation, EvaluationSnapshot, MetricChange, MetricValue, ScoreChange, ScoreEntry, SnapshotDiff, append_to_history, diff_snapshots, read_entity_evaluation, read_history, read_snapshot, write_entity_evaluation, write_snapshot, ) # ── Helpers ────────────────────────────────────────────────────────── _NOW = datetime(2026, 2, 19, 12, 0, 0) def _sample_scores() -> list: return [ ScoreEntry("definition_precision", 4.5, rationale="Clear and specific."), ScoreEntry("source_grounding", 4.0, rationale="Well grounded."), ScoreEntry("domain_relevance", 4.5), ] def _sample_evaluation(**overrides) -> EntityEvaluation: defaults = dict( entity_slug="division-of-labour", evaluator="openrouter/anthropic/claude-3.5-sonnet", scores=_sample_scores(), evaluated_at=_NOW, notes=["Strong entity with clear provenance"], ) defaults.update(overrides) return EntityEvaluation(**defaults) def _sample_metric() -> MetricValue: return MetricValue("coverage_ratio", 0.85, concern="C2", details={"checked": 85}) def _sample_snapshot(**overrides) -> EvaluationSnapshot: defaults = dict( snapshot_id="2026-02-19", created_at=_NOW, schema_name="Economic Entity", entity_count=1, entity_evaluations=[_sample_evaluation()], collection_metrics=[_sample_metric()], metadata={"version": "1.0"}, ) defaults.update(overrides) return EvaluationSnapshot(**defaults) # ── Model tests ────────────────────────────────────────────────────── class TestScoreEntry: def test_to_dict_from_dict_round_trip(self): se = ScoreEntry("precision", 4.5, 5.0, "Good definition.") d = se.to_dict() restored = ScoreEntry.from_dict(d) assert restored.name == se.name assert restored.value == se.value assert restored.max_value == se.max_value assert restored.rationale == se.rationale def test_to_dict_omits_empty_rationale(self): se = ScoreEntry("precision", 4.5) d = se.to_dict() assert "rationale" not in d def test_from_dict_defaults(self): se = ScoreEntry.from_dict({"name": "x", "value": 3.0}) assert se.max_value == 5.0 assert se.rationale == "" class TestEntityEvaluation: def test_overall_score_is_mean(self): ev = _sample_evaluation() # (4.5 + 4.0 + 4.5) / 3 ≈ 4.333 assert abs(ev.overall_score - 4.333333) < 0.001 def test_overall_score_zero_scores(self): ev = _sample_evaluation(scores=[]) assert ev.overall_score == 0.0 def test_to_dict_from_dict_round_trip(self): ev = _sample_evaluation() d = ev.to_dict() restored = EntityEvaluation.from_dict(d) assert restored.entity_slug == ev.entity_slug assert restored.evaluator == ev.evaluator assert len(restored.scores) == len(ev.scores) assert restored.evaluated_at == ev.evaluated_at assert restored.notes == ev.notes def test_to_dict_includes_overall_score(self): ev = _sample_evaluation() d = ev.to_dict() assert "overall_score" in d assert abs(d["overall_score"] - 4.3333) < 0.01 class TestMetricValue: def test_to_dict_from_dict_round_trip(self): mv = _sample_metric() d = mv.to_dict() restored = MetricValue.from_dict(d) assert restored.name == mv.name assert restored.value == mv.value assert restored.concern == mv.concern assert restored.details == mv.details def test_to_dict_omits_empty_concern(self): mv = MetricValue("x", 1.0) d = mv.to_dict() assert "concern" not in d assert "details" not in d class TestEvaluationSnapshot: def test_to_dict_from_dict_round_trip(self): snap = _sample_snapshot() d = snap.to_dict() restored = EvaluationSnapshot.from_dict(d) assert restored.snapshot_id == snap.snapshot_id assert restored.created_at == snap.created_at assert restored.schema_name == snap.schema_name assert restored.entity_count == snap.entity_count assert len(restored.entity_evaluations) == 1 assert len(restored.collection_metrics) == 1 assert restored.metadata == snap.metadata def test_from_dict_empty_lists(self): d = { "snapshot_id": "test", "created_at": _NOW.isoformat(), "schema_name": "Test", "entity_count": 0, } snap = EvaluationSnapshot.from_dict(d) assert snap.entity_evaluations == [] assert snap.collection_metrics == [] assert snap.metadata == {} # ── Per-entity file I/O ────────────────────────────────────────────── class TestEntityEvaluationIO: def test_write_creates_file(self, tmp_path): ev = _sample_evaluation() p = tmp_path / "eval.md" write_entity_evaluation(ev, p) assert p.exists() def test_file_has_yaml_frontmatter(self, tmp_path): ev = _sample_evaluation() p = tmp_path / "eval.md" write_entity_evaluation(ev, p) text = p.read_text() assert text.startswith("---\n") assert "\n---\n" in text def test_frontmatter_contains_expected_keys(self, tmp_path): ev = _sample_evaluation() p = tmp_path / "eval.md" write_entity_evaluation(ev, p) text = p.read_text() for key in ["entity_slug", "evaluator", "evaluated_at", "overall_score", "scores"]: assert key in text def test_markdown_body_contains_rationales(self, tmp_path): ev = _sample_evaluation() p = tmp_path / "eval.md" write_entity_evaluation(ev, p) text = p.read_text() assert "Clear and specific." in text assert "Well grounded." in text assert "## definition_precision" in text def test_read_back_matches_original(self, tmp_path): ev = _sample_evaluation() p = tmp_path / "eval.md" write_entity_evaluation(ev, p) restored = read_entity_evaluation(p) assert restored.entity_slug == ev.entity_slug assert restored.evaluator == ev.evaluator assert restored.evaluated_at == ev.evaluated_at assert restored.notes == ev.notes assert len(restored.scores) == len(ev.scores) def test_round_trip_preserves_scores(self, tmp_path): ev = _sample_evaluation() p = tmp_path / "eval.md" write_entity_evaluation(ev, p) restored = read_entity_evaluation(p) for orig, rest in zip(ev.scores, restored.scores): assert rest.name == orig.name assert rest.value == orig.value assert rest.max_value == orig.max_value def test_round_trip_preserves_rationales(self, tmp_path): ev = _sample_evaluation() p = tmp_path / "eval.md" write_entity_evaluation(ev, p) restored = read_entity_evaluation(p) assert restored.scores[0].rationale == "Clear and specific." assert restored.scores[1].rationale == "Well grounded." # Third score has no rationale assert restored.scores[2].rationale == "" def test_write_creates_parent_dirs(self, tmp_path): ev = _sample_evaluation() p = tmp_path / "deep" / "nested" / "eval.md" write_entity_evaluation(ev, p) assert p.exists() # ── Snapshot I/O ───────────────────────────────────────────────────── class TestSnapshotIO: def test_write_creates_file(self, tmp_path): snap = _sample_snapshot() p = tmp_path / "snapshot.yaml" write_snapshot(snap, p) assert p.exists() def test_read_back_matches_original(self, tmp_path): snap = _sample_snapshot() p = tmp_path / "snapshot.yaml" write_snapshot(snap, p) restored = read_snapshot(p) assert restored.snapshot_id == snap.snapshot_id assert restored.created_at == snap.created_at assert restored.schema_name == snap.schema_name assert restored.entity_count == snap.entity_count def test_round_trip_preserves_entity_evaluations(self, tmp_path): snap = _sample_snapshot() p = tmp_path / "snapshot.yaml" write_snapshot(snap, p) restored = read_snapshot(p) assert len(restored.entity_evaluations) == 1 ev = restored.entity_evaluations[0] assert ev.entity_slug == "division-of-labour" assert len(ev.scores) == 3 def test_round_trip_preserves_collection_metrics(self, tmp_path): snap = _sample_snapshot() p = tmp_path / "snapshot.yaml" write_snapshot(snap, p) restored = read_snapshot(p) assert len(restored.collection_metrics) == 1 m = restored.collection_metrics[0] assert m.name == "coverage_ratio" assert m.value == 0.85 assert m.concern == "C2" # ── History ────────────────────────────────────────────────────────── class TestHistory: def test_append_creates_new_file(self, tmp_path): snap = _sample_snapshot() hp = tmp_path / "history.yaml" append_to_history(snap, hp) assert hp.exists() history = read_history(hp) assert len(history) == 1 def test_append_adds_to_existing(self, tmp_path): hp = tmp_path / "history.yaml" snap1 = _sample_snapshot(snapshot_id="snap-1") snap2 = _sample_snapshot(snapshot_id="snap-2") append_to_history(snap1, hp) append_to_history(snap2, hp) history = read_history(hp) assert len(history) == 2 assert history[0].snapshot_id == "snap-1" assert history[1].snapshot_id == "snap-2" def test_multiple_appends_all_preserved(self, tmp_path): hp = tmp_path / "history.yaml" for i in range(5): snap = _sample_snapshot(snapshot_id=f"snap-{i}") append_to_history(snap, hp) history = read_history(hp) assert len(history) == 5 assert [h.snapshot_id for h in history] == [f"snap-{i}" for i in range(5)] def test_read_history_returns_list_in_order(self, tmp_path): hp = tmp_path / "history.yaml" snap_a = _sample_snapshot(snapshot_id="a") snap_b = _sample_snapshot(snapshot_id="b") append_to_history(snap_a, hp) append_to_history(snap_b, hp) history = read_history(hp) assert history[0].snapshot_id == "a" assert history[1].snapshot_id == "b" # ── Diffing ────────────────────────────────────────────────────────── class TestDiffSnapshots: def test_identical_snapshots_empty_diff(self): snap = _sample_snapshot() diff = diff_snapshots(snap, snap) assert diff.added_entities == [] assert diff.removed_entities == [] assert diff.score_changes == [] assert diff.metric_changes == [] def test_added_entity(self): before = _sample_snapshot(entity_evaluations=[]) after = _sample_snapshot() diff = diff_snapshots(before, after) assert "division-of-labour" in diff.added_entities assert diff.removed_entities == [] def test_removed_entity(self): before = _sample_snapshot() after = _sample_snapshot(entity_evaluations=[]) diff = diff_snapshots(before, after) assert "division-of-labour" in diff.removed_entities assert diff.added_entities == [] def test_changed_score(self): ev_before = _sample_evaluation(scores=[ScoreEntry("precision", 4.0)]) ev_after = _sample_evaluation(scores=[ScoreEntry("precision", 4.8)]) before = _sample_snapshot(entity_evaluations=[ev_before]) after = _sample_snapshot(entity_evaluations=[ev_after]) diff = diff_snapshots(before, after) assert len(diff.score_changes) == 1 sc = diff.score_changes[0] assert sc.entity_slug == "division-of-labour" assert sc.dimension == "precision" assert sc.before == 4.0 assert sc.after == 4.8 def test_changed_metric(self): before = _sample_snapshot( collection_metrics=[MetricValue("coverage_ratio", 0.80)] ) after = _sample_snapshot( collection_metrics=[MetricValue("coverage_ratio", 0.90)] ) diff = diff_snapshots(before, after) assert len(diff.metric_changes) == 1 mc = diff.metric_changes[0] assert mc.name == "coverage_ratio" assert mc.before == 0.80 assert mc.after == 0.90 def test_summary_readable(self): ev_before = _sample_evaluation(scores=[ScoreEntry("precision", 4.0)]) ev_after = _sample_evaluation(scores=[ScoreEntry("precision", 4.8)]) before = _sample_snapshot( snapshot_id="snap-1", entity_evaluations=[ev_before], collection_metrics=[MetricValue("coverage", 0.80)], ) after = _sample_snapshot( snapshot_id="snap-2", entity_evaluations=[ev_after], collection_metrics=[MetricValue("coverage", 0.90)], ) diff = diff_snapshots(before, after) text = diff.summary() assert "snap-1" in text assert "snap-2" in text assert "precision" in text assert "coverage" in text def test_summary_no_changes(self): snap = _sample_snapshot() diff = diff_snapshots(snap, snap) text = diff.summary() assert "No changes" in text