from datetime import datetime, timezone

from infospace_bench.evaluation import (
    EntityEvaluation,
    EvaluationSnapshot,
    MetricValue,
    ScoreEntry,
    diff_snapshots,
)


def _quality_evaluation(
    artifact_id: str, quality: int, evaluated_at: datetime
) -> EntityEvaluation:
    """Build an evaluation by evaluator "test" holding one "quality" score."""
    return EntityEvaluation(
        artifact_id=artifact_id,
        evaluator="test",
        scores=[ScoreEntry("quality", quality)],
        evaluated_at=evaluated_at,
    )


def test_entity_evaluation_round_trips_and_computes_overall_score() -> None:
    """Check the to_dict/from_dict round trip and the serialized overall score.

    An evaluation with scores 4 and 5 must serialize with an
    ``overall_score`` of 4.5, and rebuilding it from that payload must
    compare equal to the original object.
    """
    timestamp = datetime(2026, 5, 14, tzinfo=timezone.utc)
    score_entries = [
        ScoreEntry("definition_precision", 4),
        ScoreEntry("provenance_quality", 5),
    ]
    original = EntityEvaluation(
        artifact_id="source/chapter.md",
        evaluator="test",
        scores=score_entries,
        evaluated_at=timestamp,
        notes=["clear enough"],
    )

    serialized = original.to_dict()
    restored = EntityEvaluation.from_dict(serialized)

    assert serialized["overall_score"] == 4.5
    assert restored == original


def test_snapshot_diff_reports_added_removed_score_and_metric_changes() -> None:
    """Check that diffing two snapshots reports every kind of change.

    The "after" snapshot raises the score of ``source/a.md`` from 3 to 4,
    introduces ``source/b.md`` and lifts ``coverage_ratio`` from 0.5 to
    0.75; nothing is removed.
    """
    timestamp = datetime(2026, 5, 14, tzinfo=timezone.utc)

    older = EvaluationSnapshot(
        snapshot_id="before",
        created_at=timestamp,
        schema_name="baseline",
        artifact_count=1,
        artifact_evaluations=[
            _quality_evaluation("source/a.md", 3, timestamp),
        ],
        collection_metrics=[MetricValue("coverage_ratio", 0.5)],
    )
    newer = EvaluationSnapshot(
        snapshot_id="after",
        created_at=timestamp,
        schema_name="baseline",
        artifact_count=1,
        artifact_evaluations=[
            _quality_evaluation("source/a.md", 4, timestamp),
            _quality_evaluation("source/b.md", 5, timestamp),
        ],
        collection_metrics=[MetricValue("coverage_ratio", 0.75)],
    )

    changes = diff_snapshots(older, newer)

    assert changes.added_artifacts == ["source/b.md"]
    assert changes.removed_artifacts == []

    # Only the first reported entry of each change list is inspected.
    first_score_change = changes.score_changes[0]
    assert first_score_change.artifact_id == "source/a.md"
    assert first_score_change.delta == 1

    first_metric_change = changes.metric_changes[0]
    assert first_metric_change.name == "coverage_ratio"
    assert first_metric_change.delta == 0.25