generated from coulomb/repo-seed
Initial implementation
This commit is contained in:
78
tests/test_evaluation.py
Normal file
78
tests/test_evaluation.py
Normal file
@@ -0,0 +1,78 @@
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from infospace_bench.evaluation import (
|
||||
EntityEvaluation,
|
||||
EvaluationSnapshot,
|
||||
MetricValue,
|
||||
ScoreEntry,
|
||||
diff_snapshots,
|
||||
)
|
||||
|
||||
|
||||
def test_entity_evaluation_round_trips_and_computes_overall_score() -> None:
    """Check that an EntityEvaluation survives to_dict/from_dict unchanged.

    Also checks that the serialised payload carries an ``overall_score`` of
    4.5 for an evaluation holding the scores 4 and 5.
    """
    timestamp = datetime(2026, 5, 14, tzinfo=timezone.utc)
    score_entries = [
        ScoreEntry("definition_precision", 4),
        ScoreEntry("provenance_quality", 5),
    ]
    original = EntityEvaluation(
        artifact_id="source/chapter.md",
        evaluator="test",
        scores=score_entries,
        evaluated_at=timestamp,
        notes=["clear enough"],
    )

    # Serialise and immediately rebuild before asserting anything, so both
    # directions of the conversion run even if the first assertion fails.
    serialised = original.to_dict()
    restored = EntityEvaluation.from_dict(serialised)

    # presumably the overall score is the mean of the entries -- verify
    # against the EntityEvaluation implementation.
    assert serialised["overall_score"] == 4.5
    assert restored == original
|
||||
|
||||
|
||||
def test_snapshot_diff_reports_added_removed_score_and_metric_changes() -> None:
    """Diff two snapshots and check every category of reported change.

    The "before" snapshot evaluates ``source/a.md`` with quality 3 and a
    coverage ratio of 0.5; the "after" snapshot raises that quality to 4,
    adds ``source/b.md`` with quality 5, and reports a coverage ratio of 0.75.
    """
    timestamp = datetime(2026, 5, 14, tzinfo=timezone.utc)

    def evaluation_for(artifact_id: str, quality: int) -> EntityEvaluation:
        # Every evaluation in this test shares the evaluator, the timestamp,
        # and a single "quality" score; only the artifact and value vary.
        return EntityEvaluation(
            artifact_id=artifact_id,
            evaluator="test",
            scores=[ScoreEntry("quality", quality)],
            evaluated_at=timestamp,
        )

    older = EvaluationSnapshot(
        snapshot_id="before",
        created_at=timestamp,
        schema_name="baseline",
        artifact_count=1,
        artifact_evaluations=[evaluation_for("source/a.md", 3)],
        collection_metrics=[MetricValue("coverage_ratio", 0.5)],
    )
    # NOTE(review): artifact_count stays at 1 here although two evaluations
    # are listed -- confirm whether that is intentional.
    newer = EvaluationSnapshot(
        snapshot_id="after",
        created_at=timestamp,
        schema_name="baseline",
        artifact_count=1,
        artifact_evaluations=[
            evaluation_for("source/a.md", 4),
            evaluation_for("source/b.md", 5),
        ],
        collection_metrics=[MetricValue("coverage_ratio", 0.75)],
    )

    result = diff_snapshots(older, newer)

    # Artifact-level membership changes.
    assert result.added_artifacts == ["source/b.md"]
    assert result.removed_artifacts == []

    # The first reported score change is the 3 -> 4 move on source/a.md.
    score_change = result.score_changes[0]
    assert score_change.artifact_id == "source/a.md"
    assert score_change.delta == 1

    # 0.75 - 0.5 is exactly representable in binary floating point, so a
    # plain equality check is safe here.
    metric_change = result.metric_changes[0]
    assert metric_change.name == "coverage_ratio"
    assert metric_change.delta == 0.25
|
||||
Reference in New Issue
Block a user