Files
infospace-bench/tests/test_evaluation.py
2026-05-14 11:32:25 +02:00

79 lines
2.3 KiB
Python

from datetime import datetime, timezone
from infospace_bench.evaluation import (
EntityEvaluation,
EvaluationSnapshot,
MetricValue,
ScoreEntry,
diff_snapshots,
)
def test_entity_evaluation_round_trips_and_computes_overall_score() -> None:
    """Round-trip an ``EntityEvaluation`` through a dict and check its score.

    An evaluation holding scores of 4 and 5 is serialized with ``to_dict`` and
    rebuilt with ``from_dict``. The serialized payload must report an
    ``overall_score`` of 4.5, and the rebuilt object must compare equal to the
    evaluation it was produced from.
    """
    timestamp = datetime(2026, 5, 14, tzinfo=timezone.utc)
    score_entries = [
        ScoreEntry("definition_precision", 4),
        ScoreEntry("provenance_quality", 5),
    ]
    original = EntityEvaluation(
        artifact_id="source/chapter.md",
        evaluator="test",
        scores=score_entries,
        evaluated_at=timestamp,
        notes=["clear enough"],
    )

    # Serialize first, then rebuild, before any assertion runs.
    serialized = original.to_dict()
    restored = EntityEvaluation.from_dict(serialized)

    assert serialized["overall_score"] == 4.5
    assert restored == original
def test_snapshot_diff_reports_added_removed_score_and_metric_changes() -> None:
    """Diff two snapshots and check every category of reported change.

    The "before" snapshot evaluates ``source/a.md`` with a quality score of 3
    and a ``coverage_ratio`` metric of 0.5. The "after" snapshot raises that
    score to 4, adds ``source/b.md`` with a score of 5, and raises the metric
    to 0.75. The diff must list ``source/b.md`` as added, nothing as removed,
    a first score change of +1 for ``source/a.md``, and a first metric change
    of +0.25 for ``coverage_ratio``.
    """
    timestamp = datetime(2026, 5, 14, tzinfo=timezone.utc)

    def evaluation_for(artifact_id: str, quality: int) -> EntityEvaluation:
        # Single-score evaluation; only the artifact and score value vary.
        return EntityEvaluation(
            artifact_id=artifact_id,
            evaluator="test",
            scores=[ScoreEntry("quality", quality)],
            evaluated_at=timestamp,
        )

    def snapshot_of(
        snapshot_id: str,
        evaluations: list[EntityEvaluation],
        coverage: float,
    ) -> EvaluationSnapshot:
        # artifact_count stays at 1 for both snapshots, exactly as the fixture
        # was written; none of the assertions below read it.
        return EvaluationSnapshot(
            snapshot_id=snapshot_id,
            created_at=timestamp,
            schema_name="baseline",
            artifact_count=1,
            artifact_evaluations=evaluations,
            collection_metrics=[MetricValue("coverage_ratio", coverage)],
        )

    older = snapshot_of("before", [evaluation_for("source/a.md", 3)], 0.5)
    newer = snapshot_of(
        "after",
        [
            evaluation_for("source/a.md", 4),
            evaluation_for("source/b.md", 5),
        ],
        0.75,
    )

    changes = diff_snapshots(older, newer)

    assert changes.added_artifacts == ["source/b.md"]
    assert changes.removed_artifacts == []

    first_score_change = changes.score_changes[0]
    assert first_score_change.artifact_id == "source/a.md"
    assert first_score_change.delta == 1

    first_metric_change = changes.metric_changes[0]
    assert first_metric_change.name == "coverage_ratio"
    assert first_metric_change.delta == 0.25