feat(infospace): add structured evaluation output with history and diffing (S1.5)

Add data models (ScoreEntry, EntityEvaluation, EvaluationSnapshot,
SnapshotDiff) and I/O utilities for YAML frontmatter evaluation files,
snapshot persistence, history append, and snapshot diffing.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-19 01:35:22 +01:00
parent bad01e32bd
commit f8c9ab33f0
4 changed files with 852 additions and 0 deletions

View File

@@ -21,6 +21,24 @@ from .validator import (
validate_entities,
validate_entity,
)
from .evaluation import (
EntityEvaluation,
EvaluationSnapshot,
MetricChange,
MetricValue,
ScoreChange,
ScoreEntry,
SnapshotDiff,
)
from .evaluation_io import (
append_to_history,
diff_snapshots,
read_entity_evaluation,
read_history,
read_snapshot,
write_entity_evaluation,
write_snapshot,
)
__all__ = [
"EntityMeta",
@@ -38,4 +56,20 @@ __all__ = [
"ComplianceResult",
"validate_entities",
"validate_entity",
# Evaluation models
"EntityEvaluation",
"EvaluationSnapshot",
"MetricChange",
"MetricValue",
"ScoreChange",
"ScoreEntry",
"SnapshotDiff",
# Evaluation I/O
"append_to_history",
"diff_snapshots",
"read_entity_evaluation",
"read_history",
"read_snapshot",
"write_entity_evaluation",
"write_snapshot",
]

View File

@@ -0,0 +1,207 @@
"""
Data models for structured evaluation output.
Provides typed containers for per-entity LLM-evaluated scores and
collection-level metrics. All models support ``to_dict()``/``from_dict()``
round-tripping for YAML serialisation.
"""
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Dict, List, Optional
@dataclass
class ScoreEntry:
    """One scored dimension of an evaluation (e.g. definition_precision: 4.5/5.0).

    ``rationale`` is optional free text; it is left out of the serialised
    dict when it is empty.
    """

    name: str
    value: float
    max_value: float = 5.0
    rationale: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Serialise to a plain dict, skipping ``rationale`` when it is empty."""
        payload: Dict[str, Any] = dict(
            name=self.name,
            value=self.value,
            max_value=self.max_value,
        )
        if not self.rationale:
            return payload
        payload["rationale"] = self.rationale
        return payload

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ScoreEntry":
        """Build an entry from a dict; ``name`` and ``value`` are required keys.

        ``value`` and ``max_value`` are coerced with ``float()``; a missing
        ``max_value`` falls back to 5.0 and a missing ``rationale`` to "".
        """
        name = data["name"]
        value = float(data["value"])
        max_value = float(data.get("max_value", 5.0))
        rationale = data.get("rationale", "")
        return cls(name=name, value=value, max_value=max_value, rationale=rationale)
@dataclass
class EntityEvaluation:
    """Evaluation result for a single entity: its scores, evaluator and timestamp."""

    entity_slug: str
    evaluator: str
    scores: List[ScoreEntry]
    evaluated_at: datetime
    notes: List[str] = field(default_factory=list)

    @property
    def overall_score(self) -> float:
        """Arithmetic mean of the raw score values; 0.0 when there are no scores."""
        if self.scores:
            return sum(entry.value for entry in self.scores) / len(self.scores)
        return 0.0

    def to_dict(self) -> Dict[str, Any]:
        """Serialise to a plain dict.

        ``evaluated_at`` is written as an ISO-8601 string and the derived
        ``overall_score`` is included, rounded to four decimals.
        """
        result: Dict[str, Any] = {}
        result["entity_slug"] = self.entity_slug
        result["evaluator"] = self.evaluator
        result["evaluated_at"] = self.evaluated_at.isoformat()
        result["overall_score"] = round(self.overall_score, 4)
        result["scores"] = [entry.to_dict() for entry in self.scores]
        result["notes"] = self.notes
        return result

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "EntityEvaluation":
        """Rebuild an evaluation from a dict.

        Any ``overall_score`` key in ``data`` is not read; the property
        recomputes it from the scores. ``notes`` defaults to an empty list.
        """
        slug = data["entity_slug"]
        evaluator = data["evaluator"]
        parsed_scores = [ScoreEntry.from_dict(raw) for raw in data["scores"]]
        when = datetime.fromisoformat(data["evaluated_at"])
        return cls(
            entity_slug=slug,
            evaluator=evaluator,
            scores=parsed_scores,
            evaluated_at=when,
            notes=data.get("notes", []),
        )
@dataclass
class MetricValue:
    """One collection-level metric, with an optional concern tag and detail dict."""

    name: str
    value: float
    concern: str = ""
    details: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Serialise to a plain dict, leaving out empty ``concern``/``details``."""
        payload: Dict[str, Any] = dict(name=self.name, value=self.value)
        optional_fields = (("concern", self.concern), ("details", self.details))
        for key, content in optional_fields:
            if content:
                payload[key] = content
        return payload

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "MetricValue":
        """Build a metric from a dict; ``name`` and ``value`` are required keys.

        ``value`` is coerced with ``float()``; ``concern`` defaults to "" and
        ``details`` to an empty dict.
        """
        name = data["name"]
        value = float(data["value"])
        concern = data.get("concern", "")
        details = data.get("details", {})
        return cls(name=name, value=value, concern=concern, details=details)
@dataclass
class EvaluationSnapshot:
    """A timestamped bundle of entity evaluations plus collection metrics."""

    snapshot_id: str
    created_at: datetime
    schema_name: str
    entity_count: int
    entity_evaluations: List[EntityEvaluation] = field(default_factory=list)
    collection_metrics: List[MetricValue] = field(default_factory=list)
    metadata: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Serialise to a plain dict; ``created_at`` becomes an ISO-8601 string."""
        result: Dict[str, Any] = {}
        result["snapshot_id"] = self.snapshot_id
        result["created_at"] = self.created_at.isoformat()
        result["schema_name"] = self.schema_name
        result["entity_count"] = self.entity_count
        result["entity_evaluations"] = [ev.to_dict() for ev in self.entity_evaluations]
        result["collection_metrics"] = [metric.to_dict() for metric in self.collection_metrics]
        result["metadata"] = self.metadata
        return result

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "EvaluationSnapshot":
        """Rebuild a snapshot from a dict.

        The two list fields and ``metadata`` are optional and default to
        empty containers; the remaining keys are required.
        """
        snapshot_id = data["snapshot_id"]
        created_at = datetime.fromisoformat(data["created_at"])
        schema_name = data["schema_name"]
        entity_count = data["entity_count"]
        evaluations = [
            EntityEvaluation.from_dict(raw) for raw in data.get("entity_evaluations", [])
        ]
        metrics = [
            MetricValue.from_dict(raw) for raw in data.get("collection_metrics", [])
        ]
        return cls(
            snapshot_id=snapshot_id,
            created_at=created_at,
            schema_name=schema_name,
            entity_count=entity_count,
            entity_evaluations=evaluations,
            collection_metrics=metrics,
            metadata=data.get("metadata", {}),
        )
@dataclass
class ScoreChange:
    """Before/after values of one score dimension of one entity."""

    entity_slug: str
    dimension: str
    before: float
    after: float

    @property
    def delta(self) -> float:
        """Signed change, ``after - before`` (positive when the score rose)."""
        change = self.after - self.before
        return change
@dataclass
class MetricChange:
    """Before/after values of one collection metric."""

    name: str
    before: float
    after: float

    @property
    def delta(self) -> float:
        """Signed change, ``after - before`` (positive when the metric rose)."""
        change = self.after - self.before
        return change
@dataclass
class SnapshotDiff:
    """Differences between two evaluation snapshots, identified by their ids."""

    before_id: str
    after_id: str
    added_entities: List[str] = field(default_factory=list)
    removed_entities: List[str] = field(default_factory=list)
    score_changes: List[ScoreChange] = field(default_factory=list)
    metric_changes: List[MetricChange] = field(default_factory=list)

    def summary(self) -> str:
        """Render the diff as multi-line text.

        The first line names both snapshots. Each non-empty category then
        adds its own section; when nothing changed a single "No changes"
        line follows the header.
        """
        report = [f"Diff: {self.before_id} -> {self.after_id}"]

        if self.added_entities:
            report.append(f" Added entities: {', '.join(self.added_entities)}")
        if self.removed_entities:
            report.append(f" Removed entities: {', '.join(self.removed_entities)}")

        if self.score_changes:
            report.append(f" Score changes: {len(self.score_changes)}")
            report.extend(
                f" {sc.entity_slug}/{sc.dimension}: "
                f"{sc.before} -> {sc.after} ({sc.delta:+.2f})"
                for sc in self.score_changes
            )

        if self.metric_changes:
            report.append(f" Metric changes: {len(self.metric_changes)}")
            report.extend(
                f" {mc.name}: {mc.before} -> {mc.after} ({mc.delta:+.2f})"
                for mc in self.metric_changes
            )

        # Only the header is present when every category was empty.
        if len(report) == 1:
            report.append(" No changes")
        return "\n".join(report)

View File

@@ -0,0 +1,213 @@
"""
Read/write utilities for evaluation output files.
Per-entity evaluations are stored as markdown with YAML frontmatter.
Snapshots and history are stored as pure YAML files.
"""
from pathlib import Path
from typing import List
import yaml
from .evaluation import (
EntityEvaluation,
EvaluationSnapshot,
MetricChange,
MetricValue,
ScoreChange,
SnapshotDiff,
)
_FRONTMATTER_SEP = "---"
def write_entity_evaluation(evaluation: EntityEvaluation, path: Path) -> None:
    """Write a per-entity evaluation as YAML frontmatter + markdown body.

    The frontmatter holds the slug, evaluator, ISO timestamp, rounded overall
    score and the serialised scores (plus ``notes`` when there are any). The
    body has a title and one ``## <name> — <value> / <max_value>`` section per
    score, followed by that score's rationale when it has one.

    Args:
        evaluation: The evaluation to persist.
        path: Destination file; missing parent directories are created.
    """
    frontmatter = {
        "entity_slug": evaluation.entity_slug,
        "evaluator": evaluation.evaluator,
        "evaluated_at": evaluation.evaluated_at.isoformat(),
        "overall_score": round(evaluation.overall_score, 4),
        "scores": [s.to_dict() for s in evaluation.scores],
    }
    if evaluation.notes:
        frontmatter["notes"] = evaluation.notes

    lines: List[str] = []
    lines.append(_FRONTMATTER_SEP)
    lines.append(yaml.safe_dump(frontmatter, default_flow_style=False, sort_keys=False).rstrip())
    lines.append(_FRONTMATTER_SEP)
    lines.append("")

    # Title
    title = evaluation.entity_slug.replace("_", " ").replace("-", " ").title()
    lines.append(f"# Evaluation: {title}")
    lines.append("")

    # One section per score with rationale. The name and the score are
    # separated by " — " (em dash, U+2014) so the heading can be split back
    # into its dimension name when the file is read.
    for score in evaluation.scores:
        lines.append(f"## {score.name} \u2014 {score.value} / {score.max_value}")
        lines.append("")
        if score.rationale:
            lines.append(score.rationale)
            lines.append("")

    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("\n".join(lines), encoding="utf-8")
def read_entity_evaluation(path: Path) -> EntityEvaluation:
    """Read a per-entity evaluation from a YAML frontmatter markdown file.

    Scores come from the frontmatter; when the markdown body has a section
    for a score name, that section's text replaces the score's rationale.

    Args:
        path: File to read (decoded as UTF-8).

    Returns:
        The reconstructed ``EntityEvaluation``.

    Raises:
        ValueError: If the frontmatter delimiters are missing or the
            frontmatter does not parse to a mapping.
        KeyError: If a required frontmatter key is absent.
    """
    # Local imports: neither name is imported at module level in this file.
    from datetime import datetime

    from .evaluation import ScoreEntry

    text = path.read_text(encoding="utf-8")
    parts = text.split(f"{_FRONTMATTER_SEP}\n", maxsplit=2)
    # parts: ["", frontmatter_text, body]
    if len(parts) < 3:
        raise ValueError(f"Invalid frontmatter in {path}")
    fm_text = parts[1]
    body = parts[2]

    fm = yaml.safe_load(fm_text)
    if not isinstance(fm, dict):
        # Empty or scalar frontmatter would otherwise fail later with an
        # opaque TypeError on the first key lookup.
        raise ValueError(f"Invalid frontmatter in {path}")

    # Parse rationales from body
    rationales = _parse_rationales(body)

    scores = []
    for s_data in fm["scores"]:
        se = ScoreEntry.from_dict(s_data)
        if se.name in rationales:
            se.rationale = rationales[se.name]
        scores.append(se)

    # A hand-edited, unquoted timestamp is resolved by YAML into a
    # date/datetime object rather than a string; accept both forms.
    evaluated_at = fm["evaluated_at"]
    if not isinstance(evaluated_at, datetime):
        evaluated_at = datetime.fromisoformat(str(evaluated_at))

    return EntityEvaluation(
        entity_slug=fm["entity_slug"],
        evaluator=fm["evaluator"],
        scores=scores,
        evaluated_at=evaluated_at,
        # "notes:" with no value loads as None; treat it like a missing key.
        notes=fm.get("notes") or [],
    )
def _parse_rationales(body: str) -> dict:
"""Extract rationale text per dimension from the markdown body."""
rationales: dict = {}
current_name = None
current_lines: List[str] = []
for line in body.splitlines():
if line.startswith("## "):
# Save previous
if current_name is not None:
rationales[current_name] = "\n".join(current_lines).strip()
# Parse "## dimension_name — 4.5 / 5.0"
heading = line[3:].strip()
name = heading.split("")[0].strip() if "" in heading else heading
current_name = name
current_lines = []
elif current_name is not None:
current_lines.append(line)
if current_name is not None:
rationales[current_name] = "\n".join(current_lines).strip()
return rationales
def write_snapshot(snapshot: EvaluationSnapshot, path: Path) -> None:
    """Serialise ``snapshot`` to block-style YAML at ``path``.

    Missing parent directories are created first; keys keep the order
    produced by ``snapshot.to_dict()``.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    document = yaml.safe_dump(
        snapshot.to_dict(),
        default_flow_style=False,
        sort_keys=False,
    )
    path.write_text(document, encoding="utf-8")
def read_snapshot(path: Path) -> EvaluationSnapshot:
    """Load the YAML file at ``path`` (UTF-8) and rebuild the snapshot from it."""
    raw_text = path.read_text(encoding="utf-8")
    payload = yaml.safe_load(raw_text)
    return EvaluationSnapshot.from_dict(payload)
def append_to_history(snapshot: EvaluationSnapshot, history_path: Path) -> None:
    """Append a snapshot to a YAML list file (creates if missing).

    The whole file is read, the serialised snapshot is appended to the loaded
    list, and the file is rewritten. An existing but empty file is treated as
    an empty history.

    Args:
        snapshot: Snapshot to append.
        history_path: History file; missing parent directories are created.

    Raises:
        ValueError: If the existing file holds YAML that is not a list.
    """
    history_path.parent.mkdir(parents=True, exist_ok=True)

    existing: List[dict] = []
    if history_path.exists():
        loaded = yaml.safe_load(history_path.read_text(encoding="utf-8"))
        if loaded is not None:
            if not isinstance(loaded, list):
                # Fail with a clear message before touching the file,
                # instead of an AttributeError on .append() below.
                raise ValueError(
                    f"History file {history_path} does not contain a YAML list"
                )
            existing = loaded

    existing.append(snapshot.to_dict())
    history_path.write_text(
        yaml.safe_dump(existing, default_flow_style=False, sort_keys=False),
        encoding="utf-8",
    )
def read_history(history_path: Path) -> List[EvaluationSnapshot]:
    """Load every snapshot stored in a YAML history file, in file order.

    An empty file (YAML ``null``) yields an empty list.
    """
    entries = yaml.safe_load(history_path.read_text(encoding="utf-8"))
    snapshots: List[EvaluationSnapshot] = []
    if entries is not None:
        for entry in entries:
            snapshots.append(EvaluationSnapshot.from_dict(entry))
    return snapshots
def diff_snapshots(before: EvaluationSnapshot, after: EvaluationSnapshot) -> SnapshotDiff:
    """Compare two snapshots and report what differs.

    Entities are matched by slug. For slugs present on both sides, each score
    dimension whose value differs is reported; a dimension missing on one
    side is reported with 0.0 for that side. Collection metrics are compared
    by name in the same way. All result lists are sorted by slug/name.
    """
    # {slug: {dimension: value}} per side; a repeated slug keeps its last entry.
    old_scores: dict = {
        ev.entity_slug: {s.name: s.value for s in ev.scores}
        for ev in before.entity_evaluations
    }
    new_scores: dict = {
        ev.entity_slug: {s.name: s.value for s in ev.scores}
        for ev in after.entity_evaluations
    }
    old_slugs = set(old_scores)
    new_slugs = set(new_scores)

    added = sorted(new_slugs - old_slugs)
    removed = sorted(old_slugs - new_slugs)

    score_changes: List[ScoreChange] = []
    for slug in sorted(old_slugs & new_slugs):
        old_dims = old_scores[slug]
        new_dims = new_scores[slug]
        for dim in sorted(set(old_dims) | set(new_dims)):
            old_val = old_dims.get(dim)
            new_val = new_dims.get(dim)
            if old_val != new_val:
                change = ScoreChange(
                    entity_slug=slug,
                    dimension=dim,
                    before=0.0 if old_val is None else old_val,
                    after=0.0 if new_val is None else new_val,
                )
                score_changes.append(change)

    # Collection metrics, keyed by name
    old_metrics = {m.name: m.value for m in before.collection_metrics}
    new_metrics = {m.name: m.value for m in after.collection_metrics}

    metric_changes: List[MetricChange] = []
    for metric_name in sorted(set(old_metrics) | set(new_metrics)):
        old_val = old_metrics.get(metric_name)
        new_val = new_metrics.get(metric_name)
        if old_val != new_val:
            metric_changes.append(
                MetricChange(
                    name=metric_name,
                    before=0.0 if old_val is None else old_val,
                    after=0.0 if new_val is None else new_val,
                )
            )

    return SnapshotDiff(
        before_id=before.snapshot_id,
        after_id=after.snapshot_id,
        added_entities=added,
        removed_entities=removed,
        score_changes=score_changes,
        metric_changes=metric_changes,
    )

View File

@@ -0,0 +1,398 @@
"""Tests for markitect.infospace evaluation models and I/O."""
from datetime import datetime
from pathlib import Path
import pytest
from markitect.infospace import (
EntityEvaluation,
EvaluationSnapshot,
MetricChange,
MetricValue,
ScoreChange,
ScoreEntry,
SnapshotDiff,
append_to_history,
diff_snapshots,
read_entity_evaluation,
read_history,
read_snapshot,
write_entity_evaluation,
write_snapshot,
)
# ── Helpers ──────────────────────────────────────────────────────────
_NOW = datetime(2026, 2, 19, 12, 0, 0)
def _sample_scores() -> list:
    """Return three fresh score entries; the last one has no rationale."""
    precision = ScoreEntry("definition_precision", 4.5, rationale="Clear and specific.")
    grounding = ScoreEntry("source_grounding", 4.0, rationale="Well grounded.")
    relevance = ScoreEntry("domain_relevance", 4.5)
    return [precision, grounding, relevance]
def _sample_evaluation(**overrides) -> EntityEvaluation:
    """Build a sample evaluation; keyword arguments replace the default fields."""
    fields = {
        "entity_slug": "division-of-labour",
        "evaluator": "openrouter/anthropic/claude-3.5-sonnet",
        "scores": _sample_scores(),
        "evaluated_at": _NOW,
        "notes": ["Strong entity with clear provenance"],
    }
    fields.update(overrides)
    return EntityEvaluation(**fields)
def _sample_metric() -> MetricValue:
    """Return a sample collection metric with a concern tag and details."""
    return MetricValue(
        "coverage_ratio",
        0.85,
        concern="C2",
        details={"checked": 85},
    )
def _sample_snapshot(**overrides) -> EvaluationSnapshot:
    """Build a sample one-entity snapshot; keyword arguments replace the default fields."""
    fields = {
        "snapshot_id": "2026-02-19",
        "created_at": _NOW,
        "schema_name": "Economic Entity",
        "entity_count": 1,
        "entity_evaluations": [_sample_evaluation()],
        "collection_metrics": [_sample_metric()],
        "metadata": {"version": "1.0"},
    }
    fields.update(overrides)
    return EvaluationSnapshot(**fields)
# ── Model tests ──────────────────────────────────────────────────────
class TestScoreEntry:
    """Dict serialisation of ``ScoreEntry``."""

    def test_to_dict_from_dict_round_trip(self):
        original = ScoreEntry("precision", 4.5, 5.0, "Good definition.")
        copy = ScoreEntry.from_dict(original.to_dict())
        assert copy.name == original.name
        assert copy.value == original.value
        assert copy.max_value == original.max_value
        assert copy.rationale == original.rationale

    def test_to_dict_omits_empty_rationale(self):
        serialised = ScoreEntry("precision", 4.5).to_dict()
        assert "rationale" not in serialised

    def test_from_dict_defaults(self):
        entry = ScoreEntry.from_dict({"name": "x", "value": 3.0})
        assert entry.max_value == 5.0
        assert entry.rationale == ""
class TestEntityEvaluation:
    """Overall score and dict round-tripping of ``EntityEvaluation``."""

    def test_overall_score_is_mean(self):
        evaluation = _sample_evaluation()
        # Mean of the sample scores: (4.5 + 4.0 + 4.5) / 3, about 4.333
        assert abs(evaluation.overall_score - 4.333333) < 0.001

    def test_overall_score_zero_scores(self):
        evaluation = _sample_evaluation(scores=[])
        assert evaluation.overall_score == 0.0

    def test_to_dict_from_dict_round_trip(self):
        original = _sample_evaluation()
        copy = EntityEvaluation.from_dict(original.to_dict())
        assert copy.entity_slug == original.entity_slug
        assert copy.evaluator == original.evaluator
        assert len(copy.scores) == len(original.scores)
        assert copy.evaluated_at == original.evaluated_at
        assert copy.notes == original.notes

    def test_to_dict_includes_overall_score(self):
        serialised = _sample_evaluation().to_dict()
        assert "overall_score" in serialised
        assert abs(serialised["overall_score"] - 4.3333) < 0.01
class TestMetricValue:
    """Dict serialisation of ``MetricValue``."""

    def test_to_dict_from_dict_round_trip(self):
        original = _sample_metric()
        copy = MetricValue.from_dict(original.to_dict())
        assert copy.name == original.name
        assert copy.value == original.value
        assert copy.concern == original.concern
        assert copy.details == original.details

    def test_to_dict_omits_empty_concern(self):
        serialised = MetricValue("x", 1.0).to_dict()
        assert "concern" not in serialised
        assert "details" not in serialised
class TestEvaluationSnapshot:
    """Dict round-tripping and defaults of ``EvaluationSnapshot``."""

    def test_to_dict_from_dict_round_trip(self):
        original = _sample_snapshot()
        copy = EvaluationSnapshot.from_dict(original.to_dict())
        assert copy.snapshot_id == original.snapshot_id
        assert copy.created_at == original.created_at
        assert copy.schema_name == original.schema_name
        assert copy.entity_count == original.entity_count
        assert len(copy.entity_evaluations) == 1
        assert len(copy.collection_metrics) == 1
        assert copy.metadata == original.metadata

    def test_from_dict_empty_lists(self):
        # Only the required keys are supplied; optional ones must default.
        minimal = {
            "snapshot_id": "test",
            "created_at": _NOW.isoformat(),
            "schema_name": "Test",
            "entity_count": 0,
        }
        snapshot = EvaluationSnapshot.from_dict(minimal)
        assert snapshot.entity_evaluations == []
        assert snapshot.collection_metrics == []
        assert snapshot.metadata == {}
# ── Per-entity file I/O ──────────────────────────────────────────────
class TestEntityEvaluationIO:
    """Writing and reading per-entity frontmatter/markdown evaluation files."""

    def test_write_creates_file(self, tmp_path):
        target = tmp_path / "eval.md"
        write_entity_evaluation(_sample_evaluation(), target)
        assert target.exists()

    def test_file_has_yaml_frontmatter(self, tmp_path):
        target = tmp_path / "eval.md"
        write_entity_evaluation(_sample_evaluation(), target)
        content = target.read_text()
        assert content.startswith("---\n")
        assert "\n---\n" in content

    def test_frontmatter_contains_expected_keys(self, tmp_path):
        target = tmp_path / "eval.md"
        write_entity_evaluation(_sample_evaluation(), target)
        content = target.read_text()
        expected_keys = ["entity_slug", "evaluator", "evaluated_at", "overall_score", "scores"]
        for key in expected_keys:
            assert key in content

    def test_markdown_body_contains_rationales(self, tmp_path):
        target = tmp_path / "eval.md"
        write_entity_evaluation(_sample_evaluation(), target)
        content = target.read_text()
        assert "Clear and specific." in content
        assert "Well grounded." in content
        assert "## definition_precision" in content

    def test_read_back_matches_original(self, tmp_path):
        original = _sample_evaluation()
        target = tmp_path / "eval.md"
        write_entity_evaluation(original, target)
        loaded = read_entity_evaluation(target)
        assert loaded.entity_slug == original.entity_slug
        assert loaded.evaluator == original.evaluator
        assert loaded.evaluated_at == original.evaluated_at
        assert loaded.notes == original.notes
        assert len(loaded.scores) == len(original.scores)

    def test_round_trip_preserves_scores(self, tmp_path):
        original = _sample_evaluation()
        target = tmp_path / "eval.md"
        write_entity_evaluation(original, target)
        loaded = read_entity_evaluation(target)
        for expected, actual in zip(original.scores, loaded.scores):
            assert actual.name == expected.name
            assert actual.value == expected.value
            assert actual.max_value == expected.max_value

    def test_round_trip_preserves_rationales(self, tmp_path):
        target = tmp_path / "eval.md"
        write_entity_evaluation(_sample_evaluation(), target)
        loaded = read_entity_evaluation(target)
        first, second, third = loaded.scores[0], loaded.scores[1], loaded.scores[2]
        assert first.rationale == "Clear and specific."
        assert second.rationale == "Well grounded."
        # Third score has no rationale
        assert third.rationale == ""

    def test_write_creates_parent_dirs(self, tmp_path):
        target = tmp_path / "deep" / "nested" / "eval.md"
        write_entity_evaluation(_sample_evaluation(), target)
        assert target.exists()
# ── Snapshot I/O ─────────────────────────────────────────────────────
class TestSnapshotIO:
    """Writing and reading snapshot YAML files."""

    def test_write_creates_file(self, tmp_path):
        target = tmp_path / "snapshot.yaml"
        write_snapshot(_sample_snapshot(), target)
        assert target.exists()

    def test_read_back_matches_original(self, tmp_path):
        original = _sample_snapshot()
        target = tmp_path / "snapshot.yaml"
        write_snapshot(original, target)
        loaded = read_snapshot(target)
        assert loaded.snapshot_id == original.snapshot_id
        assert loaded.created_at == original.created_at
        assert loaded.schema_name == original.schema_name
        assert loaded.entity_count == original.entity_count

    def test_round_trip_preserves_entity_evaluations(self, tmp_path):
        target = tmp_path / "snapshot.yaml"
        write_snapshot(_sample_snapshot(), target)
        loaded = read_snapshot(target)
        assert len(loaded.entity_evaluations) == 1
        evaluation = loaded.entity_evaluations[0]
        assert evaluation.entity_slug == "division-of-labour"
        assert len(evaluation.scores) == 3

    def test_round_trip_preserves_collection_metrics(self, tmp_path):
        target = tmp_path / "snapshot.yaml"
        write_snapshot(_sample_snapshot(), target)
        loaded = read_snapshot(target)
        assert len(loaded.collection_metrics) == 1
        metric = loaded.collection_metrics[0]
        assert metric.name == "coverage_ratio"
        assert metric.value == 0.85
        assert metric.concern == "C2"
# ── History ──────────────────────────────────────────────────────────
class TestHistory:
    """Appending to and reading back the YAML history file."""

    def test_append_creates_new_file(self, tmp_path):
        history_file = tmp_path / "history.yaml"
        append_to_history(_sample_snapshot(), history_file)
        assert history_file.exists()
        assert len(read_history(history_file)) == 1

    def test_append_adds_to_existing(self, tmp_path):
        history_file = tmp_path / "history.yaml"
        first = _sample_snapshot(snapshot_id="snap-1")
        second = _sample_snapshot(snapshot_id="snap-2")
        append_to_history(first, history_file)
        append_to_history(second, history_file)
        entries = read_history(history_file)
        assert len(entries) == 2
        assert entries[0].snapshot_id == "snap-1"
        assert entries[1].snapshot_id == "snap-2"

    def test_multiple_appends_all_preserved(self, tmp_path):
        history_file = tmp_path / "history.yaml"
        expected_ids = [f"snap-{i}" for i in range(5)]
        for snapshot_id in expected_ids:
            append_to_history(_sample_snapshot(snapshot_id=snapshot_id), history_file)
        entries = read_history(history_file)
        assert len(entries) == 5
        assert [entry.snapshot_id for entry in entries] == expected_ids

    def test_read_history_returns_list_in_order(self, tmp_path):
        history_file = tmp_path / "history.yaml"
        append_to_history(_sample_snapshot(snapshot_id="a"), history_file)
        append_to_history(_sample_snapshot(snapshot_id="b"), history_file)
        entries = read_history(history_file)
        assert entries[0].snapshot_id == "a"
        assert entries[1].snapshot_id == "b"
# ── Diffing ──────────────────────────────────────────────────────────
class TestDiffSnapshots:
    """Snapshot diffing and the text summary of a diff."""

    def test_identical_snapshots_empty_diff(self):
        snapshot = _sample_snapshot()
        result = diff_snapshots(snapshot, snapshot)
        assert result.added_entities == []
        assert result.removed_entities == []
        assert result.score_changes == []
        assert result.metric_changes == []

    def test_added_entity(self):
        empty = _sample_snapshot(entity_evaluations=[])
        populated = _sample_snapshot()
        result = diff_snapshots(empty, populated)
        assert "division-of-labour" in result.added_entities
        assert result.removed_entities == []

    def test_removed_entity(self):
        populated = _sample_snapshot()
        empty = _sample_snapshot(entity_evaluations=[])
        result = diff_snapshots(populated, empty)
        assert "division-of-labour" in result.removed_entities
        assert result.added_entities == []

    def test_changed_score(self):
        old_eval = _sample_evaluation(scores=[ScoreEntry("precision", 4.0)])
        new_eval = _sample_evaluation(scores=[ScoreEntry("precision", 4.8)])
        result = diff_snapshots(
            _sample_snapshot(entity_evaluations=[old_eval]),
            _sample_snapshot(entity_evaluations=[new_eval]),
        )
        assert len(result.score_changes) == 1
        change = result.score_changes[0]
        assert change.entity_slug == "division-of-labour"
        assert change.dimension == "precision"
        assert change.before == 4.0
        assert change.after == 4.8

    def test_changed_metric(self):
        old_snapshot = _sample_snapshot(
            collection_metrics=[MetricValue("coverage_ratio", 0.80)]
        )
        new_snapshot = _sample_snapshot(
            collection_metrics=[MetricValue("coverage_ratio", 0.90)]
        )
        result = diff_snapshots(old_snapshot, new_snapshot)
        assert len(result.metric_changes) == 1
        change = result.metric_changes[0]
        assert change.name == "coverage_ratio"
        assert change.before == 0.80
        assert change.after == 0.90

    def test_summary_readable(self):
        old_eval = _sample_evaluation(scores=[ScoreEntry("precision", 4.0)])
        new_eval = _sample_evaluation(scores=[ScoreEntry("precision", 4.8)])
        old_snapshot = _sample_snapshot(
            snapshot_id="snap-1",
            entity_evaluations=[old_eval],
            collection_metrics=[MetricValue("coverage", 0.80)],
        )
        new_snapshot = _sample_snapshot(
            snapshot_id="snap-2",
            entity_evaluations=[new_eval],
            collection_metrics=[MetricValue("coverage", 0.90)],
        )
        report = diff_snapshots(old_snapshot, new_snapshot).summary()
        for fragment in ("snap-1", "snap-2", "precision", "coverage"):
            assert fragment in report

    def test_summary_no_changes(self):
        snapshot = _sample_snapshot()
        report = diff_snapshots(snapshot, snapshot).summary()
        assert "No changes" in report