From f8c9ab33f02926ecabce0d02006855f9423d2b38 Mon Sep 17 00:00:00 2001
From: tegwick
Date: Thu, 19 Feb 2026 01:35:22 +0100
Subject: [PATCH] feat(infospace): add structured evaluation output with
 history and diffing (S1.5)

Add data models (ScoreEntry, EntityEvaluation, EvaluationSnapshot,
SnapshotDiff) and I/O utilities for YAML frontmatter evaluation files,
snapshot persistence, history append, and snapshot diffing.

Co-Authored-By: Claude Opus 4.6
---
 markitect/infospace/__init__.py         |  34 ++
 markitect/infospace/evaluation.py       | 225 +++++++++++++
 markitect/infospace/evaluation_io.py    | 279 +++++++++++++++
 tests/unit/infospace/test_evaluation.py | 430 ++++++++++++++++++++++++
 4 files changed, 968 insertions(+)
 create mode 100644 markitect/infospace/evaluation.py
 create mode 100644 markitect/infospace/evaluation_io.py
 create mode 100644 tests/unit/infospace/test_evaluation.py

diff --git a/markitect/infospace/__init__.py b/markitect/infospace/__init__.py
index 01c6a57a..addcfcd5 100644
--- a/markitect/infospace/__init__.py
+++ b/markitect/infospace/__init__.py
@@ -21,6 +21,24 @@ from .validator import (
     validate_entities,
     validate_entity,
 )
+from .evaluation import (
+    EntityEvaluation,
+    EvaluationSnapshot,
+    MetricChange,
+    MetricValue,
+    ScoreChange,
+    ScoreEntry,
+    SnapshotDiff,
+)
+from .evaluation_io import (
+    append_to_history,
+    diff_snapshots,
+    read_entity_evaluation,
+    read_history,
+    read_snapshot,
+    write_entity_evaluation,
+    write_snapshot,
+)
 
 __all__ = [
     "EntityMeta",
@@ -38,4 +56,20 @@ __all__ = [
     "ComplianceResult",
     "validate_entities",
     "validate_entity",
+    # Evaluation models
+    "EntityEvaluation",
+    "EvaluationSnapshot",
+    "MetricChange",
+    "MetricValue",
+    "ScoreChange",
+    "ScoreEntry",
+    "SnapshotDiff",
+    # Evaluation I/O
+    "append_to_history",
+    "diff_snapshots",
+    "read_entity_evaluation",
+    "read_history",
+    "read_snapshot",
+    "write_entity_evaluation",
+    "write_snapshot",
 ]
diff --git a/markitect/infospace/evaluation.py b/markitect/infospace/evaluation.py
new file mode 100644
index 00000000..303e4926
--- /dev/null
+++ b/markitect/infospace/evaluation.py
@@ -0,0 +1,225 @@
+"""
+Data models for structured evaluation output.
+
+Provides typed containers for per-entity LLM-evaluated scores and
+collection-level metrics. All models support ``to_dict()``/``from_dict()``
+round-tripping for YAML serialisation.
+"""
+
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Any, Dict, List, Optional
+
+
+def _parse_datetime(value: Any) -> datetime:
+    """Coerce a serialised timestamp into a ``datetime``.
+
+    ``to_dict()`` always emits ISO-8601 strings, but a YAML loader turns an
+    unquoted timestamp in a hand-edited file into a ``datetime`` object, so
+    both forms are accepted.
+    """
+    if isinstance(value, datetime):
+        return value
+    return datetime.fromisoformat(value)
+
+
+@dataclass
+class ScoreEntry:
+    """A single scored dimension (e.g. definition_precision: 4.5/5.0)."""
+
+    name: str
+    value: float
+    max_value: float = 5.0
+    rationale: str = ""
+
+    def to_dict(self) -> Dict[str, Any]:
+        d: Dict[str, Any] = {
+            "name": self.name,
+            "value": self.value,
+            "max_value": self.max_value,
+        }
+        if self.rationale:
+            d["rationale"] = self.rationale
+        return d
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "ScoreEntry":
+        return cls(
+            name=data["name"],
+            value=float(data["value"]),
+            max_value=float(data.get("max_value", 5.0)),
+            rationale=data.get("rationale", ""),
+        )
+
+
+@dataclass
+class EntityEvaluation:
+    """Per-entity evaluation result."""
+
+    entity_slug: str
+    evaluator: str
+    scores: List[ScoreEntry]
+    evaluated_at: datetime
+    notes: List[str] = field(default_factory=list)
+
+    @property
+    def overall_score(self) -> float:
+        """Unweighted mean of the raw score values (0.0 when unscored)."""
+        if not self.scores:
+            return 0.0
+        return sum(s.value for s in self.scores) / len(self.scores)
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            "entity_slug": self.entity_slug,
+            "evaluator": self.evaluator,
+            "evaluated_at": self.evaluated_at.isoformat(),
+            "overall_score": round(self.overall_score, 4),
+            "scores": [s.to_dict() for s in self.scores],
+            "notes": self.notes,
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "EntityEvaluation":
+        return cls(
+            entity_slug=data["entity_slug"],
+            evaluator=data["evaluator"],
+            scores=[ScoreEntry.from_dict(s) for s in data["scores"]],
+            evaluated_at=_parse_datetime(data["evaluated_at"]),
+            # A ``notes:`` key with no value loads as None; treat it as empty.
+            notes=list(data.get("notes") or []),
+        )
+
+
+@dataclass
+class MetricValue:
+    """A single collection-level metric."""
+
+    name: str
+    value: float
+    concern: str = ""
+    details: Dict[str, Any] = field(default_factory=dict)
+
+    def to_dict(self) -> Dict[str, Any]:
+        d: Dict[str, Any] = {"name": self.name, "value": self.value}
+        if self.concern:
+            d["concern"] = self.concern
+        if self.details:
+            d["details"] = self.details
+        return d
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "MetricValue":
+        return cls(
+            name=data["name"],
+            value=float(data["value"]),
+            concern=data.get("concern", ""),
+            details=data.get("details") or {},
+        )
+
+
+@dataclass
+class EvaluationSnapshot:
+    """Timestamped snapshot of entity evaluations and collection metrics."""
+
+    snapshot_id: str
+    created_at: datetime
+    schema_name: str
+    entity_count: int
+    entity_evaluations: List[EntityEvaluation] = field(default_factory=list)
+    collection_metrics: List[MetricValue] = field(default_factory=list)
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            "snapshot_id": self.snapshot_id,
+            "created_at": self.created_at.isoformat(),
+            "schema_name": self.schema_name,
+            "entity_count": self.entity_count,
+            "entity_evaluations": [e.to_dict() for e in self.entity_evaluations],
+            "collection_metrics": [m.to_dict() for m in self.collection_metrics],
+            "metadata": self.metadata,
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "EvaluationSnapshot":
+        # ``or`` (not a ``get`` default) so keys present with a null value
+        # fall back to empty containers as well.
+        return cls(
+            snapshot_id=data["snapshot_id"],
+            created_at=_parse_datetime(data["created_at"]),
+            schema_name=data["schema_name"],
+            entity_count=data["entity_count"],
+            entity_evaluations=[
+                EntityEvaluation.from_dict(e)
+                for e in data.get("entity_evaluations") or []
+            ],
+            collection_metrics=[
+                MetricValue.from_dict(m)
+                for m in data.get("collection_metrics") or []
+            ],
+            metadata=data.get("metadata") or {},
+        )
+
+
+@dataclass
+class ScoreChange:
+    """Delta record for a single score dimension between snapshots."""
+
+    entity_slug: str
+    dimension: str
+    before: float
+    after: float
+
+    @property
+    def delta(self) -> float:
+        return self.after - self.before
+
+
+@dataclass
+class MetricChange:
+    """Delta record for a collection metric between snapshots."""
+
+    name: str
+    before: float
+    after: float
+
+    @property
+    def delta(self) -> float:
+        return self.after - self.before
+
+
+@dataclass
+class SnapshotDiff:
+    """Diff between two evaluation snapshots."""
+
+    before_id: str
+    after_id: str
+    added_entities: List[str] = field(default_factory=list)
+    removed_entities: List[str] = field(default_factory=list)
+    score_changes: List[ScoreChange] = field(default_factory=list)
+    metric_changes: List[MetricChange] = field(default_factory=list)
+
+    def summary(self) -> str:
+        lines = [f"Diff: {self.before_id} -> {self.after_id}"]
+        if self.added_entities:
+            lines.append(f" Added entities: {', '.join(self.added_entities)}")
+        if self.removed_entities:
+            lines.append(f" Removed entities: {', '.join(self.removed_entities)}")
+        if self.score_changes:
+            lines.append(f" Score changes: {len(self.score_changes)}")
+            for sc in self.score_changes:
+                lines.append(
+                    f" {sc.entity_slug}/{sc.dimension}: "
+                    f"{sc.before} -> {sc.after} ({sc.delta:+.2f})"
+                )
+        if self.metric_changes:
+            lines.append(f" Metric changes: {len(self.metric_changes)}")
+            for mc in self.metric_changes:
+                lines.append(
+                    f" {mc.name}: {mc.before} -> {mc.after} ({mc.delta:+.2f})"
+                )
+        if not any([self.added_entities, self.removed_entities,
+                    self.score_changes, self.metric_changes]):
+            lines.append(" No changes")
+        return "\n".join(lines)
diff --git a/markitect/infospace/evaluation_io.py b/markitect/infospace/evaluation_io.py
new file mode 100644
index 00000000..5d834704
--- /dev/null
+++ b/markitect/infospace/evaluation_io.py
@@ -0,0 +1,279 @@
+"""
+Read/write utilities for evaluation output files.
+
+Per-entity evaluations are stored as markdown with YAML frontmatter.
+Snapshots and history are stored as pure YAML files.
+"""
+
+from pathlib import Path
+from typing import Dict, Iterable, List, Optional, Tuple
+
+import yaml
+
+from .evaluation import (
+    EntityEvaluation,
+    EvaluationSnapshot,
+    MetricChange,
+    MetricValue,
+    ScoreChange,
+    SnapshotDiff,
+)
+
+_FRONTMATTER_SEP = "---"
+
+
+def write_entity_evaluation(evaluation: EntityEvaluation, path: Path) -> None:
+    """Write a per-entity evaluation as YAML frontmatter + markdown body."""
+    frontmatter = {
+        "entity_slug": evaluation.entity_slug,
+        "evaluator": evaluation.evaluator,
+        "evaluated_at": evaluation.evaluated_at.isoformat(),
+        "overall_score": round(evaluation.overall_score, 4),
+        "scores": [s.to_dict() for s in evaluation.scores],
+    }
+    if evaluation.notes:
+        frontmatter["notes"] = evaluation.notes
+
+    lines: List[str] = []
+    lines.append(_FRONTMATTER_SEP)
+    lines.append(yaml.safe_dump(frontmatter, default_flow_style=False, sort_keys=False).rstrip())
+    lines.append(_FRONTMATTER_SEP)
+    lines.append("")
+
+    # Title
+    title = evaluation.entity_slug.replace("_", " ").replace("-", " ").title()
+    lines.append(f"# Evaluation: {title}")
+    lines.append("")
+
+    # One section per score with rationale
+    for score in evaluation.scores:
+        lines.append(f"## {score.name} — {score.value} / {score.max_value}")
+        lines.append("")
+        if score.rationale:
+            lines.append(score.rationale)
+            lines.append("")
+
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text("\n".join(lines), encoding="utf-8")
+
+
+def _split_frontmatter(text: str, path: Path) -> Tuple[str, str]:
+    """Split *text* into ``(frontmatter, body)`` at the ``---`` fence lines.
+
+    Only a line consisting of exactly ``---`` counts as a fence. Splitting on
+    the bare substring would also cut at an indented ``---`` inside a quoted
+    multi-line value (e.g. a rationale containing a horizontal rule) or at
+    a scalar that merely ends in ``---``.
+
+    Raises:
+        ValueError: If the opening or closing fence is missing.
+    """
+    lines = text.splitlines(keepends=True)
+    if not lines or lines[0].rstrip("\r\n") != _FRONTMATTER_SEP:
+        raise ValueError(f"Invalid frontmatter in {path}")
+    for idx, line in enumerate(lines[1:], start=1):
+        if line.rstrip("\r\n") == _FRONTMATTER_SEP:
+            return "".join(lines[1:idx]), "".join(lines[idx + 1:])
+    raise ValueError(f"Invalid frontmatter in {path}")
+
+
+def read_entity_evaluation(path: Path) -> EntityEvaluation:
+    """Read a per-entity evaluation from a YAML frontmatter markdown file.
+
+    A rationale found in the markdown body replaces the one stored in the
+    frontmatter for the same dimension.
+
+    Raises:
+        ValueError: If the file has no well-formed frontmatter mapping.
+    """
+    text = path.read_text(encoding="utf-8")
+    fm_text, body = _split_frontmatter(text, path)
+
+    fm = yaml.safe_load(fm_text)
+    if not isinstance(fm, dict):
+        raise ValueError(f"Invalid frontmatter in {path}")
+
+    # Restricting section headings to the known dimensions keeps a rationale
+    # that itself contains a "## ..." line in one piece.
+    names = [s_data["name"] for s_data in fm["scores"]]
+    rationales = _parse_rationales(body, names)
+
+    scores = []
+    for s_data in fm["scores"]:
+        merged = dict(s_data)
+        if merged["name"] in rationales:
+            merged["rationale"] = rationales[merged["name"]]
+        scores.append(merged)
+
+    # from_dict ignores the derived overall_score key and parses the rest.
+    return EntityEvaluation.from_dict({**fm, "scores": scores})
+
+
+def _heading_name(line: str) -> Optional[str]:
+    """Return the dimension named by a ``## name — 4.5 / 5.0`` heading.
+
+    Returns ``None`` when *line* is not a ``## `` heading. The name is cut at
+    the last em dash so a dimension name that contains one stays intact.
+    """
+    if not line.startswith("## "):
+        return None
+    heading = line[3:].strip()
+    name, sep, _ = heading.rpartition("—")
+    return name.strip() if sep else heading
+
+
+def _parse_rationales(
+    body: str, names: Optional[Iterable[str]] = None
+) -> Dict[str, str]:
+    """Extract rationale text per dimension from the markdown body.
+
+    Args:
+        body: Markdown following the frontmatter.
+        names: Known dimension names. When given, only ``## `` headings that
+            name one of them open a new section and any other ``## `` line
+            is kept as rationale text. When omitted, every ``## `` heading
+            opens a section.
+
+    Returns:
+        Mapping of dimension name to its stripped rationale text.
+    """
+    known = None if names is None else set(names)
+    rationales: Dict[str, str] = {}
+    current_name: Optional[str] = None
+    current_lines: List[str] = []
+
+    for line in body.splitlines():
+        name = _heading_name(line)
+        if name is not None and (known is None or name in known):
+            # Save previous
+            if current_name is not None:
+                rationales[current_name] = "\n".join(current_lines).strip()
+            current_name = name
+            current_lines = []
+        elif current_name is not None:
+            current_lines.append(line)
+
+    if current_name is not None:
+        rationales[current_name] = "\n".join(current_lines).strip()
+
+    return rationales
+
+
+def write_snapshot(snapshot: EvaluationSnapshot, path: Path) -> None:
+    """Write an evaluation snapshot as a YAML file."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(
+        yaml.safe_dump(snapshot.to_dict(), default_flow_style=False, sort_keys=False),
+        encoding="utf-8",
+    )
+
+
+def read_snapshot(path: Path) -> EvaluationSnapshot:
+    """Read an evaluation snapshot from a YAML file."""
+    data = yaml.safe_load(path.read_text(encoding="utf-8"))
+    return EvaluationSnapshot.from_dict(data)
+
+
+def _load_history(history_path: Path) -> List[dict]:
+    """Load the raw snapshot dicts from a history file (empty file -> ``[]``).
+
+    Raises:
+        ValueError: If the top-level YAML value is not a list.
+    """
+    loaded = yaml.safe_load(history_path.read_text(encoding="utf-8"))
+    if loaded is None:
+        return []
+    if not isinstance(loaded, list):
+        raise ValueError(
+            f"History file {history_path} must contain a YAML list, "
+            f"got {type(loaded).__name__}"
+        )
+    return loaded
+
+
+def append_to_history(snapshot: EvaluationSnapshot, history_path: Path) -> None:
+    """Append a snapshot to a YAML list file (creates if missing).
+
+    Raises:
+        ValueError: If an existing file does not contain a YAML list.
+    """
+    history_path.parent.mkdir(parents=True, exist_ok=True)
+    existing = _load_history(history_path) if history_path.exists() else []
+
+    existing.append(snapshot.to_dict())
+    history_path.write_text(
+        yaml.safe_dump(existing, default_flow_style=False, sort_keys=False),
+        encoding="utf-8",
+    )
+
+
+def read_history(history_path: Path) -> List[EvaluationSnapshot]:
+    """Read all snapshots from a YAML history file.
+
+    Raises:
+        ValueError: If the file does not contain a YAML list.
+    """
+    return [EvaluationSnapshot.from_dict(d) for d in _load_history(history_path)]
+
+
+def _changed_values(
+    before: Dict[str, float], after: Dict[str, float]
+) -> List[Tuple[str, float, float]]:
+    """Return ``(name, before, after)`` for each name whose value differs.
+
+    Names are visited in sorted order. A name present on only one side
+    counts as changed, with ``0.0`` reported for the missing side.
+    """
+    changes: List[Tuple[str, float, float]] = []
+    for name in sorted(set(before) | set(after)):
+        bv = before.get(name)
+        av = after.get(name)
+        if bv != av:
+            changes.append((
+                name,
+                bv if bv is not None else 0.0,
+                av if av is not None else 0.0,
+            ))
+    return changes
+
+
+def diff_snapshots(before: EvaluationSnapshot, after: EvaluationSnapshot) -> SnapshotDiff:
+    """Compute the diff between two evaluation snapshots.
+
+    Score changes are reported only for entities present in both snapshots;
+    an entity on one side only is listed as added or removed instead.
+    """
+    # Build score lookup: {slug: {dimension: value}}
+    before_scores = {
+        ev.entity_slug: {s.name: s.value for s in ev.scores}
+        for ev in before.entity_evaluations
+    }
+    after_scores = {
+        ev.entity_slug: {s.name: s.value for s in ev.scores}
+        for ev in after.entity_evaluations
+    }
+
+    score_changes: List[ScoreChange] = []
+    for slug in sorted(before_scores.keys() & after_scores.keys()):
+        for dim, bv, av in _changed_values(before_scores[slug], after_scores[slug]):
+            score_changes.append(
+                ScoreChange(entity_slug=slug, dimension=dim, before=bv, after=av)
+            )
+
+    # Metric changes
+    metric_changes = [
+        MetricChange(name=name, before=bv, after=av)
+        for name, bv, av in _changed_values(
+            {m.name: m.value for m in before.collection_metrics},
+            {m.name: m.value for m in after.collection_metrics},
+        )
+    ]
+
+    return SnapshotDiff(
+        before_id=before.snapshot_id,
+        after_id=after.snapshot_id,
+        added_entities=sorted(after_scores.keys() - before_scores.keys()),
+        removed_entities=sorted(before_scores.keys() - after_scores.keys()),
+        score_changes=score_changes,
+        metric_changes=metric_changes,
+    )
diff --git a/tests/unit/infospace/test_evaluation.py b/tests/unit/infospace/test_evaluation.py
new file mode 100644
index 00000000..27f4cd48
--- /dev/null
+++ b/tests/unit/infospace/test_evaluation.py
@@ -0,0 +1,430 @@
+"""Tests for markitect.infospace evaluation models and I/O."""
+
+from datetime import datetime
+from pathlib import Path
+
+import pytest
+
+from markitect.infospace import (
+    EntityEvaluation,
+    EvaluationSnapshot,
+    MetricChange,
+    MetricValue,
+    ScoreChange,
+    ScoreEntry,
+    SnapshotDiff,
+    append_to_history,
+    diff_snapshots,
+    read_entity_evaluation,
+    read_history,
+    read_snapshot,
+    write_entity_evaluation,
+    write_snapshot,
+)
+
+
+# ── Helpers ──────────────────────────────────────────────────────────
+
+_NOW = datetime(2026, 2, 19, 12, 0, 0)
+
+
+def _sample_scores() -> list:
+    return [
+        ScoreEntry("definition_precision", 4.5, rationale="Clear and specific."),
+        ScoreEntry("source_grounding", 4.0, rationale="Well grounded."),
+        ScoreEntry("domain_relevance", 4.5),
+    ]
+
+
+def _sample_evaluation(**overrides) -> EntityEvaluation:
+    defaults = dict(
+        entity_slug="division-of-labour",
+        evaluator="openrouter/anthropic/claude-3.5-sonnet",
+        scores=_sample_scores(),
+        evaluated_at=_NOW,
+        notes=["Strong entity with clear provenance"],
+    )
+    defaults.update(overrides)
+    return EntityEvaluation(**defaults)
+
+
+def _sample_metric() -> MetricValue:
+    return MetricValue("coverage_ratio", 0.85, concern="C2", details={"checked": 85})
+
+
+def _sample_snapshot(**overrides) -> EvaluationSnapshot:
+    defaults = dict(
+        snapshot_id="2026-02-19",
+        created_at=_NOW,
+        schema_name="Economic Entity",
+        entity_count=1,
+        entity_evaluations=[_sample_evaluation()],
+        collection_metrics=[_sample_metric()],
+        metadata={"version": "1.0"},
+    )
+    defaults.update(overrides)
+    return EvaluationSnapshot(**defaults)
+
+
+# ── Model tests ──────────────────────────────────────────────────────
+
+
+class TestScoreEntry:
+    def test_to_dict_from_dict_round_trip(self):
+        se = ScoreEntry("precision", 4.5, 5.0, "Good definition.")
+        d = se.to_dict()
+        restored = ScoreEntry.from_dict(d)
+        assert restored.name == se.name
+        assert restored.value == se.value
+        assert restored.max_value == se.max_value
+        assert restored.rationale == se.rationale
+
+    def test_to_dict_omits_empty_rationale(self):
+        se = ScoreEntry("precision", 4.5)
+        d = se.to_dict()
+        assert "rationale" not in d
+
+    def test_from_dict_defaults(self):
+        se = ScoreEntry.from_dict({"name": "x", "value": 3.0})
+        assert se.max_value == 5.0
+        assert se.rationale == ""
+
+
+class TestEntityEvaluation:
+    def test_overall_score_is_mean(self):
+        ev = _sample_evaluation()
+        # (4.5 + 4.0 + 4.5) / 3 ≈ 4.333
+        assert abs(ev.overall_score - 4.333333) < 0.001
+
+    def test_overall_score_zero_scores(self):
+        ev = _sample_evaluation(scores=[])
+        assert ev.overall_score == 0.0
+
+    def test_to_dict_from_dict_round_trip(self):
+        ev = _sample_evaluation()
+        d = ev.to_dict()
+        restored = EntityEvaluation.from_dict(d)
+        assert restored.entity_slug == ev.entity_slug
+        assert restored.evaluator == ev.evaluator
+        assert len(restored.scores) == len(ev.scores)
+        assert restored.evaluated_at == ev.evaluated_at
+        assert restored.notes == ev.notes
+
+    def test_to_dict_includes_overall_score(self):
+        ev = _sample_evaluation()
+        d = ev.to_dict()
+        assert "overall_score" in d
+        assert abs(d["overall_score"] - 4.3333) < 0.01
+
+    def test_from_dict_accepts_datetime_object(self):
+        d = _sample_evaluation().to_dict()
+        d["evaluated_at"] = _NOW
+        assert EntityEvaluation.from_dict(d).evaluated_at == _NOW
+
+    def test_from_dict_null_notes(self):
+        d = _sample_evaluation().to_dict()
+        d["notes"] = None
+        assert EntityEvaluation.from_dict(d).notes == []
+
+
+class TestMetricValue:
+    def test_to_dict_from_dict_round_trip(self):
+        mv = _sample_metric()
+        d = mv.to_dict()
+        restored = MetricValue.from_dict(d)
+        assert restored.name == mv.name
+        assert restored.value == mv.value
+        assert restored.concern == mv.concern
+        assert restored.details == mv.details
+
+    def test_to_dict_omits_empty_concern(self):
+        mv = MetricValue("x", 1.0)
+        d = mv.to_dict()
+        assert "concern" not in d
+        assert "details" not in d
+
+
+class TestEvaluationSnapshot:
+    def test_to_dict_from_dict_round_trip(self):
+        snap = _sample_snapshot()
+        d = snap.to_dict()
+        restored = EvaluationSnapshot.from_dict(d)
+        assert restored.snapshot_id == snap.snapshot_id
+        assert restored.created_at == snap.created_at
+        assert restored.schema_name == snap.schema_name
+        assert restored.entity_count == snap.entity_count
+        assert len(restored.entity_evaluations) == 1
+        assert len(restored.collection_metrics) == 1
+        assert restored.metadata == snap.metadata
+
+    def test_from_dict_empty_lists(self):
+        d = {
+            "snapshot_id": "test",
+            "created_at": _NOW.isoformat(),
+            "schema_name": "Test",
+            "entity_count": 0,
+        }
+        snap = EvaluationSnapshot.from_dict(d)
+        assert snap.entity_evaluations == []
+        assert snap.collection_metrics == []
+        assert snap.metadata == {}
+
+
+# ── Per-entity file I/O ──────────────────────────────────────────────
+
+
+class TestEntityEvaluationIO:
+    def test_write_creates_file(self, tmp_path):
+        ev = _sample_evaluation()
+        p = tmp_path / "eval.md"
+        write_entity_evaluation(ev, p)
+        assert p.exists()
+
+    def test_file_has_yaml_frontmatter(self, tmp_path):
+        ev = _sample_evaluation()
+        p = tmp_path / "eval.md"
+        write_entity_evaluation(ev, p)
+        text = p.read_text(encoding="utf-8")
+        assert text.startswith("---\n")
+        assert "\n---\n" in text
+
+    def test_frontmatter_contains_expected_keys(self, tmp_path):
+        ev = _sample_evaluation()
+        p = tmp_path / "eval.md"
+        write_entity_evaluation(ev, p)
+        text = p.read_text(encoding="utf-8")
+        for key in ["entity_slug", "evaluator", "evaluated_at", "overall_score", "scores"]:
+            assert key in text
+
+    def test_markdown_body_contains_rationales(self, tmp_path):
+        ev = _sample_evaluation()
+        p = tmp_path / "eval.md"
+        write_entity_evaluation(ev, p)
+        text = p.read_text(encoding="utf-8")
+        assert "Clear and specific." in text
+        assert "Well grounded." in text
+        assert "## definition_precision" in text
+
+    def test_read_back_matches_original(self, tmp_path):
+        ev = _sample_evaluation()
+        p = tmp_path / "eval.md"
+        write_entity_evaluation(ev, p)
+        restored = read_entity_evaluation(p)
+        assert restored.entity_slug == ev.entity_slug
+        assert restored.evaluator == ev.evaluator
+        assert restored.evaluated_at == ev.evaluated_at
+        assert restored.notes == ev.notes
+        assert len(restored.scores) == len(ev.scores)
+
+    def test_round_trip_preserves_scores(self, tmp_path):
+        ev = _sample_evaluation()
+        p = tmp_path / "eval.md"
+        write_entity_evaluation(ev, p)
+        restored = read_entity_evaluation(p)
+        for orig, rest in zip(ev.scores, restored.scores):
+            assert rest.name == orig.name
+            assert rest.value == orig.value
+            assert rest.max_value == orig.max_value
+
+    def test_round_trip_preserves_rationales(self, tmp_path):
+        ev = _sample_evaluation()
+        p = tmp_path / "eval.md"
+        write_entity_evaluation(ev, p)
+        restored = read_entity_evaluation(p)
+        assert restored.scores[0].rationale == "Clear and specific."
+        assert restored.scores[1].rationale == "Well grounded."
+        # Third score has no rationale
+        assert restored.scores[2].rationale == ""
+
+    def test_write_creates_parent_dirs(self, tmp_path):
+        ev = _sample_evaluation()
+        p = tmp_path / "deep" / "nested" / "eval.md"
+        write_entity_evaluation(ev, p)
+        assert p.exists()
+
+    def test_round_trip_rationale_with_markdown_structure(self, tmp_path):
+        rationale = "Mostly clear.\n\n## Caveats\n\n---\n\nOne loose term."
+        ev = _sample_evaluation(
+            scores=[ScoreEntry("precision", 4.0, rationale=rationale)]
+        )
+        p = tmp_path / "eval.md"
+        write_entity_evaluation(ev, p)
+        restored = read_entity_evaluation(p)
+        assert restored.scores[0].rationale == rationale
+
+    def test_read_rejects_missing_frontmatter(self, tmp_path):
+        p = tmp_path / "eval.md"
+        p.write_text("# Evaluation: X\n", encoding="utf-8")
+        with pytest.raises(ValueError):
+            read_entity_evaluation(p)
+
+
+# ── Snapshot I/O ─────────────────────────────────────────────────────
+
+
+class TestSnapshotIO:
+    def test_write_creates_file(self, tmp_path):
+        snap = _sample_snapshot()
+        p = tmp_path / "snapshot.yaml"
+        write_snapshot(snap, p)
+        assert p.exists()
+
+    def test_read_back_matches_original(self, tmp_path):
+        snap = _sample_snapshot()
+        p = tmp_path / "snapshot.yaml"
+        write_snapshot(snap, p)
+        restored = read_snapshot(p)
+        assert restored.snapshot_id == snap.snapshot_id
+        assert restored.created_at == snap.created_at
+        assert restored.schema_name == snap.schema_name
+        assert restored.entity_count == snap.entity_count
+
+    def test_round_trip_preserves_entity_evaluations(self, tmp_path):
+        snap = _sample_snapshot()
+        p = tmp_path / "snapshot.yaml"
+        write_snapshot(snap, p)
+        restored = read_snapshot(p)
+        assert len(restored.entity_evaluations) == 1
+        ev = restored.entity_evaluations[0]
+        assert ev.entity_slug == "division-of-labour"
+        assert len(ev.scores) == 3
+
+    def test_round_trip_preserves_collection_metrics(self, tmp_path):
+        snap = _sample_snapshot()
+        p = tmp_path / "snapshot.yaml"
+        write_snapshot(snap, p)
+        restored = read_snapshot(p)
+        assert len(restored.collection_metrics) == 1
+        m = restored.collection_metrics[0]
+        assert m.name == "coverage_ratio"
+        assert m.value == 0.85
+        assert m.concern == "C2"
+
+
+# ── History ──────────────────────────────────────────────────────────
+
+
+class TestHistory:
+    def test_append_creates_new_file(self, tmp_path):
+        snap = _sample_snapshot()
+        hp = tmp_path / "history.yaml"
+        append_to_history(snap, hp)
+        assert hp.exists()
+        history = read_history(hp)
+        assert len(history) == 1
+
+    def test_append_adds_to_existing(self, tmp_path):
+        hp = tmp_path / "history.yaml"
+        snap1 = _sample_snapshot(snapshot_id="snap-1")
+        snap2 = _sample_snapshot(snapshot_id="snap-2")
+        append_to_history(snap1, hp)
+        append_to_history(snap2, hp)
+        history = read_history(hp)
+        assert len(history) == 2
+        assert history[0].snapshot_id == "snap-1"
+        assert history[1].snapshot_id == "snap-2"
+
+    def test_multiple_appends_all_preserved(self, tmp_path):
+        hp = tmp_path / "history.yaml"
+        for i in range(5):
+            snap = _sample_snapshot(snapshot_id=f"snap-{i}")
+            append_to_history(snap, hp)
+        history = read_history(hp)
+        assert len(history) == 5
+        assert [h.snapshot_id for h in history] == [f"snap-{i}" for i in range(5)]
+
+    def test_read_history_returns_list_in_order(self, tmp_path):
+        hp = tmp_path / "history.yaml"
+        snap_a = _sample_snapshot(snapshot_id="a")
+        snap_b = _sample_snapshot(snapshot_id="b")
+        append_to_history(snap_a, hp)
+        append_to_history(snap_b, hp)
+        history = read_history(hp)
+        assert history[0].snapshot_id == "a"
+        assert history[1].snapshot_id == "b"
+
+    def test_append_rejects_non_list_file(self, tmp_path):
+        hp = tmp_path / "history.yaml"
+        write_snapshot(_sample_snapshot(), hp)
+        with pytest.raises(ValueError):
+            append_to_history(_sample_snapshot(), hp)
+
+
+# ── Diffing ──────────────────────────────────────────────────────────
+
+
+class TestDiffSnapshots:
+    def test_identical_snapshots_empty_diff(self):
+        snap = _sample_snapshot()
+        diff = diff_snapshots(snap, snap)
+        assert diff.added_entities == []
+        assert diff.removed_entities == []
+        assert diff.score_changes == []
+        assert diff.metric_changes == []
+
+    def test_added_entity(self):
+        before = _sample_snapshot(entity_evaluations=[])
+        after = _sample_snapshot()
+        diff = diff_snapshots(before, after)
+        assert "division-of-labour" in diff.added_entities
+        assert diff.removed_entities == []
+
+    def test_removed_entity(self):
+        before = _sample_snapshot()
+        after = _sample_snapshot(entity_evaluations=[])
+        diff = diff_snapshots(before, after)
+        assert "division-of-labour" in diff.removed_entities
+        assert diff.added_entities == []
+
+    def test_changed_score(self):
+        ev_before = _sample_evaluation(scores=[ScoreEntry("precision", 4.0)])
+        ev_after = _sample_evaluation(scores=[ScoreEntry("precision", 4.8)])
+        before = _sample_snapshot(entity_evaluations=[ev_before])
+        after = _sample_snapshot(entity_evaluations=[ev_after])
+        diff = diff_snapshots(before, after)
+        assert len(diff.score_changes) == 1
+        sc = diff.score_changes[0]
+        assert sc.entity_slug == "division-of-labour"
+        assert sc.dimension == "precision"
+        assert sc.before == 4.0
+        assert sc.after == 4.8
+
+    def test_changed_metric(self):
+        before = _sample_snapshot(
+            collection_metrics=[MetricValue("coverage_ratio", 0.80)]
+        )
+        after = _sample_snapshot(
+            collection_metrics=[MetricValue("coverage_ratio", 0.90)]
+        )
+        diff = diff_snapshots(before, after)
+        assert len(diff.metric_changes) == 1
+        mc = diff.metric_changes[0]
+        assert mc.name == "coverage_ratio"
+        assert mc.before == 0.80
+        assert mc.after == 0.90
+
+    def test_summary_readable(self):
+        ev_before = _sample_evaluation(scores=[ScoreEntry("precision", 4.0)])
+        ev_after = _sample_evaluation(scores=[ScoreEntry("precision", 4.8)])
+        before = _sample_snapshot(
+            snapshot_id="snap-1",
+            entity_evaluations=[ev_before],
+            collection_metrics=[MetricValue("coverage", 0.80)],
+        )
+        after = _sample_snapshot(
+            snapshot_id="snap-2",
+            entity_evaluations=[ev_after],
+            collection_metrics=[MetricValue("coverage", 0.90)],
+        )
+        diff = diff_snapshots(before, after)
+        text = diff.summary()
+        assert "snap-1" in text
+        assert "snap-2" in text
+        assert "precision" in text
+        assert "coverage" in text
+
+    def test_summary_no_changes(self):
+        snap = _sample_snapshot()
+        diff = diff_snapshots(snap, snap)
+        text = diff.summary()
+        assert "No changes" in text