feat(infospace): add structured evaluation output with history and diffing (S1.5)

Add data models (ScoreEntry, EntityEvaluation, MetricValue, EvaluationSnapshot, ScoreChange, MetricChange, SnapshotDiff) and I/O utilities for YAML frontmatter evaluation files, snapshot persistence, history append, and snapshot diffing.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-19 01:35:22 +01:00
parent bad01e32bd
commit f8c9ab33f0
4 changed files with 852 additions and 0 deletions
--- a/markitect/infospace/__init__.py
+++ b/markitect/infospace/__init__.py
@@ -21,6 +21,24 @@ from .validator import (
    validate_entities,
    validate_entity,
 )
+from .evaluation import (
+    EntityEvaluation,
+    EvaluationSnapshot,
+    MetricChange,
+    MetricValue,
+    ScoreChange,
+    ScoreEntry,
+    SnapshotDiff,
+)
+from .evaluation_io import (
+    append_to_history,
+    diff_snapshots,
+    read_entity_evaluation,
+    read_history,
+    read_snapshot,
+    write_entity_evaluation,
+    write_snapshot,
+)

 __all__ = [
    "EntityMeta",
@@ -38,4 +56,20 @@ __all__ = [
    "ComplianceResult",
    "validate_entities",
    "validate_entity",
+    # Evaluation models
+    "EntityEvaluation",
+    "EvaluationSnapshot",
+    "MetricChange",
+    "MetricValue",
+    "ScoreChange",
+    "ScoreEntry",
+    "SnapshotDiff",
+    # Evaluation I/O
+    "append_to_history",
+    "diff_snapshots",
+    "read_entity_evaluation",
+    "read_history",
+    "read_snapshot",
+    "write_entity_evaluation",
+    "write_snapshot",
 ]
--- a/markitect/infospace/evaluation.py
+++ b/markitect/infospace/evaluation.py
@@ -0,0 +1,207 @@
+"""
+Data models for structured evaluation output.
+
+Provides typed containers for per-entity LLM-evaluated scores and
+collection-level metrics.  All models support ``to_dict()``/``from_dict()``
+round-tripping for YAML serialisation.
+"""
+
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Any, Dict, List, Optional
+
+
+@dataclass
+class ScoreEntry:
+    """A single scored dimension (e.g. definition_precision: 4.5/5.0)."""
+
+    name: str
+    value: float
+    max_value: float = 5.0
+    rationale: str = ""
+
+    def to_dict(self) -> Dict[str, Any]:
+        d: Dict[str, Any] = {
+            "name": self.name,
+            "value": self.value,
+            "max_value": self.max_value,
+        }
+        if self.rationale:
+            d["rationale"] = self.rationale
+        return d
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "ScoreEntry":
+        return cls(
+            name=data["name"],
+            value=float(data["value"]),
+            max_value=float(data.get("max_value", 5.0)),
+            rationale=data.get("rationale", ""),
+        )
+
+
+@dataclass
+class EntityEvaluation:
+    """Per-entity evaluation result."""
+
+    entity_slug: str
+    evaluator: str
+    scores: List[ScoreEntry]
+    evaluated_at: datetime
+    notes: List[str] = field(default_factory=list)
+
+    @property
+    def overall_score(self) -> float:
+        if not self.scores:
+            return 0.0
+        return sum(s.value for s in self.scores) / len(self.scores)
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            "entity_slug": self.entity_slug,
+            "evaluator": self.evaluator,
+            "evaluated_at": self.evaluated_at.isoformat(),
+            "overall_score": round(self.overall_score, 4),
+            "scores": [s.to_dict() for s in self.scores],
+            "notes": self.notes,
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "EntityEvaluation":
+        return cls(
+            entity_slug=data["entity_slug"],
+            evaluator=data["evaluator"],
+            scores=[ScoreEntry.from_dict(s) for s in data["scores"]],
+            evaluated_at=datetime.fromisoformat(data["evaluated_at"]),
+            notes=data.get("notes", []),
+        )
+
+
+@dataclass
+class MetricValue:
+    """A single collection-level metric."""
+
+    name: str
+    value: float
+    concern: str = ""
+    details: Dict[str, Any] = field(default_factory=dict)
+
+    def to_dict(self) -> Dict[str, Any]:
+        d: Dict[str, Any] = {"name": self.name, "value": self.value}
+        if self.concern:
+            d["concern"] = self.concern
+        if self.details:
+            d["details"] = self.details
+        return d
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "MetricValue":
+        return cls(
+            name=data["name"],
+            value=float(data["value"]),
+            concern=data.get("concern", ""),
+            details=data.get("details", {}),
+        )
+
+
+@dataclass
+class EvaluationSnapshot:
+    """Timestamped snapshot of entity evaluations and collection metrics."""
+
+    snapshot_id: str
+    created_at: datetime
+    schema_name: str
+    entity_count: int
+    entity_evaluations: List[EntityEvaluation] = field(default_factory=list)
+    collection_metrics: List[MetricValue] = field(default_factory=list)
+    metadata: Dict[str, Any] = field(default_factory=dict)
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            "snapshot_id": self.snapshot_id,
+            "created_at": self.created_at.isoformat(),
+            "schema_name": self.schema_name,
+            "entity_count": self.entity_count,
+            "entity_evaluations": [e.to_dict() for e in self.entity_evaluations],
+            "collection_metrics": [m.to_dict() for m in self.collection_metrics],
+            "metadata": self.metadata,
+        }
+
+    @classmethod
+    def from_dict(cls, data: Dict[str, Any]) -> "EvaluationSnapshot":
+        return cls(
+            snapshot_id=data["snapshot_id"],
+            created_at=datetime.fromisoformat(data["created_at"]),
+            schema_name=data["schema_name"],
+            entity_count=data["entity_count"],
+            entity_evaluations=[
+                EntityEvaluation.from_dict(e) for e in data.get("entity_evaluations", [])
+            ],
+            collection_metrics=[
+                MetricValue.from_dict(m) for m in data.get("collection_metrics", [])
+            ],
+            metadata=data.get("metadata", {}),
+        )
+
+
+@dataclass
+class ScoreChange:
+    """Delta record for a single score dimension between snapshots."""
+
+    entity_slug: str
+    dimension: str
+    before: float
+    after: float
+
+    @property
+    def delta(self) -> float:
+        return self.after - self.before
+
+
+@dataclass
+class MetricChange:
+    """Delta record for a collection metric between snapshots."""
+
+    name: str
+    before: float
+    after: float
+
+    @property
+    def delta(self) -> float:
+        return self.after - self.before
+
+
+@dataclass
+class SnapshotDiff:
+    """Diff between two evaluation snapshots."""
+
+    before_id: str
+    after_id: str
+    added_entities: List[str] = field(default_factory=list)
+    removed_entities: List[str] = field(default_factory=list)
+    score_changes: List[ScoreChange] = field(default_factory=list)
+    metric_changes: List[MetricChange] = field(default_factory=list)
+
+    def summary(self) -> str:
+        lines = [f"Diff: {self.before_id} -> {self.after_id}"]
+        if self.added_entities:
+            lines.append(f"  Added entities: {', '.join(self.added_entities)}")
+        if self.removed_entities:
+            lines.append(f"  Removed entities: {', '.join(self.removed_entities)}")
+        if self.score_changes:
+            lines.append(f"  Score changes: {len(self.score_changes)}")
+            for sc in self.score_changes:
+                lines.append(
+                    f"    {sc.entity_slug}/{sc.dimension}: "
+                    f"{sc.before} -> {sc.after} ({sc.delta:+.2f})"
+                )
+        if self.metric_changes:
+            lines.append(f"  Metric changes: {len(self.metric_changes)}")
+            for mc in self.metric_changes:
+                lines.append(
+                    f"    {mc.name}: {mc.before} -> {mc.after} ({mc.delta:+.2f})"
+                )
+        if not any([self.added_entities, self.removed_entities,
+                     self.score_changes, self.metric_changes]):
+            lines.append("  No changes")
+        return "\n".join(lines)
--- a/markitect/infospace/evaluation_io.py
+++ b/markitect/infospace/evaluation_io.py
@@ -0,0 +1,213 @@
+"""
+Read/write utilities for evaluation output files.
+
+Per-entity evaluations are stored as markdown with YAML frontmatter.
+Snapshots and history are stored as pure YAML files.
+"""
+
+from pathlib import Path
+from typing import List
+
+import yaml
+
+from .evaluation import (
+    EntityEvaluation,
+    EvaluationSnapshot,
+    MetricChange,
+    MetricValue,
+    ScoreChange,
+    SnapshotDiff,
+)
+
+_FRONTMATTER_SEP = "---"
+
+
+def write_entity_evaluation(evaluation: EntityEvaluation, path: Path) -> None:
+    """Write a per-entity evaluation as YAML frontmatter + markdown body."""
+    frontmatter = {
+        "entity_slug": evaluation.entity_slug,
+        "evaluator": evaluation.evaluator,
+        "evaluated_at": evaluation.evaluated_at.isoformat(),
+        "overall_score": round(evaluation.overall_score, 4),
+        "scores": [s.to_dict() for s in evaluation.scores],
+    }
+    if evaluation.notes:
+        frontmatter["notes"] = evaluation.notes
+
+    lines: List[str] = []
+    lines.append(_FRONTMATTER_SEP)
+    lines.append(yaml.safe_dump(frontmatter, default_flow_style=False, sort_keys=False).rstrip())
+    lines.append(_FRONTMATTER_SEP)
+    lines.append("")
+
+    # Title
+    title = evaluation.entity_slug.replace("_", " ").replace("-", " ").title()
+    lines.append(f"# Evaluation: {title}")
+    lines.append("")
+
+    # One section per score with rationale
+    for score in evaluation.scores:
+        lines.append(f"## {score.name} — {score.value} / {score.max_value}")
+        lines.append("")
+        if score.rationale:
+            lines.append(score.rationale)
+            lines.append("")
+
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text("\n".join(lines), encoding="utf-8")
+
+
+def read_entity_evaluation(path: Path) -> EntityEvaluation:
+    """Read a per-entity evaluation from a YAML frontmatter markdown file."""
+    text = path.read_text(encoding="utf-8")
+    parts = text.split(f"{_FRONTMATTER_SEP}\n", maxsplit=2)
+    # parts: ["", frontmatter_text, body]
+    if len(parts) < 3:
+        raise ValueError(f"Invalid frontmatter in {path}")
+    fm_text = parts[1]
+    body = parts[2]
+
+    fm = yaml.safe_load(fm_text)
+
+    # Parse rationales from body
+    rationales = _parse_rationales(body)
+
+    from .evaluation import ScoreEntry
+
+    scores = []
+    for s_data in fm["scores"]:
+        se = ScoreEntry.from_dict(s_data)
+        if se.name in rationales:
+            se.rationale = rationales[se.name]
+        scores.append(se)
+
+    return EntityEvaluation(
+        entity_slug=fm["entity_slug"],
+        evaluator=fm["evaluator"],
+        scores=scores,
+        evaluated_at=__import__("datetime").datetime.fromisoformat(fm["evaluated_at"]),
+        notes=fm.get("notes", []),
+    )
+
+
+def _parse_rationales(body: str) -> dict:
+    """Extract rationale text per dimension from the markdown body."""
+    rationales: dict = {}
+    current_name = None
+    current_lines: List[str] = []
+
+    for line in body.splitlines():
+        if line.startswith("## "):
+            # Save previous
+            if current_name is not None:
+                rationales[current_name] = "\n".join(current_lines).strip()
+            # Parse "## dimension_name — 4.5 / 5.0"
+            heading = line[3:].strip()
+            name = heading.split("—")[0].strip() if "—" in heading else heading
+            current_name = name
+            current_lines = []
+        elif current_name is not None:
+            current_lines.append(line)
+
+    if current_name is not None:
+        rationales[current_name] = "\n".join(current_lines).strip()
+
+    return rationales
+
+
+def write_snapshot(snapshot: EvaluationSnapshot, path: Path) -> None:
+    """Write an evaluation snapshot as a YAML file."""
+    path.parent.mkdir(parents=True, exist_ok=True)
+    path.write_text(
+        yaml.safe_dump(snapshot.to_dict(), default_flow_style=False, sort_keys=False),
+        encoding="utf-8",
+    )
+
+
+def read_snapshot(path: Path) -> EvaluationSnapshot:
+    """Read an evaluation snapshot from a YAML file."""
+    data = yaml.safe_load(path.read_text(encoding="utf-8"))
+    return EvaluationSnapshot.from_dict(data)
+
+
+def append_to_history(snapshot: EvaluationSnapshot, history_path: Path) -> None:
+    """Append a snapshot to a YAML list file (creates if missing)."""
+    history_path.parent.mkdir(parents=True, exist_ok=True)
+    existing: List[dict] = []
+    if history_path.exists():
+        loaded = yaml.safe_load(history_path.read_text(encoding="utf-8"))
+        if loaded is not None:
+            existing = loaded
+
+    existing.append(snapshot.to_dict())
+    history_path.write_text(
+        yaml.safe_dump(existing, default_flow_style=False, sort_keys=False),
+        encoding="utf-8",
+    )
+
+
+def read_history(history_path: Path) -> List[EvaluationSnapshot]:
+    """Read all snapshots from a YAML history file."""
+    data = yaml.safe_load(history_path.read_text(encoding="utf-8"))
+    if data is None:
+        return []
+    return [EvaluationSnapshot.from_dict(d) for d in data]
+
+
+def diff_snapshots(before: EvaluationSnapshot, after: EvaluationSnapshot) -> SnapshotDiff:
+    """Compute the diff between two evaluation snapshots."""
+    before_slugs = {e.entity_slug for e in before.entity_evaluations}
+    after_slugs = {e.entity_slug for e in after.entity_evaluations}
+
+    added = sorted(after_slugs - before_slugs)
+    removed = sorted(before_slugs - after_slugs)
+
+    # Build score lookup: {slug: {dimension: value}}
+    before_scores: dict = {}
+    for ev in before.entity_evaluations:
+        before_scores[ev.entity_slug] = {s.name: s.value for s in ev.scores}
+
+    after_scores: dict = {}
+    for ev in after.entity_evaluations:
+        after_scores[ev.entity_slug] = {s.name: s.value for s in ev.scores}
+
+    score_changes: List[ScoreChange] = []
+    common_slugs = sorted(before_slugs & after_slugs)
+    for slug in common_slugs:
+        b_dims = before_scores[slug]
+        a_dims = after_scores[slug]
+        all_dims = sorted(set(b_dims) | set(a_dims))
+        for dim in all_dims:
+            bv = b_dims.get(dim)
+            av = a_dims.get(dim)
+            if bv != av:
+                score_changes.append(ScoreChange(
+                    entity_slug=slug,
+                    dimension=dim,
+                    before=bv if bv is not None else 0.0,
+                    after=av if av is not None else 0.0,
+                ))
+
+    # Metric changes
+    before_metrics = {m.name: m.value for m in before.collection_metrics}
+    after_metrics = {m.name: m.value for m in after.collection_metrics}
+    all_metric_names = sorted(set(before_metrics) | set(after_metrics))
+    metric_changes: List[MetricChange] = []
+    for name in all_metric_names:
+        bv = before_metrics.get(name)
+        av = after_metrics.get(name)
+        if bv != av:
+            metric_changes.append(MetricChange(
+                name=name,
+                before=bv if bv is not None else 0.0,
+                after=av if av is not None else 0.0,
+            ))
+
+    return SnapshotDiff(
+        before_id=before.snapshot_id,
+        after_id=after.snapshot_id,
+        added_entities=added,
+        removed_entities=removed,
+        score_changes=score_changes,
+        metric_changes=metric_changes,
+    )
--- a/tests/unit/infospace/test_evaluation.py
+++ b/tests/unit/infospace/test_evaluation.py
@@ -0,0 +1,398 @@
+"""Tests for markitect.infospace evaluation models and I/O."""
+
+from datetime import datetime
+from pathlib import Path
+
+import pytest
+
+from markitect.infospace import (
+    EntityEvaluation,
+    EvaluationSnapshot,
+    MetricChange,
+    MetricValue,
+    ScoreChange,
+    ScoreEntry,
+    SnapshotDiff,
+    append_to_history,
+    diff_snapshots,
+    read_entity_evaluation,
+    read_history,
+    read_snapshot,
+    write_entity_evaluation,
+    write_snapshot,
+)
+
+
+# ── Helpers ──────────────────────────────────────────────────────────
+
+_NOW = datetime(2026, 2, 19, 12, 0, 0)
+
+
+def _sample_scores() -> list:
+    return [
+        ScoreEntry("definition_precision", 4.5, rationale="Clear and specific."),
+        ScoreEntry("source_grounding", 4.0, rationale="Well grounded."),
+        ScoreEntry("domain_relevance", 4.5),
+    ]
+
+
+def _sample_evaluation(**overrides) -> EntityEvaluation:
+    defaults = dict(
+        entity_slug="division-of-labour",
+        evaluator="openrouter/anthropic/claude-3.5-sonnet",
+        scores=_sample_scores(),
+        evaluated_at=_NOW,
+        notes=["Strong entity with clear provenance"],
+    )
+    defaults.update(overrides)
+    return EntityEvaluation(**defaults)
+
+
+def _sample_metric() -> MetricValue:
+    return MetricValue("coverage_ratio", 0.85, concern="C2", details={"checked": 85})
+
+
+def _sample_snapshot(**overrides) -> EvaluationSnapshot:
+    defaults = dict(
+        snapshot_id="2026-02-19",
+        created_at=_NOW,
+        schema_name="Economic Entity",
+        entity_count=1,
+        entity_evaluations=[_sample_evaluation()],
+        collection_metrics=[_sample_metric()],
+        metadata={"version": "1.0"},
+    )
+    defaults.update(overrides)
+    return EvaluationSnapshot(**defaults)
+
+
+# ── Model tests ──────────────────────────────────────────────────────
+
+
+class TestScoreEntry:
+    def test_to_dict_from_dict_round_trip(self):
+        se = ScoreEntry("precision", 4.5, 5.0, "Good definition.")
+        d = se.to_dict()
+        restored = ScoreEntry.from_dict(d)
+        assert restored.name == se.name
+        assert restored.value == se.value
+        assert restored.max_value == se.max_value
+        assert restored.rationale == se.rationale
+
+    def test_to_dict_omits_empty_rationale(self):
+        se = ScoreEntry("precision", 4.5)
+        d = se.to_dict()
+        assert "rationale" not in d
+
+    def test_from_dict_defaults(self):
+        se = ScoreEntry.from_dict({"name": "x", "value": 3.0})
+        assert se.max_value == 5.0
+        assert se.rationale == ""
+
+
+class TestEntityEvaluation:
+    def test_overall_score_is_mean(self):
+        ev = _sample_evaluation()
+        # (4.5 + 4.0 + 4.5) / 3 ≈ 4.333
+        assert abs(ev.overall_score - 4.333333) < 0.001
+
+    def test_overall_score_zero_scores(self):
+        ev = _sample_evaluation(scores=[])
+        assert ev.overall_score == 0.0
+
+    def test_to_dict_from_dict_round_trip(self):
+        ev = _sample_evaluation()
+        d = ev.to_dict()
+        restored = EntityEvaluation.from_dict(d)
+        assert restored.entity_slug == ev.entity_slug
+        assert restored.evaluator == ev.evaluator
+        assert len(restored.scores) == len(ev.scores)
+        assert restored.evaluated_at == ev.evaluated_at
+        assert restored.notes == ev.notes
+
+    def test_to_dict_includes_overall_score(self):
+        ev = _sample_evaluation()
+        d = ev.to_dict()
+        assert "overall_score" in d
+        assert abs(d["overall_score"] - 4.3333) < 0.01
+
+
+class TestMetricValue:
+    def test_to_dict_from_dict_round_trip(self):
+        mv = _sample_metric()
+        d = mv.to_dict()
+        restored = MetricValue.from_dict(d)
+        assert restored.name == mv.name
+        assert restored.value == mv.value
+        assert restored.concern == mv.concern
+        assert restored.details == mv.details
+
+    def test_to_dict_omits_empty_concern(self):
+        mv = MetricValue("x", 1.0)
+        d = mv.to_dict()
+        assert "concern" not in d
+        assert "details" not in d
+
+
+class TestEvaluationSnapshot:
+    def test_to_dict_from_dict_round_trip(self):
+        snap = _sample_snapshot()
+        d = snap.to_dict()
+        restored = EvaluationSnapshot.from_dict(d)
+        assert restored.snapshot_id == snap.snapshot_id
+        assert restored.created_at == snap.created_at
+        assert restored.schema_name == snap.schema_name
+        assert restored.entity_count == snap.entity_count
+        assert len(restored.entity_evaluations) == 1
+        assert len(restored.collection_metrics) == 1
+        assert restored.metadata == snap.metadata
+
+    def test_from_dict_empty_lists(self):
+        d = {
+            "snapshot_id": "test",
+            "created_at": _NOW.isoformat(),
+            "schema_name": "Test",
+            "entity_count": 0,
+        }
+        snap = EvaluationSnapshot.from_dict(d)
+        assert snap.entity_evaluations == []
+        assert snap.collection_metrics == []
+        assert snap.metadata == {}
+
+
+# ── Per-entity file I/O ──────────────────────────────────────────────
+
+
+class TestEntityEvaluationIO:
+    def test_write_creates_file(self, tmp_path):
+        ev = _sample_evaluation()
+        p = tmp_path / "eval.md"
+        write_entity_evaluation(ev, p)
+        assert p.exists()
+
+    def test_file_has_yaml_frontmatter(self, tmp_path):
+        ev = _sample_evaluation()
+        p = tmp_path / "eval.md"
+        write_entity_evaluation(ev, p)
+        text = p.read_text()
+        assert text.startswith("---\n")
+        assert "\n---\n" in text
+
+    def test_frontmatter_contains_expected_keys(self, tmp_path):
+        ev = _sample_evaluation()
+        p = tmp_path / "eval.md"
+        write_entity_evaluation(ev, p)
+        text = p.read_text()
+        for key in ["entity_slug", "evaluator", "evaluated_at", "overall_score", "scores"]:
+            assert key in text
+
+    def test_markdown_body_contains_rationales(self, tmp_path):
+        ev = _sample_evaluation()
+        p = tmp_path / "eval.md"
+        write_entity_evaluation(ev, p)
+        text = p.read_text()
+        assert "Clear and specific." in text
+        assert "Well grounded." in text
+        assert "## definition_precision" in text
+
+    def test_read_back_matches_original(self, tmp_path):
+        ev = _sample_evaluation()
+        p = tmp_path / "eval.md"
+        write_entity_evaluation(ev, p)
+        restored = read_entity_evaluation(p)
+        assert restored.entity_slug == ev.entity_slug
+        assert restored.evaluator == ev.evaluator
+        assert restored.evaluated_at == ev.evaluated_at
+        assert restored.notes == ev.notes
+        assert len(restored.scores) == len(ev.scores)
+
+    def test_round_trip_preserves_scores(self, tmp_path):
+        ev = _sample_evaluation()
+        p = tmp_path / "eval.md"
+        write_entity_evaluation(ev, p)
+        restored = read_entity_evaluation(p)
+        for orig, rest in zip(ev.scores, restored.scores):
+            assert rest.name == orig.name
+            assert rest.value == orig.value
+            assert rest.max_value == orig.max_value
+
+    def test_round_trip_preserves_rationales(self, tmp_path):
+        ev = _sample_evaluation()
+        p = tmp_path / "eval.md"
+        write_entity_evaluation(ev, p)
+        restored = read_entity_evaluation(p)
+        assert restored.scores[0].rationale == "Clear and specific."
+        assert restored.scores[1].rationale == "Well grounded."
+        # Third score has no rationale
+        assert restored.scores[2].rationale == ""
+
+    def test_write_creates_parent_dirs(self, tmp_path):
+        ev = _sample_evaluation()
+        p = tmp_path / "deep" / "nested" / "eval.md"
+        write_entity_evaluation(ev, p)
+        assert p.exists()
+
+
+# ── Snapshot I/O ─────────────────────────────────────────────────────
+
+
+class TestSnapshotIO:
+    def test_write_creates_file(self, tmp_path):
+        snap = _sample_snapshot()
+        p = tmp_path / "snapshot.yaml"
+        write_snapshot(snap, p)
+        assert p.exists()
+
+    def test_read_back_matches_original(self, tmp_path):
+        snap = _sample_snapshot()
+        p = tmp_path / "snapshot.yaml"
+        write_snapshot(snap, p)
+        restored = read_snapshot(p)
+        assert restored.snapshot_id == snap.snapshot_id
+        assert restored.created_at == snap.created_at
+        assert restored.schema_name == snap.schema_name
+        assert restored.entity_count == snap.entity_count
+
+    def test_round_trip_preserves_entity_evaluations(self, tmp_path):
+        snap = _sample_snapshot()
+        p = tmp_path / "snapshot.yaml"
+        write_snapshot(snap, p)
+        restored = read_snapshot(p)
+        assert len(restored.entity_evaluations) == 1
+        ev = restored.entity_evaluations[0]
+        assert ev.entity_slug == "division-of-labour"
+        assert len(ev.scores) == 3
+
+    def test_round_trip_preserves_collection_metrics(self, tmp_path):
+        snap = _sample_snapshot()
+        p = tmp_path / "snapshot.yaml"
+        write_snapshot(snap, p)
+        restored = read_snapshot(p)
+        assert len(restored.collection_metrics) == 1
+        m = restored.collection_metrics[0]
+        assert m.name == "coverage_ratio"
+        assert m.value == 0.85
+        assert m.concern == "C2"
+
+
+# ── History ──────────────────────────────────────────────────────────
+
+
+class TestHistory:
+    def test_append_creates_new_file(self, tmp_path):
+        snap = _sample_snapshot()
+        hp = tmp_path / "history.yaml"
+        append_to_history(snap, hp)
+        assert hp.exists()
+        history = read_history(hp)
+        assert len(history) == 1
+
+    def test_append_adds_to_existing(self, tmp_path):
+        hp = tmp_path / "history.yaml"
+        snap1 = _sample_snapshot(snapshot_id="snap-1")
+        snap2 = _sample_snapshot(snapshot_id="snap-2")
+        append_to_history(snap1, hp)
+        append_to_history(snap2, hp)
+        history = read_history(hp)
+        assert len(history) == 2
+        assert history[0].snapshot_id == "snap-1"
+        assert history[1].snapshot_id == "snap-2"
+
+    def test_multiple_appends_all_preserved(self, tmp_path):
+        hp = tmp_path / "history.yaml"
+        for i in range(5):
+            snap = _sample_snapshot(snapshot_id=f"snap-{i}")
+            append_to_history(snap, hp)
+        history = read_history(hp)
+        assert len(history) == 5
+        assert [h.snapshot_id for h in history] == [f"snap-{i}" for i in range(5)]
+
+    def test_read_history_returns_list_in_order(self, tmp_path):
+        hp = tmp_path / "history.yaml"
+        snap_a = _sample_snapshot(snapshot_id="a")
+        snap_b = _sample_snapshot(snapshot_id="b")
+        append_to_history(snap_a, hp)
+        append_to_history(snap_b, hp)
+        history = read_history(hp)
+        assert history[0].snapshot_id == "a"
+        assert history[1].snapshot_id == "b"
+
+
+# ── Diffing ──────────────────────────────────────────────────────────
+
+
+class TestDiffSnapshots:
+    def test_identical_snapshots_empty_diff(self):
+        snap = _sample_snapshot()
+        diff = diff_snapshots(snap, snap)
+        assert diff.added_entities == []
+        assert diff.removed_entities == []
+        assert diff.score_changes == []
+        assert diff.metric_changes == []
+
+    def test_added_entity(self):
+        before = _sample_snapshot(entity_evaluations=[])
+        after = _sample_snapshot()
+        diff = diff_snapshots(before, after)
+        assert "division-of-labour" in diff.added_entities
+        assert diff.removed_entities == []
+
+    def test_removed_entity(self):
+        before = _sample_snapshot()
+        after = _sample_snapshot(entity_evaluations=[])
+        diff = diff_snapshots(before, after)
+        assert "division-of-labour" in diff.removed_entities
+        assert diff.added_entities == []
+
+    def test_changed_score(self):
+        ev_before = _sample_evaluation(scores=[ScoreEntry("precision", 4.0)])
+        ev_after = _sample_evaluation(scores=[ScoreEntry("precision", 4.8)])
+        before = _sample_snapshot(entity_evaluations=[ev_before])
+        after = _sample_snapshot(entity_evaluations=[ev_after])
+        diff = diff_snapshots(before, after)
+        assert len(diff.score_changes) == 1
+        sc = diff.score_changes[0]
+        assert sc.entity_slug == "division-of-labour"
+        assert sc.dimension == "precision"
+        assert sc.before == 4.0
+        assert sc.after == 4.8
+
+    def test_changed_metric(self):
+        before = _sample_snapshot(
+            collection_metrics=[MetricValue("coverage_ratio", 0.80)]
+        )
+        after = _sample_snapshot(
+            collection_metrics=[MetricValue("coverage_ratio", 0.90)]
+        )
+        diff = diff_snapshots(before, after)
+        assert len(diff.metric_changes) == 1
+        mc = diff.metric_changes[0]
+        assert mc.name == "coverage_ratio"
+        assert mc.before == 0.80
+        assert mc.after == 0.90
+
+    def test_summary_readable(self):
+        ev_before = _sample_evaluation(scores=[ScoreEntry("precision", 4.0)])
+        ev_after = _sample_evaluation(scores=[ScoreEntry("precision", 4.8)])
+        before = _sample_snapshot(
+            snapshot_id="snap-1",
+            entity_evaluations=[ev_before],
+            collection_metrics=[MetricValue("coverage", 0.80)],
+        )
+        after = _sample_snapshot(
+            snapshot_id="snap-2",
+            entity_evaluations=[ev_after],
+            collection_metrics=[MetricValue("coverage", 0.90)],
+        )
+        diff = diff_snapshots(before, after)
+        text = diff.summary()
+        assert "snap-1" in text
+        assert "snap-2" in text
+        assert "precision" in text
+        assert "coverage" in text
+
+    def test_summary_no_changes(self):
+        snap = _sample_snapshot()
+        diff = diff_snapshots(snap, snap)
+        text = diff.summary()
+        assert "No changes" in text