feat(infospace): add structured evaluation output with history and diffing (S1.5)
Add data models (ScoreEntry, EntityEvaluation, MetricValue, EvaluationSnapshot, ScoreChange, MetricChange, SnapshotDiff) and I/O utilities for YAML-frontmatter evaluation files, snapshot persistence, history append, and snapshot diffing.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -21,6 +21,24 @@ from .validator import (
|
||||
validate_entities,
|
||||
validate_entity,
|
||||
)
|
||||
from .evaluation import (
|
||||
EntityEvaluation,
|
||||
EvaluationSnapshot,
|
||||
MetricChange,
|
||||
MetricValue,
|
||||
ScoreChange,
|
||||
ScoreEntry,
|
||||
SnapshotDiff,
|
||||
)
|
||||
from .evaluation_io import (
|
||||
append_to_history,
|
||||
diff_snapshots,
|
||||
read_entity_evaluation,
|
||||
read_history,
|
||||
read_snapshot,
|
||||
write_entity_evaluation,
|
||||
write_snapshot,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"EntityMeta",
|
||||
@@ -38,4 +56,20 @@ __all__ = [
|
||||
"ComplianceResult",
|
||||
"validate_entities",
|
||||
"validate_entity",
|
||||
# Evaluation models
|
||||
"EntityEvaluation",
|
||||
"EvaluationSnapshot",
|
||||
"MetricChange",
|
||||
"MetricValue",
|
||||
"ScoreChange",
|
||||
"ScoreEntry",
|
||||
"SnapshotDiff",
|
||||
# Evaluation I/O
|
||||
"append_to_history",
|
||||
"diff_snapshots",
|
||||
"read_entity_evaluation",
|
||||
"read_history",
|
||||
"read_snapshot",
|
||||
"write_entity_evaluation",
|
||||
"write_snapshot",
|
||||
]
|
||||
|
||||
207
markitect/infospace/evaluation.py
Normal file
207
markitect/infospace/evaluation.py
Normal file
@@ -0,0 +1,207 @@
|
||||
"""
|
||||
Data models for structured evaluation output.
|
||||
|
||||
Provides typed containers for per-entity LLM-evaluated scores and
|
||||
collection-level metrics. All models support ``to_dict()``/``from_dict()``
|
||||
round-tripping for YAML serialisation.
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
|
||||
@dataclass
class ScoreEntry:
    """One scored dimension of an evaluation (e.g. definition_precision: 4.5/5.0).

    ``rationale`` is optional free text and is left out of the serialised
    form when it is empty.
    """

    name: str
    value: float
    max_value: float = 5.0
    rationale: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain dict; ``rationale`` is included only when non-empty."""
        payload: Dict[str, Any] = dict(
            name=self.name,
            value=self.value,
            max_value=self.max_value,
        )
        if not self.rationale:
            return payload
        payload["rationale"] = self.rationale
        return payload

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ScoreEntry":
        """Build an entry from a mapping.

        ``name`` and ``value`` are required; ``max_value`` falls back to 5.0
        and ``rationale`` to the empty string. Numeric fields are coerced
        with ``float``.
        """
        raw_max = data.get("max_value", 5.0)
        text = data.get("rationale", "")
        return cls(data["name"], float(data["value"]), float(raw_max), text)
|
||||
|
||||
|
||||
@dataclass
class EntityEvaluation:
    """Evaluation result for a single entity.

    Holds the entity slug, the evaluator identifier, the per-dimension
    scores, the evaluation timestamp and optional free-text notes.
    """

    entity_slug: str
    evaluator: str
    scores: List[ScoreEntry]
    evaluated_at: datetime
    notes: List[str] = field(default_factory=list)

    @property
    def overall_score(self) -> float:
        """Arithmetic mean of the raw score values, or 0.0 when there are no scores."""
        if not self.scores:
            return 0.0
        return sum(s.value for s in self.scores) / len(self.scores)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain dict suitable for YAML serialisation.

        ``overall_score`` is included for readers of the file (rounded to
        four decimals); it is derived data and is not read back by
        ``from_dict``.
        """
        return {
            "entity_slug": self.entity_slug,
            "evaluator": self.evaluator,
            "evaluated_at": self.evaluated_at.isoformat(),
            "overall_score": round(self.overall_score, 4),
            "scores": [s.to_dict() for s in self.scores],
            "notes": self.notes,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "EntityEvaluation":
        """Build an evaluation from a mapping (typically loaded from YAML).

        ``entity_slug``, ``evaluator``, ``scores`` and ``evaluated_at`` are
        required; ``notes`` is optional.

        Raises:
            KeyError: if a required key is missing.
            ValueError: if ``evaluated_at`` is a string that is not ISO-8601.
        """
        raw_ts = data["evaluated_at"]
        # A YAML loader turns an unquoted ISO timestamp into a datetime
        # object; fromisoformat() only accepts strings, so pass those through.
        if isinstance(raw_ts, datetime):
            evaluated_at = raw_ts
        else:
            evaluated_at = datetime.fromisoformat(raw_ts)
        return cls(
            entity_slug=data["entity_slug"],
            evaluator=data["evaluator"],
            scores=[ScoreEntry.from_dict(s) for s in data["scores"]],
            evaluated_at=evaluated_at,
            # Copy so the instance does not alias the caller's list, and
            # treat an explicit null the same as a missing key.
            notes=list(data.get("notes") or []),
        )
|
||||
|
||||
|
||||
@dataclass
class MetricValue:
    """A named collection-level metric with an optional concern tag and details."""

    name: str
    value: float
    concern: str = ""
    details: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain dict; empty ``concern`` / ``details`` are omitted."""
        payload: Dict[str, Any] = {"name": self.name, "value": self.value}
        optional = (("concern", self.concern), ("details", self.details))
        payload.update((key, val) for key, val in optional if val)
        return payload

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "MetricValue":
        """Build a metric from a mapping.

        ``name`` and ``value`` are required (``value`` is coerced with
        ``float``); ``concern`` defaults to "" and ``details`` to ``{}``.
        """
        tag = data.get("concern", "")
        extra = data.get("details", {})
        return cls(data["name"], float(data["value"]), tag, extra)
|
||||
|
||||
|
||||
@dataclass
class EvaluationSnapshot:
    """Timestamped snapshot of entity evaluations and collection metrics.

    ``entity_count`` is stored as given; it is not derived from or checked
    against ``entity_evaluations``.
    """

    snapshot_id: str
    created_at: datetime
    schema_name: str
    entity_count: int
    entity_evaluations: List[EntityEvaluation] = field(default_factory=list)
    collection_metrics: List[MetricValue] = field(default_factory=list)
    metadata: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain dict suitable for YAML serialisation."""
        return {
            "snapshot_id": self.snapshot_id,
            "created_at": self.created_at.isoformat(),
            "schema_name": self.schema_name,
            "entity_count": self.entity_count,
            "entity_evaluations": [e.to_dict() for e in self.entity_evaluations],
            "collection_metrics": [m.to_dict() for m in self.collection_metrics],
            "metadata": self.metadata,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "EvaluationSnapshot":
        """Build a snapshot from a mapping (typically loaded from YAML).

        ``snapshot_id``, ``created_at``, ``schema_name`` and ``entity_count``
        are required. The evaluation list, metric list and metadata are
        optional; a missing key and an explicit null are both treated as
        empty.

        Raises:
            KeyError: if a required key is missing.
            ValueError: if ``created_at`` is a string that is not ISO-8601.
        """
        raw_created = data["created_at"]
        # A YAML loader turns an unquoted ISO timestamp into a datetime
        # object; fromisoformat() only accepts strings, so pass those through.
        if isinstance(raw_created, datetime):
            created_at = raw_created
        else:
            created_at = datetime.fromisoformat(raw_created)
        return cls(
            snapshot_id=data["snapshot_id"],
            created_at=created_at,
            schema_name=data["schema_name"],
            entity_count=data["entity_count"],
            entity_evaluations=[
                EntityEvaluation.from_dict(e)
                for e in data.get("entity_evaluations") or []
            ],
            collection_metrics=[
                MetricValue.from_dict(m)
                for m in data.get("collection_metrics") or []
            ],
            metadata=data.get("metadata") or {},
        )
|
||||
|
||||
|
||||
@dataclass
class ScoreChange:
    """Before/after values of one score dimension of one entity across two snapshots."""

    entity_slug: str
    dimension: str
    before: float
    after: float

    @property
    def delta(self) -> float:
        """Signed change: ``after`` minus ``before``."""
        old, new = self.before, self.after
        return new - old
|
||||
|
||||
|
||||
@dataclass
class MetricChange:
    """Before/after values of one collection metric across two snapshots."""

    name: str
    before: float
    after: float

    @property
    def delta(self) -> float:
        """Signed change: ``after`` minus ``before``."""
        old, new = self.before, self.after
        return new - old
|
||||
|
||||
|
||||
@dataclass
class SnapshotDiff:
    """Differences between two evaluation snapshots, identified by their ids."""

    before_id: str
    after_id: str
    added_entities: List[str] = field(default_factory=list)
    removed_entities: List[str] = field(default_factory=list)
    score_changes: List[ScoreChange] = field(default_factory=list)
    metric_changes: List[MetricChange] = field(default_factory=list)

    def summary(self) -> str:
        """Render the diff as a multi-line, human-readable report.

        The first line names both snapshot ids. Each non-empty category adds
        its own section; when every category is empty a single "No changes"
        line follows the header.
        """
        report = [f"Diff: {self.before_id} -> {self.after_id}"]

        if self.added_entities:
            report.append(f" Added entities: {', '.join(self.added_entities)}")
        if self.removed_entities:
            report.append(f" Removed entities: {', '.join(self.removed_entities)}")

        if self.score_changes:
            report.append(f" Score changes: {len(self.score_changes)}")
            report.extend(
                f" {sc.entity_slug}/{sc.dimension}: {sc.before} -> {sc.after} ({sc.delta:+.2f})"
                for sc in self.score_changes
            )

        if self.metric_changes:
            report.append(f" Metric changes: {len(self.metric_changes)}")
            report.extend(
                f" {mc.name}: {mc.before} -> {mc.after} ({mc.delta:+.2f})"
                for mc in self.metric_changes
            )

        # Every non-empty category appends at least one line, so a report
        # that still holds only the header means nothing changed.
        if len(report) == 1:
            report.append(" No changes")
        return "\n".join(report)
|
||||
213
markitect/infospace/evaluation_io.py
Normal file
213
markitect/infospace/evaluation_io.py
Normal file
@@ -0,0 +1,213 @@
|
||||
"""
|
||||
Read/write utilities for evaluation output files.
|
||||
|
||||
Per-entity evaluations are stored as markdown with YAML frontmatter.
|
||||
Snapshots and history are stored as pure YAML files.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
import yaml
|
||||
|
||||
from .evaluation import (
|
||||
EntityEvaluation,
|
||||
EvaluationSnapshot,
|
||||
MetricChange,
|
||||
MetricValue,
|
||||
ScoreChange,
|
||||
SnapshotDiff,
|
||||
)
|
||||
|
||||
_FRONTMATTER_SEP = "---"
|
||||
|
||||
|
||||
def write_entity_evaluation(evaluation: EntityEvaluation, path: Path) -> None:
    """Write one entity evaluation as a markdown file with YAML frontmatter.

    The frontmatter carries the slug, evaluator, ISO timestamp, the overall
    score rounded to four decimals, the serialised scores and, when present,
    the notes. The body holds a title derived from the slug followed by one
    ``##`` section per score, with the rationale as the section text when
    there is one. Missing parent directories are created.
    """
    meta = {
        "entity_slug": evaluation.entity_slug,
        "evaluator": evaluation.evaluator,
        "evaluated_at": evaluation.evaluated_at.isoformat(),
        "overall_score": round(evaluation.overall_score, 4),
        "scores": [entry.to_dict() for entry in evaluation.scores],
    }
    if evaluation.notes:
        meta["notes"] = evaluation.notes

    yaml_text = yaml.safe_dump(meta, default_flow_style=False, sort_keys=False).rstrip()
    heading = evaluation.entity_slug.replace("_", " ").replace("-", " ").title()

    chunks: List[str] = [
        _FRONTMATTER_SEP,
        yaml_text,
        _FRONTMATTER_SEP,
        "",
        f"# Evaluation: {heading}",
        "",
    ]

    # One section per score; the rationale paragraph is optional.
    for entry in evaluation.scores:
        chunks += [f"## {entry.name} — {entry.value} / {entry.max_value}", ""]
        if entry.rationale:
            chunks += [entry.rationale, ""]

    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("\n".join(chunks), encoding="utf-8")
|
||||
|
||||
|
||||
def read_entity_evaluation(path: Path) -> EntityEvaluation:
    """Read a per-entity evaluation from a YAML-frontmatter markdown file.

    The file must open with a ``---`` line, followed by YAML frontmatter, a
    closing ``---`` line and the markdown body. Scores come from the
    frontmatter; for any score whose name has a ``##`` section in the body,
    the section text replaces the frontmatter rationale.

    Raises:
        ValueError: if the frontmatter delimiters are missing, if there is
            content before the opening delimiter, or if the frontmatter is
            not a YAML mapping.
        KeyError: if a required frontmatter key is missing.
    """
    # Function-scope imports: the module-level import block shown for this
    # file brings in neither name.
    from datetime import datetime

    from .evaluation import ScoreEntry

    text = path.read_text(encoding="utf-8")
    parts = text.split(f"{_FRONTMATTER_SEP}\n", maxsplit=2)
    # Expected shape: ["", frontmatter_text, body]
    if len(parts) < 3:
        raise ValueError(f"Invalid frontmatter in {path}")
    # Only whitespace (or a BOM) may precede the opening separator;
    # anything else means the first "---" was not a frontmatter fence.
    if parts[0].lstrip("\ufeff").strip():
        raise ValueError(f"Invalid frontmatter in {path}")
    fm_text = parts[1]
    body = parts[2]

    fm = yaml.safe_load(fm_text)
    if not isinstance(fm, dict):
        # Empty frontmatter loads as None; a bare scalar or list is equally unusable.
        raise ValueError(f"Invalid frontmatter in {path}")

    # Rationale text per dimension, taken from the markdown body.
    rationales = _parse_rationales(body)

    scores = []
    for s_data in fm["scores"]:
        se = ScoreEntry.from_dict(s_data)
        if se.name in rationales:
            se.rationale = rationales[se.name]
        scores.append(se)

    raw_ts = fm["evaluated_at"]
    # safe_load turns an unquoted ISO timestamp into a datetime object;
    # fromisoformat() only accepts strings, so pass those through.
    if isinstance(raw_ts, datetime):
        evaluated_at = raw_ts
    else:
        evaluated_at = datetime.fromisoformat(raw_ts)

    return EntityEvaluation(
        entity_slug=fm["entity_slug"],
        evaluator=fm["evaluator"],
        scores=scores,
        evaluated_at=evaluated_at,
        notes=fm.get("notes") or [],
    )
|
||||
|
||||
|
||||
def _parse_rationales(body: str) -> dict:
|
||||
"""Extract rationale text per dimension from the markdown body."""
|
||||
rationales: dict = {}
|
||||
current_name = None
|
||||
current_lines: List[str] = []
|
||||
|
||||
for line in body.splitlines():
|
||||
if line.startswith("## "):
|
||||
# Save previous
|
||||
if current_name is not None:
|
||||
rationales[current_name] = "\n".join(current_lines).strip()
|
||||
# Parse "## dimension_name — 4.5 / 5.0"
|
||||
heading = line[3:].strip()
|
||||
name = heading.split("—")[0].strip() if "—" in heading else heading
|
||||
current_name = name
|
||||
current_lines = []
|
||||
elif current_name is not None:
|
||||
current_lines.append(line)
|
||||
|
||||
if current_name is not None:
|
||||
rationales[current_name] = "\n".join(current_lines).strip()
|
||||
|
||||
return rationales
|
||||
|
||||
|
||||
def write_snapshot(snapshot: EvaluationSnapshot, path: Path) -> None:
    """Serialise a snapshot to a UTF-8 YAML file, creating parent directories as needed."""
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    serialised = yaml.safe_dump(
        snapshot.to_dict(),
        default_flow_style=False,
        sort_keys=False,
    )
    path.write_text(serialised, encoding="utf-8")
|
||||
|
||||
|
||||
def read_snapshot(path: Path) -> EvaluationSnapshot:
    """Load a UTF-8 YAML file and rebuild the evaluation snapshot it describes."""
    raw_text = path.read_text(encoding="utf-8")
    return EvaluationSnapshot.from_dict(yaml.safe_load(raw_text))
|
||||
|
||||
|
||||
def append_to_history(snapshot: EvaluationSnapshot, history_path: Path) -> None:
    """Append a snapshot to a YAML history file, creating the file if missing.

    The history file holds a YAML list of serialised snapshots. The whole
    file is read, extended and rewritten. Missing parent directories are
    created; a missing or empty file is treated as an empty history.

    Raises:
        ValueError: if the existing file's top-level YAML value is not a list.
    """
    history_path.parent.mkdir(parents=True, exist_ok=True)
    existing: List[dict] = []
    if history_path.exists():
        loaded = yaml.safe_load(history_path.read_text(encoding="utf-8"))
        if loaded is not None:
            # Refuse to touch a file that is not a history list rather than
            # failing later with an opaque AttributeError on .append().
            if not isinstance(loaded, list):
                raise ValueError(
                    f"History file {history_path} does not contain a YAML list"
                )
            existing = loaded

    existing.append(snapshot.to_dict())
    history_path.write_text(
        yaml.safe_dump(existing, default_flow_style=False, sort_keys=False),
        encoding="utf-8",
    )
|
||||
|
||||
|
||||
def read_history(history_path: Path) -> List[EvaluationSnapshot]:
    """Load every snapshot from a YAML history file, in file order.

    An empty file (YAML ``null``) yields an empty list.
    """
    raw_text = history_path.read_text(encoding="utf-8")
    loaded = yaml.safe_load(raw_text)
    return [] if loaded is None else [
        EvaluationSnapshot.from_dict(item) for item in loaded
    ]
|
||||
|
||||
|
||||
def diff_snapshots(before: EvaluationSnapshot, after: EvaluationSnapshot) -> SnapshotDiff:
    """Compare two snapshots and report what differs.

    Entities are matched by slug, score dimensions by name within an entity,
    and collection metrics by name. Score changes are reported only for
    entities present in both snapshots. A dimension or metric that exists on
    one side only is reported with 0.0 standing in for the missing value.
    All result lists are sorted by slug / name.
    """
    # {slug: {dimension: value}} per side; a repeated slug keeps its last evaluation.
    old_scores: dict = {
        ev.entity_slug: {s.name: s.value for s in ev.scores}
        for ev in before.entity_evaluations
    }
    new_scores: dict = {
        ev.entity_slug: {s.name: s.value for s in ev.scores}
        for ev in after.entity_evaluations
    }
    old_slugs, new_slugs = set(old_scores), set(new_scores)

    score_changes: List[ScoreChange] = []
    for slug in sorted(old_slugs & new_slugs):
        old_dims, new_dims = old_scores[slug], new_scores[slug]
        for dim in sorted(old_dims.keys() | new_dims.keys()):
            was, now = old_dims.get(dim), new_dims.get(dim)
            if was == now:
                continue
            score_changes.append(ScoreChange(
                entity_slug=slug,
                dimension=dim,
                before=0.0 if was is None else was,
                after=0.0 if now is None else now,
            ))

    old_metrics = {m.name: m.value for m in before.collection_metrics}
    new_metrics = {m.name: m.value for m in after.collection_metrics}
    metric_changes: List[MetricChange] = []
    for metric_name in sorted(old_metrics.keys() | new_metrics.keys()):
        was, now = old_metrics.get(metric_name), new_metrics.get(metric_name)
        if was == now:
            continue
        metric_changes.append(MetricChange(
            name=metric_name,
            before=0.0 if was is None else was,
            after=0.0 if now is None else now,
        ))

    return SnapshotDiff(
        before_id=before.snapshot_id,
        after_id=after.snapshot_id,
        added_entities=sorted(new_slugs - old_slugs),
        removed_entities=sorted(old_slugs - new_slugs),
        score_changes=score_changes,
        metric_changes=metric_changes,
    )
|
||||
398
tests/unit/infospace/test_evaluation.py
Normal file
398
tests/unit/infospace/test_evaluation.py
Normal file
@@ -0,0 +1,398 @@
|
||||
"""Tests for markitect.infospace evaluation models and I/O."""
|
||||
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from markitect.infospace import (
|
||||
EntityEvaluation,
|
||||
EvaluationSnapshot,
|
||||
MetricChange,
|
||||
MetricValue,
|
||||
ScoreChange,
|
||||
ScoreEntry,
|
||||
SnapshotDiff,
|
||||
append_to_history,
|
||||
diff_snapshots,
|
||||
read_entity_evaluation,
|
||||
read_history,
|
||||
read_snapshot,
|
||||
write_entity_evaluation,
|
||||
write_snapshot,
|
||||
)
|
||||
|
||||
|
||||
# ── Helpers ──────────────────────────────────────────────────────────
|
||||
|
||||
_NOW = datetime(2026, 2, 19, 12, 0, 0)
|
||||
|
||||
|
||||
def _sample_scores() -> list:
    """Return three sample score entries; the last one carries no rationale."""
    precision = ScoreEntry("definition_precision", 4.5, rationale="Clear and specific.")
    grounding = ScoreEntry("source_grounding", 4.0, rationale="Well grounded.")
    relevance = ScoreEntry("domain_relevance", 4.5)
    return [precision, grounding, relevance]
|
||||
|
||||
|
||||
def _sample_evaluation(**overrides) -> EntityEvaluation:
    """Build a sample evaluation; keyword arguments replace the default fields."""
    kwargs = {
        "entity_slug": "division-of-labour",
        "evaluator": "openrouter/anthropic/claude-3.5-sonnet",
        "scores": _sample_scores(),
        "evaluated_at": _NOW,
        "notes": ["Strong entity with clear provenance"],
    }
    kwargs.update(overrides)
    return EntityEvaluation(**kwargs)
|
||||
|
||||
|
||||
def _sample_metric() -> MetricValue:
    """Return a sample collection metric with a concern tag and details."""
    return MetricValue(
        name="coverage_ratio",
        value=0.85,
        concern="C2",
        details={"checked": 85},
    )
|
||||
|
||||
|
||||
def _sample_snapshot(**overrides) -> EvaluationSnapshot:
    """Build a sample snapshot; keyword arguments replace the default fields."""
    kwargs = {
        "snapshot_id": "2026-02-19",
        "created_at": _NOW,
        "schema_name": "Economic Entity",
        "entity_count": 1,
        "entity_evaluations": [_sample_evaluation()],
        "collection_metrics": [_sample_metric()],
        "metadata": {"version": "1.0"},
    }
    kwargs.update(overrides)
    return EvaluationSnapshot(**kwargs)
|
||||
|
||||
|
||||
# ── Model tests ──────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestScoreEntry:
    """Serialisation behaviour of ScoreEntry."""

    def test_to_dict_from_dict_round_trip(self):
        """Every field survives to_dict() followed by from_dict()."""
        original = ScoreEntry("precision", 4.5, 5.0, "Good definition.")
        copy = ScoreEntry.from_dict(original.to_dict())
        assert copy.name == original.name
        assert copy.value == original.value
        assert copy.max_value == original.max_value
        assert copy.rationale == original.rationale

    def test_to_dict_omits_empty_rationale(self):
        """An empty rationale is not written to the dict."""
        payload = ScoreEntry("precision", 4.5).to_dict()
        assert "rationale" not in payload

    def test_from_dict_defaults(self):
        """Missing optional keys fall back to the dataclass defaults."""
        entry = ScoreEntry.from_dict({"name": "x", "value": 3.0})
        assert entry.max_value == 5.0
        assert entry.rationale == ""
|
||||
|
||||
|
||||
class TestEntityEvaluation:
    """Overall-score computation and serialisation of EntityEvaluation."""

    def test_overall_score_is_mean(self):
        """The overall score is the mean of the score values."""
        evaluation = _sample_evaluation()
        # (4.5 + 4.0 + 4.5) / 3 ≈ 4.333
        assert abs(evaluation.overall_score - 4.333333) < 0.001

    def test_overall_score_zero_scores(self):
        """With no scores the overall score is 0.0."""
        evaluation = _sample_evaluation(scores=[])
        assert evaluation.overall_score == 0.0

    def test_to_dict_from_dict_round_trip(self):
        """Fields survive to_dict() followed by from_dict()."""
        original = _sample_evaluation()
        copy = EntityEvaluation.from_dict(original.to_dict())
        assert copy.entity_slug == original.entity_slug
        assert copy.evaluator == original.evaluator
        assert len(copy.scores) == len(original.scores)
        assert copy.evaluated_at == original.evaluated_at
        assert copy.notes == original.notes

    def test_to_dict_includes_overall_score(self):
        """The serialised form carries the derived overall score."""
        payload = _sample_evaluation().to_dict()
        assert "overall_score" in payload
        assert abs(payload["overall_score"] - 4.3333) < 0.01
|
||||
|
||||
|
||||
class TestMetricValue:
    """Serialisation behaviour of MetricValue."""

    def test_to_dict_from_dict_round_trip(self):
        """Every field survives to_dict() followed by from_dict()."""
        original = _sample_metric()
        copy = MetricValue.from_dict(original.to_dict())
        assert copy.name == original.name
        assert copy.value == original.value
        assert copy.concern == original.concern
        assert copy.details == original.details

    def test_to_dict_omits_empty_concern(self):
        """Empty concern and details are not written to the dict."""
        payload = MetricValue("x", 1.0).to_dict()
        assert "concern" not in payload
        assert "details" not in payload
|
||||
|
||||
|
||||
class TestEvaluationSnapshot:
    """Serialisation behaviour of EvaluationSnapshot."""

    def test_to_dict_from_dict_round_trip(self):
        """Fields survive to_dict() followed by from_dict()."""
        original = _sample_snapshot()
        copy = EvaluationSnapshot.from_dict(original.to_dict())
        assert copy.snapshot_id == original.snapshot_id
        assert copy.created_at == original.created_at
        assert copy.schema_name == original.schema_name
        assert copy.entity_count == original.entity_count
        assert len(copy.entity_evaluations) == 1
        assert len(copy.collection_metrics) == 1
        assert copy.metadata == original.metadata

    def test_from_dict_empty_lists(self):
        """Optional collections default to empty when their keys are absent."""
        minimal = {
            "snapshot_id": "test",
            "created_at": _NOW.isoformat(),
            "schema_name": "Test",
            "entity_count": 0,
        }
        snapshot = EvaluationSnapshot.from_dict(minimal)
        assert snapshot.entity_evaluations == []
        assert snapshot.collection_metrics == []
        assert snapshot.metadata == {}
|
||||
|
||||
|
||||
# ── Per-entity file I/O ──────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestEntityEvaluationIO:
    """Writing and reading per-entity evaluation markdown files."""

    def test_write_creates_file(self, tmp_path):
        """Writing produces a file at the requested path."""
        target = tmp_path / "eval.md"
        write_entity_evaluation(_sample_evaluation(), target)
        assert target.exists()

    def test_file_has_yaml_frontmatter(self, tmp_path):
        """The file opens with a frontmatter fence and contains a closing one."""
        target = tmp_path / "eval.md"
        write_entity_evaluation(_sample_evaluation(), target)
        content = target.read_text()
        assert content.startswith("---\n")
        assert "\n---\n" in content

    def test_frontmatter_contains_expected_keys(self, tmp_path):
        """All expected frontmatter keys appear in the written file."""
        target = tmp_path / "eval.md"
        write_entity_evaluation(_sample_evaluation(), target)
        content = target.read_text()
        expected_keys = ["entity_slug", "evaluator", "evaluated_at", "overall_score", "scores"]
        for key in expected_keys:
            assert key in content

    def test_markdown_body_contains_rationales(self, tmp_path):
        """Rationales and per-dimension headings are written out."""
        target = tmp_path / "eval.md"
        write_entity_evaluation(_sample_evaluation(), target)
        content = target.read_text()
        assert "Clear and specific." in content
        assert "Well grounded." in content
        assert "## definition_precision" in content

    def test_read_back_matches_original(self, tmp_path):
        """Top-level fields survive a write/read cycle."""
        original = _sample_evaluation()
        target = tmp_path / "eval.md"
        write_entity_evaluation(original, target)
        loaded = read_entity_evaluation(target)
        assert loaded.entity_slug == original.entity_slug
        assert loaded.evaluator == original.evaluator
        assert loaded.evaluated_at == original.evaluated_at
        assert loaded.notes == original.notes
        assert len(loaded.scores) == len(original.scores)

    def test_round_trip_preserves_scores(self, tmp_path):
        """Score names, values and maxima survive a write/read cycle."""
        original = _sample_evaluation()
        target = tmp_path / "eval.md"
        write_entity_evaluation(original, target)
        loaded = read_entity_evaluation(target)
        for expected, actual in zip(original.scores, loaded.scores):
            assert actual.name == expected.name
            assert actual.value == expected.value
            assert actual.max_value == expected.max_value

    def test_round_trip_preserves_rationales(self, tmp_path):
        """Rationales survive a write/read cycle, including the empty one."""
        target = tmp_path / "eval.md"
        write_entity_evaluation(_sample_evaluation(), target)
        loaded = read_entity_evaluation(target)
        assert loaded.scores[0].rationale == "Clear and specific."
        assert loaded.scores[1].rationale == "Well grounded."
        # Third score has no rationale
        assert loaded.scores[2].rationale == ""

    def test_write_creates_parent_dirs(self, tmp_path):
        """Missing parent directories are created on write."""
        target = tmp_path / "deep" / "nested" / "eval.md"
        write_entity_evaluation(_sample_evaluation(), target)
        assert target.exists()
|
||||
|
||||
|
||||
# ── Snapshot I/O ─────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestSnapshotIO:
    """Writing and reading snapshot YAML files."""

    def test_write_creates_file(self, tmp_path):
        """Writing produces a file at the requested path."""
        target = tmp_path / "snapshot.yaml"
        write_snapshot(_sample_snapshot(), target)
        assert target.exists()

    def test_read_back_matches_original(self, tmp_path):
        """Top-level fields survive a write/read cycle."""
        original = _sample_snapshot()
        target = tmp_path / "snapshot.yaml"
        write_snapshot(original, target)
        loaded = read_snapshot(target)
        assert loaded.snapshot_id == original.snapshot_id
        assert loaded.created_at == original.created_at
        assert loaded.schema_name == original.schema_name
        assert loaded.entity_count == original.entity_count

    def test_round_trip_preserves_entity_evaluations(self, tmp_path):
        """Nested entity evaluations survive a write/read cycle."""
        target = tmp_path / "snapshot.yaml"
        write_snapshot(_sample_snapshot(), target)
        loaded = read_snapshot(target)
        assert len(loaded.entity_evaluations) == 1
        first = loaded.entity_evaluations[0]
        assert first.entity_slug == "division-of-labour"
        assert len(first.scores) == 3

    def test_round_trip_preserves_collection_metrics(self, tmp_path):
        """Collection metrics survive a write/read cycle."""
        target = tmp_path / "snapshot.yaml"
        write_snapshot(_sample_snapshot(), target)
        loaded = read_snapshot(target)
        assert len(loaded.collection_metrics) == 1
        metric = loaded.collection_metrics[0]
        assert metric.name == "coverage_ratio"
        assert metric.value == 0.85
        assert metric.concern == "C2"
|
||||
|
||||
|
||||
# ── History ──────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestHistory:
    """Appending to and reading back the snapshot history file."""

    def test_append_creates_new_file(self, tmp_path):
        """The first append creates the history file with one entry."""
        history_file = tmp_path / "history.yaml"
        append_to_history(_sample_snapshot(), history_file)
        assert history_file.exists()
        entries = read_history(history_file)
        assert len(entries) == 1

    def test_append_adds_to_existing(self, tmp_path):
        """A second append extends the existing history."""
        history_file = tmp_path / "history.yaml"
        first = _sample_snapshot(snapshot_id="snap-1")
        second = _sample_snapshot(snapshot_id="snap-2")
        append_to_history(first, history_file)
        append_to_history(second, history_file)
        entries = read_history(history_file)
        assert len(entries) == 2
        assert entries[0].snapshot_id == "snap-1"
        assert entries[1].snapshot_id == "snap-2"

    def test_multiple_appends_all_preserved(self, tmp_path):
        """Five consecutive appends are all kept, in order."""
        history_file = tmp_path / "history.yaml"
        for index in range(5):
            append_to_history(_sample_snapshot(snapshot_id=f"snap-{index}"), history_file)
        entries = read_history(history_file)
        assert len(entries) == 5
        assert [entry.snapshot_id for entry in entries] == [f"snap-{i}" for i in range(5)]

    def test_read_history_returns_list_in_order(self, tmp_path):
        """Entries come back in the order they were appended."""
        history_file = tmp_path / "history.yaml"
        first = _sample_snapshot(snapshot_id="a")
        second = _sample_snapshot(snapshot_id="b")
        append_to_history(first, history_file)
        append_to_history(second, history_file)
        entries = read_history(history_file)
        assert entries[0].snapshot_id == "a"
        assert entries[1].snapshot_id == "b"
|
||||
|
||||
|
||||
# ── Diffing ──────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestDiffSnapshots:
    """Behaviour of diff_snapshots() and SnapshotDiff.summary()."""

    def test_identical_snapshots_empty_diff(self):
        """Diffing a snapshot against itself reports nothing."""
        snapshot = _sample_snapshot()
        result = diff_snapshots(snapshot, snapshot)
        assert result.added_entities == []
        assert result.removed_entities == []
        assert result.score_changes == []
        assert result.metric_changes == []

    def test_added_entity(self):
        """An entity present only in the later snapshot is reported as added."""
        older = _sample_snapshot(entity_evaluations=[])
        newer = _sample_snapshot()
        result = diff_snapshots(older, newer)
        assert "division-of-labour" in result.added_entities
        assert result.removed_entities == []

    def test_removed_entity(self):
        """An entity present only in the earlier snapshot is reported as removed."""
        older = _sample_snapshot()
        newer = _sample_snapshot(entity_evaluations=[])
        result = diff_snapshots(older, newer)
        assert "division-of-labour" in result.removed_entities
        assert result.added_entities == []

    def test_changed_score(self):
        """A changed dimension value yields exactly one score change."""
        older = _sample_snapshot(
            entity_evaluations=[_sample_evaluation(scores=[ScoreEntry("precision", 4.0)])]
        )
        newer = _sample_snapshot(
            entity_evaluations=[_sample_evaluation(scores=[ScoreEntry("precision", 4.8)])]
        )
        result = diff_snapshots(older, newer)
        assert len(result.score_changes) == 1
        change = result.score_changes[0]
        assert change.entity_slug == "division-of-labour"
        assert change.dimension == "precision"
        assert change.before == 4.0
        assert change.after == 4.8

    def test_changed_metric(self):
        """A changed metric value yields exactly one metric change."""
        older = _sample_snapshot(
            collection_metrics=[MetricValue("coverage_ratio", 0.80)]
        )
        newer = _sample_snapshot(
            collection_metrics=[MetricValue("coverage_ratio", 0.90)]
        )
        result = diff_snapshots(older, newer)
        assert len(result.metric_changes) == 1
        change = result.metric_changes[0]
        assert change.name == "coverage_ratio"
        assert change.before == 0.80
        assert change.after == 0.90

    def test_summary_readable(self):
        """The summary mentions both snapshot ids and the changed names."""
        older = _sample_snapshot(
            snapshot_id="snap-1",
            entity_evaluations=[_sample_evaluation(scores=[ScoreEntry("precision", 4.0)])],
            collection_metrics=[MetricValue("coverage", 0.80)],
        )
        newer = _sample_snapshot(
            snapshot_id="snap-2",
            entity_evaluations=[_sample_evaluation(scores=[ScoreEntry("precision", 4.8)])],
            collection_metrics=[MetricValue("coverage", 0.90)],
        )
        report = diff_snapshots(older, newer).summary()
        assert "snap-1" in report
        assert "snap-2" in report
        assert "precision" in report
        assert "coverage" in report

    def test_summary_no_changes(self):
        """With nothing to report, the summary says so."""
        snapshot = _sample_snapshot()
        report = diff_snapshots(snapshot, snapshot).summary()
        assert "No changes" in report
|
||||
Reference in New Issue
Block a user