Add data models (ScoreEntry, EntityEvaluation, EvaluationSnapshot, SnapshotDiff) and I/O utilities for YAML frontmatter evaluation files, snapshot persistence, history append, and snapshot diffing. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
208 lines
6.3 KiB
Python
208 lines
6.3 KiB
Python
"""
|
|
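Data models for structured evaluation output.

Provides typed containers for per-entity LLM-evaluated scores and
collection-level metrics, plus delta records for comparing two snapshots.
The serialisable models (``ScoreEntry``, ``EntityEvaluation``,
``MetricValue``, ``EvaluationSnapshot``) support ``to_dict()``/``from_dict()``
round-tripping for YAML serialisation.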
|
|
"""
|
|
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime
|
|
from typing import Any, Dict, List, Optional
|
|
|
|
|
|
@dataclass
class ScoreEntry:
    """One scored dimension of an evaluation (e.g. definition_precision: 4.5/5.0).

    Instances convert to and from plain mappings via ``to_dict()`` and
    ``from_dict()``.
    """

    name: str  # identifier of the scored dimension
    value: float  # score awarded for this dimension
    max_value: float = 5.0  # denominator of the score, as in "4.5/5.0"
    rationale: str = ""  # optional free-text justification

    def to_dict(self) -> Dict[str, Any]:
        """Return the entry as a plain dict.

        ``rationale`` is only emitted when it is non-empty, keeping the
        serialised form compact.
        """
        payload: Dict[str, Any] = dict(
            name=self.name,
            value=self.value,
            max_value=self.max_value,
        )
        if not self.rationale:
            return payload
        payload["rationale"] = self.rationale
        return payload

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ScoreEntry":
        """Build an entry from a mapping.

        ``name`` and ``value`` are required (``KeyError`` when absent).
        ``max_value`` falls back to 5.0 and ``rationale`` to ``""``.
        Both numeric fields are coerced with ``float()``.
        """
        label = data["name"]
        score = float(data["value"])
        ceiling = float(data.get("max_value", 5.0))
        justification = data.get("rationale", "")
        return cls(
            name=label,
            value=score,
            max_value=ceiling,
            rationale=justification,
        )
|
|
|
|
|
|
@dataclass
class EntityEvaluation:
    """Per-entity evaluation result.

    Holds the individual dimension scores an evaluator assigned to one
    entity, together with the evaluation timestamp and optional notes.
    """

    entity_slug: str  # identifier of the evaluated entity
    evaluator: str  # name of whatever produced the scores
    scores: List[ScoreEntry]  # one entry per scored dimension
    evaluated_at: datetime  # when the evaluation was performed
    notes: List[str] = field(default_factory=list)  # optional free-form remarks

    @property
    def overall_score(self) -> float:
        """Arithmetic mean of the raw ``value`` of every score.

        Returns 0.0 when there are no scores. Values are averaged as-is;
        ``max_value`` is not taken into account.
        """
        if not self.scores:
            return 0.0
        return sum(s.value for s in self.scores) / len(self.scores)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict form suitable for serialisation.

        ``evaluated_at`` is emitted as an ISO-8601 string and the derived
        ``overall_score`` is included, rounded to four decimals.
        """
        return {
            "entity_slug": self.entity_slug,
            "evaluator": self.evaluator,
            "evaluated_at": self.evaluated_at.isoformat(),
            "overall_score": round(self.overall_score, 4),
            "scores": [s.to_dict() for s in self.scores],
            # Copy so callers editing the dict cannot mutate this instance.
            "notes": list(self.notes),
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "EntityEvaluation":
        """Build an evaluation from a mapping.

        ``entity_slug``, ``evaluator``, ``scores`` and ``evaluated_at`` are
        required (``KeyError`` when absent). ``evaluated_at`` may be an
        ISO-8601 string or an already-parsed ``datetime``. A missing or
        null ``notes`` becomes an empty list. Any ``overall_score`` key is
        ignored because the value is derived from ``scores``.
        """
        entity_slug = data["entity_slug"]
        evaluator = data["evaluator"]
        scores = [ScoreEntry.from_dict(s) for s in data["scores"]]
        # YAML loaders such as PyYAML resolve unquoted timestamps to
        # datetime objects, which fromisoformat() rejects with TypeError.
        raw_timestamp = data["evaluated_at"]
        if isinstance(raw_timestamp, datetime):
            evaluated_at = raw_timestamp
        else:
            evaluated_at = datetime.fromisoformat(raw_timestamp)
        return cls(
            entity_slug=entity_slug,
            evaluator=evaluator,
            scores=scores,
            evaluated_at=evaluated_at,
            # Copy so the instance never shares a list with the input
            # mapping; ``or []`` also covers an explicit null.
            notes=list(data.get("notes") or []),
        )
|
|
|
|
|
|
@dataclass
class MetricValue:
    """A single collection-level metric.

    Instances convert to and from plain mappings via ``to_dict()`` and
    ``from_dict()``.
    """

    name: str  # metric identifier
    value: float  # measured value
    concern: str = ""  # optional free-text concern attached to the metric
    details: Dict[str, Any] = field(default_factory=dict)  # optional extra data

    def to_dict(self) -> Dict[str, Any]:
        """Return the metric as a plain dict.

        ``concern`` and ``details`` are only emitted when non-empty.
        """
        d: Dict[str, Any] = {"name": self.name, "value": self.value}
        if self.concern:
            d["concern"] = self.concern
        if self.details:
            # Shallow copy so callers editing the dict cannot mutate
            # this instance's ``details``.
            d["details"] = dict(self.details)
        return d

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "MetricValue":
        """Build a metric from a mapping.

        ``name`` and ``value`` are required (``KeyError`` when absent);
        ``value`` is coerced with ``float()``. A missing or null
        ``concern`` becomes ``""`` and a missing or null ``details``
        becomes ``{}``.
        """
        return cls(
            name=data["name"],
            value=float(data["value"]),
            # ``or`` also covers an explicit null (an empty YAML value).
            concern=data.get("concern") or "",
            # Copy so the instance never shares a dict with the input.
            details=dict(data.get("details") or {}),
        )
|
|
|
|
|
|
@dataclass
class EvaluationSnapshot:
    """Timestamped snapshot of entity evaluations and collection metrics.

    Instances convert to and from plain mappings via ``to_dict()`` and
    ``from_dict()``.
    """

    snapshot_id: str  # identifier of this snapshot
    created_at: datetime  # when the snapshot was taken
    schema_name: str  # name of the schema the snapshot refers to
    # Stored as given; not checked against len(entity_evaluations).
    entity_count: int
    entity_evaluations: List[EntityEvaluation] = field(default_factory=list)
    collection_metrics: List[MetricValue] = field(default_factory=list)
    metadata: Dict[str, Any] = field(default_factory=dict)  # free-form extras

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict form suitable for serialisation.

        ``created_at`` is emitted as an ISO-8601 string; nested evaluations
        and metrics are serialised through their own ``to_dict()``.
        """
        return {
            "snapshot_id": self.snapshot_id,
            "created_at": self.created_at.isoformat(),
            "schema_name": self.schema_name,
            "entity_count": self.entity_count,
            "entity_evaluations": [e.to_dict() for e in self.entity_evaluations],
            "collection_metrics": [m.to_dict() for m in self.collection_metrics],
            # Shallow copy so callers editing the dict cannot mutate
            # this instance's ``metadata``.
            "metadata": dict(self.metadata),
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "EvaluationSnapshot":
        """Build a snapshot from a mapping.

        ``snapshot_id``, ``created_at``, ``schema_name`` and
        ``entity_count`` are required (``KeyError`` when absent).
        ``created_at`` may be an ISO-8601 string or an already-parsed
        ``datetime``. Missing or null ``entity_evaluations``,
        ``collection_metrics`` and ``metadata`` become empty containers.
        """
        snapshot_id = data["snapshot_id"]
        # YAML loaders such as PyYAML resolve unquoted timestamps to
        # datetime objects, which fromisoformat() rejects with TypeError.
        raw_created = data["created_at"]
        if isinstance(raw_created, datetime):
            created_at = raw_created
        else:
            created_at = datetime.fromisoformat(raw_created)
        return cls(
            snapshot_id=snapshot_id,
            created_at=created_at,
            schema_name=data["schema_name"],
            entity_count=data["entity_count"],
            # ``or []`` covers both a missing key and an explicit null.
            entity_evaluations=[
                EntityEvaluation.from_dict(e)
                for e in data.get("entity_evaluations") or []
            ],
            collection_metrics=[
                MetricValue.from_dict(m)
                for m in data.get("collection_metrics") or []
            ],
            # Copy so the instance never shares a dict with the input.
            metadata=dict(data.get("metadata") or {}),
        )
|
|
|
|
|
|
@dataclass
class ScoreChange:
    """Before/after values of one score dimension of one entity across two snapshots."""

    entity_slug: str  # entity whose score changed
    dimension: str  # name of the score dimension
    before: float  # value in the earlier snapshot
    after: float  # value in the later snapshot

    @property
    def delta(self) -> float:
        """Signed change: ``after`` minus ``before``."""
        old, new = self.before, self.after
        return new - old
|
|
|
|
|
|
@dataclass
class MetricChange:
    """Before/after values of one collection metric across two snapshots."""

    name: str  # metric identifier
    before: float  # value in the earlier snapshot
    after: float  # value in the later snapshot

    @property
    def delta(self) -> float:
        """Signed change: ``after`` minus ``before``."""
        old, new = self.before, self.after
        return new - old
|
|
|
|
|
|
@dataclass
class SnapshotDiff:
    """Differences between two evaluation snapshots.

    Records which entities were added or removed and which scores and
    collection metrics changed between ``before_id`` and ``after_id``.
    """

    before_id: str  # id of the earlier snapshot
    after_id: str  # id of the later snapshot
    added_entities: List[str] = field(default_factory=list)
    removed_entities: List[str] = field(default_factory=list)
    score_changes: List[ScoreChange] = field(default_factory=list)
    metric_changes: List[MetricChange] = field(default_factory=list)

    def summary(self) -> str:
        """Render the diff as a multi-line, human-readable string.

        The first line names both snapshots. One section follows for each
        non-empty category, or a single "No changes" line when every
        category is empty.
        """
        report = [f"Diff: {self.before_id} -> {self.after_id}"]

        if self.added_entities:
            added = ", ".join(self.added_entities)
            report.append(f" Added entities: {added}")

        if self.removed_entities:
            removed = ", ".join(self.removed_entities)
            report.append(f" Removed entities: {removed}")

        if self.score_changes:
            report.append(f" Score changes: {len(self.score_changes)}")
            report.extend(
                f" {change.entity_slug}/{change.dimension}: "
                f"{change.before} -> {change.after} ({change.delta:+.2f})"
                for change in self.score_changes
            )

        if self.metric_changes:
            report.append(f" Metric changes: {len(self.metric_changes)}")
            report.extend(
                f" {change.name}: {change.before} -> {change.after} ({change.delta:+.2f})"
                for change in self.metric_changes
            )

        has_changes = (
            self.added_entities
            or self.removed_entities
            or self.score_changes
            or self.metric_changes
        )
        if not has_changes:
            report.append(" No changes")

        return "\n".join(report)
|