feat(infospace): add structured evaluation output with history and diffing (S1.5)
Add data models (ScoreEntry, EntityEvaluation, EvaluationSnapshot, SnapshotDiff) and I/O utilities for YAML frontmatter evaluation files, snapshot persistence, history append, and snapshot diffing. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
207
markitect/infospace/evaluation.py
Normal file
207
markitect/infospace/evaluation.py
Normal file
@@ -0,0 +1,207 @@
|
||||
"""
|
||||
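Data models for structured evaluation output.

Provides typed containers for per-entity LLM-evaluated scores and
collection-level metrics. The serialisable models (``ScoreEntry``,
``EntityEvaluation``, ``MetricValue``, ``EvaluationSnapshot``) support
``to_dict()``/``from_dict()`` round-tripping for YAML serialisation; the
diff records (``ScoreChange``, ``MetricChange``, ``SnapshotDiff``) do not
define those methods.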
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
|
||||
@dataclass
class ScoreEntry:
    """A single scored dimension (e.g. definition_precision: 4.5/5.0).

    Attributes:
        name: Identifier of the scored dimension.
        value: Score given for this dimension.
        max_value: Scale maximum the value is reported against. It is
            stored only; nothing here checks ``value`` against it.
        rationale: Optional free-text justification; empty when absent.
    """

    name: str
    value: float
    max_value: float = 5.0
    rationale: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict form; ``rationale`` is omitted when empty."""
        d: Dict[str, Any] = {
            "name": self.name,
            "value": self.value,
            "max_value": self.max_value,
        }
        if self.rationale:
            d["rationale"] = self.rationale
        return d

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ScoreEntry":
        """Build a ``ScoreEntry`` from a mapping such as ``to_dict()`` output.

        Args:
            data: Mapping with required ``name`` and ``value`` keys and
                optional ``max_value`` and ``rationale`` keys.

        Returns:
            The reconstructed entry, with ``value``/``max_value`` coerced
            to ``float``.

        Raises:
            KeyError: If ``name`` or ``value`` is missing.
        """
        max_value = data.get("max_value")
        rationale = data.get("rationale")
        return cls(
            name=data["name"],
            value=float(data["value"]),
            # A key that is present but null (e.g. an empty ``max_value:``
            # entry in YAML) is treated like a missing key instead of
            # failing in ``float(None)``.
            max_value=5.0 if max_value is None else float(max_value),
            rationale="" if rationale is None else rationale,
        )
|
||||
|
||||
|
||||
@dataclass
class EntityEvaluation:
    """Per-entity evaluation result.

    Attributes:
        entity_slug: Identifier of the evaluated entity.
        evaluator: Name of whatever produced the scores.
        scores: One ``ScoreEntry`` per scored dimension.
        evaluated_at: When the evaluation was made.
        notes: Optional free-text remarks.
    """

    entity_slug: str
    evaluator: str
    scores: List[ScoreEntry]
    evaluated_at: datetime
    notes: List[str] = field(default_factory=list)

    @property
    def overall_score(self) -> float:
        """Return the arithmetic mean of the raw score values.

        Values are averaged as-is (they are not normalised by each entry's
        ``max_value``). An evaluation without scores yields ``0.0``.
        """
        if not self.scores:
            return 0.0
        return sum(s.value for s in self.scores) / len(self.scores)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict form suitable for serialisation.

        ``evaluated_at`` is written as an ISO 8601 string and the derived
        ``overall_score`` is included, rounded to four decimal places.
        """
        return {
            "entity_slug": self.entity_slug,
            "evaluator": self.evaluator,
            "evaluated_at": self.evaluated_at.isoformat(),
            "overall_score": round(self.overall_score, 4),
            "scores": [s.to_dict() for s in self.scores],
            # Copy so that edits to the returned dict cannot reach back
            # into this instance.
            "notes": list(self.notes),
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "EntityEvaluation":
        """Build an ``EntityEvaluation`` from a mapping.

        Args:
            data: Mapping with ``entity_slug``, ``evaluator``, ``scores``
                and ``evaluated_at`` keys, plus an optional ``notes`` key.
                ``evaluated_at`` may be an ISO 8601 string or an existing
                ``datetime``. Any ``overall_score`` key is ignored because
                the value is recomputed from ``scores``.

        Returns:
            The reconstructed evaluation.

        Raises:
            KeyError: If a required key is missing.
            ValueError: If ``evaluated_at`` is a string that is not valid
                ISO 8601.
        """
        evaluated_at = data["evaluated_at"]
        # YAML loaders may already have resolved an unquoted timestamp to a
        # datetime object; ``fromisoformat`` only accepts strings.
        if not isinstance(evaluated_at, datetime):
            evaluated_at = datetime.fromisoformat(evaluated_at)
        return cls(
            entity_slug=data["entity_slug"],
            evaluator=data["evaluator"],
            scores=[ScoreEntry.from_dict(s) for s in data["scores"]],
            evaluated_at=evaluated_at,
            # Copy (and tolerate an explicit null) so the instance does not
            # share its list with the caller's mapping.
            notes=list(data.get("notes") or []),
        )
|
||||
|
||||
|
||||
@dataclass
class MetricValue:
    """One named metric computed at collection level.

    Attributes:
        name: Identifier of the metric.
        value: Numeric value of the metric.
        concern: Optional free-text concern; empty when absent.
        details: Optional extra data attached to the metric.
    """

    name: str
    value: float
    concern: str = ""
    details: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Serialise to a dict, leaving out ``concern``/``details`` when falsy."""
        payload: Dict[str, Any] = {"name": self.name, "value": self.value}
        optional_items = (("concern", self.concern), ("details", self.details))
        payload.update({key: item for key, item in optional_items if item})
        return payload

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "MetricValue":
        """Rebuild a metric from a mapping with ``name`` and ``value`` keys.

        ``concern`` and ``details`` fall back to ``""`` and ``{}`` when the
        keys are absent; ``value`` is coerced to ``float``.
        """
        metric_name = data["name"]
        metric_value = float(data["value"])
        return cls(
            metric_name,
            metric_value,
            data.get("concern", ""),
            data.get("details", {}),
        )
|
||||
|
||||
|
||||
@dataclass
class EvaluationSnapshot:
    """Timestamped snapshot of entity evaluations and collection metrics.

    Attributes:
        snapshot_id: Identifier of this snapshot.
        created_at: When the snapshot was created.
        schema_name: Name of the schema the snapshot refers to.
        entity_count: Entity count recorded for the snapshot. It is stored
            as given, not derived from ``entity_evaluations``.
        entity_evaluations: Per-entity evaluation results.
        collection_metrics: Collection-level metrics.
        metadata: Free-form extra data.
    """

    snapshot_id: str
    created_at: datetime
    schema_name: str
    entity_count: int
    entity_evaluations: List[EntityEvaluation] = field(default_factory=list)
    collection_metrics: List[MetricValue] = field(default_factory=list)
    metadata: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict form suitable for serialisation.

        ``created_at`` is written as an ISO 8601 string and nested models
        are converted through their own ``to_dict()``.
        """
        return {
            "snapshot_id": self.snapshot_id,
            "created_at": self.created_at.isoformat(),
            "schema_name": self.schema_name,
            "entity_count": self.entity_count,
            "entity_evaluations": [e.to_dict() for e in self.entity_evaluations],
            "collection_metrics": [m.to_dict() for m in self.collection_metrics],
            # Shallow copy so that adding or removing keys on the returned
            # dict does not change this instance.
            "metadata": dict(self.metadata),
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "EvaluationSnapshot":
        """Build an ``EvaluationSnapshot`` from a mapping.

        Args:
            data: Mapping with ``snapshot_id``, ``created_at``,
                ``schema_name`` and ``entity_count`` keys, plus optional
                ``entity_evaluations``, ``collection_metrics`` and
                ``metadata`` keys. ``created_at`` may be an ISO 8601 string
                or an existing ``datetime``.

        Returns:
            The reconstructed snapshot.

        Raises:
            KeyError: If a required key is missing.
            ValueError: If ``created_at`` is a string that is not valid
                ISO 8601.
        """
        created_at = data["created_at"]
        # YAML loaders may already have resolved an unquoted timestamp to a
        # datetime object; ``fromisoformat`` only accepts strings.
        if not isinstance(created_at, datetime):
            created_at = datetime.fromisoformat(created_at)
        # ``or`` fallbacks make an explicit null behave like a missing key.
        return cls(
            snapshot_id=data["snapshot_id"],
            created_at=created_at,
            schema_name=data["schema_name"],
            entity_count=data["entity_count"],
            entity_evaluations=[
                EntityEvaluation.from_dict(e)
                for e in data.get("entity_evaluations") or []
            ],
            collection_metrics=[
                MetricValue.from_dict(m)
                for m in data.get("collection_metrics") or []
            ],
            # Copy so the instance does not share its dict with the caller.
            metadata=dict(data.get("metadata") or {}),
        )
|
||||
|
||||
|
||||
@dataclass
class ScoreChange:
    """Before/after values of one score dimension of one entity.

    Attributes:
        entity_slug: Identifier of the entity the score belongs to.
        dimension: Name of the score dimension.
        before: Value in the earlier snapshot.
        after: Value in the later snapshot.
    """

    entity_slug: str
    dimension: str
    before: float
    after: float

    @property
    def delta(self) -> float:
        """Signed change, i.e. ``after`` minus ``before``."""
        old, new = self.before, self.after
        return new - old
|
||||
|
||||
|
||||
@dataclass
class MetricChange:
    """Before/after values of one collection metric.

    Attributes:
        name: Name of the metric.
        before: Value in the earlier snapshot.
        after: Value in the later snapshot.
    """

    name: str
    before: float
    after: float

    @property
    def delta(self) -> float:
        """Signed change, i.e. ``after`` minus ``before``."""
        old, new = self.before, self.after
        return new - old
|
||||
|
||||
|
||||
@dataclass
class SnapshotDiff:
    """Differences found between two evaluation snapshots.

    Attributes:
        before_id: Identifier of the earlier snapshot.
        after_id: Identifier of the later snapshot.
        added_entities: Slugs listed as added.
        removed_entities: Slugs listed as removed.
        score_changes: Per-entity, per-dimension score deltas.
        metric_changes: Collection-metric deltas.
    """

    before_id: str
    after_id: str
    added_entities: List[str] = field(default_factory=list)
    removed_entities: List[str] = field(default_factory=list)
    score_changes: List[ScoreChange] = field(default_factory=list)
    metric_changes: List[MetricChange] = field(default_factory=list)

    def summary(self) -> str:
        """Render the diff as newline-joined, human-readable text.

        The first line names both snapshots. Each non-empty category then
        adds its own section, and a single "No changes" line is emitted
        when every category is empty.
        """
        report = [f"Diff: {self.before_id} -> {self.after_id}"]

        has_changes = bool(
            self.added_entities
            or self.removed_entities
            or self.score_changes
            or self.metric_changes
        )
        if not has_changes:
            report.append(" No changes")
            return "\n".join(report)

        if self.added_entities:
            added = ", ".join(self.added_entities)
            report.append(f" Added entities: {added}")

        if self.removed_entities:
            removed = ", ".join(self.removed_entities)
            report.append(f" Removed entities: {removed}")

        if self.score_changes:
            report.append(f" Score changes: {len(self.score_changes)}")
            report.extend(
                f" {change.entity_slug}/{change.dimension}: "
                f"{change.before} -> {change.after} ({change.delta:+.2f})"
                for change in self.score_changes
            )

        if self.metric_changes:
            report.append(f" Metric changes: {len(self.metric_changes)}")
            report.extend(
                f" {change.name}: {change.before} -> {change.after} ({change.delta:+.2f})"
                for change in self.metric_changes
            )

        return "\n".join(report)
|
||||
Reference in New Issue
Block a user