Files
markitect-main/markitect/infospace/evaluation.py
tegwick f8c9ab33f0 feat(infospace): add structured evaluation output with history and diffing (S1.5)
Add data models (ScoreEntry, EntityEvaluation, EvaluationSnapshot,
SnapshotDiff) and I/O utilities for YAML frontmatter evaluation files,
snapshot persistence, history append, and snapshot diffing.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-19 01:35:22 +01:00

208 lines
6.3 KiB
Python

"""
Data models for structured evaluation output.
Provides typed containers for per-entity LLM-evaluated scores and
collection-level metrics. All models support ``to_dict()``/``from_dict()``
round-tripping for YAML serialisation.
"""
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, Dict, List, Optional
@dataclass
class ScoreEntry:
    """A single scored dimension (e.g. definition_precision: 4.5/5.0).

    Attributes:
        name: Name of the scored dimension.
        value: Score given for the dimension.
        max_value: Top of the scale the score is expressed on (5.0 unless
            stated otherwise). ``value`` is not validated against it.
        rationale: Optional explanation for the score; empty when absent.
    """

    name: str
    value: float
    max_value: float = 5.0
    rationale: str = ""

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict form; ``rationale`` is omitted when empty."""
        d: Dict[str, Any] = {
            "name": self.name,
            "value": self.value,
            "max_value": self.max_value,
        }
        if self.rationale:
            d["rationale"] = self.rationale
        return d

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ScoreEntry":
        """Build an entry from a mapping produced by :meth:`to_dict`.

        ``name`` and ``value`` are required (``KeyError`` if missing).
        ``max_value`` and ``rationale`` are optional; a key that is present
        but null (as happens with an empty YAML value) is treated the same
        as a missing key instead of leaking ``None`` into a typed field or
        crashing in ``float(None)``.
        """
        max_value = data.get("max_value")
        return cls(
            name=data["name"],
            value=float(data["value"]),
            max_value=5.0 if max_value is None else float(max_value),
            rationale=data.get("rationale") or "",
        )
@dataclass
class EntityEvaluation:
    """Per-entity evaluation result.

    Attributes:
        entity_slug: Slug of the evaluated entity.
        evaluator: Name of the evaluator that produced the scores.
        scores: Scored dimensions for this entity.
        evaluated_at: Timestamp of the evaluation.
        notes: Free-form notes attached to the evaluation.
    """

    entity_slug: str
    evaluator: str
    scores: List[ScoreEntry]
    evaluated_at: datetime
    notes: List[str] = field(default_factory=list)

    @property
    def overall_score(self) -> float:
        """Unweighted mean of the raw score values (0.0 when there are none).

        Values are averaged as-is; they are not normalised by each entry's
        ``max_value``.
        """
        if not self.scores:
            return 0.0
        return sum(s.value for s in self.scores) / len(self.scores)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict form suitable for serialisation.

        ``overall_score`` is included (rounded to 4 decimals) for readers of
        the serialised file; :meth:`from_dict` ignores it and recomputes.
        """
        return {
            "entity_slug": self.entity_slug,
            "evaluator": self.evaluator,
            "evaluated_at": self.evaluated_at.isoformat(),
            "overall_score": round(self.overall_score, 4),
            "scores": [s.to_dict() for s in self.scores],
            # Copy so that edits to the returned dict cannot reach back
            # into this instance.
            "notes": list(self.notes),
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "EntityEvaluation":
        """Build an evaluation from a mapping produced by :meth:`to_dict`.

        ``evaluated_at`` may be an ISO-8601 string or an already-parsed
        ``datetime`` (YAML loaders convert unquoted timestamps themselves).
        A missing or null ``notes`` becomes an empty list, and the list is
        copied so the instance does not share state with ``data``.

        Raises:
            KeyError: if a required key is missing.
            ValueError: if ``evaluated_at`` is a string that is not ISO-8601.
        """
        stamp = data["evaluated_at"]
        if not isinstance(stamp, datetime):
            stamp = datetime.fromisoformat(stamp)
        return cls(
            entity_slug=data["entity_slug"],
            evaluator=data["evaluator"],
            scores=[ScoreEntry.from_dict(s) for s in data["scores"]],
            evaluated_at=stamp,
            notes=list(data.get("notes") or []),
        )
@dataclass
class MetricValue:
    """A single collection-level metric.

    Attributes:
        name: Name of the metric.
        value: Numeric value of the metric.
        concern: Optional text attached to the metric; empty when absent.
        details: Optional extra key/value data; empty when absent.
    """

    name: str
    value: float
    concern: str = ""
    details: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict form; empty ``concern``/``details`` are omitted."""
        d: Dict[str, Any] = {"name": self.name, "value": self.value}
        if self.concern:
            d["concern"] = self.concern
        if self.details:
            # Shallow copy so edits to the returned dict do not mutate
            # this instance.
            d["details"] = dict(self.details)
        return d

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "MetricValue":
        """Build a metric from a mapping produced by :meth:`to_dict`.

        ``name`` and ``value`` are required (``KeyError`` if missing).
        Missing or null ``concern``/``details`` fall back to their empty
        defaults, and ``details`` is shallow-copied so the instance does not
        share a dict with ``data``.
        """
        return cls(
            name=data["name"],
            value=float(data["value"]),
            concern=data.get("concern") or "",
            details=dict(data.get("details") or {}),
        )
@dataclass
class EvaluationSnapshot:
    """Timestamped snapshot of entity evaluations and collection metrics.

    Attributes:
        snapshot_id: Identifier of this snapshot.
        created_at: Timestamp of the snapshot.
        schema_name: Name of the schema the snapshot refers to.
        entity_count: Entity count recorded with the snapshot. Stored as
            given; it is not checked against ``len(entity_evaluations)``.
        entity_evaluations: Per-entity evaluation results.
        collection_metrics: Collection-level metrics.
        metadata: Arbitrary extra key/value data.
    """

    snapshot_id: str
    created_at: datetime
    schema_name: str
    entity_count: int
    entity_evaluations: List[EntityEvaluation] = field(default_factory=list)
    collection_metrics: List[MetricValue] = field(default_factory=list)
    metadata: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict form suitable for serialisation."""
        return {
            "snapshot_id": self.snapshot_id,
            "created_at": self.created_at.isoformat(),
            "schema_name": self.schema_name,
            "entity_count": self.entity_count,
            "entity_evaluations": [e.to_dict() for e in self.entity_evaluations],
            "collection_metrics": [m.to_dict() for m in self.collection_metrics],
            # Shallow copy so edits to the returned dict do not mutate
            # this instance.
            "metadata": dict(self.metadata),
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "EvaluationSnapshot":
        """Build a snapshot from a mapping produced by :meth:`to_dict`.

        ``created_at`` may be an ISO-8601 string or an already-parsed
        ``datetime`` (YAML loaders convert unquoted timestamps themselves).
        The list and metadata keys are optional; a key that is present but
        null is treated like a missing key. ``metadata`` is shallow-copied
        so the instance does not share a dict with ``data``.

        Raises:
            KeyError: if a required key is missing.
            ValueError: if ``created_at`` is a string that is not ISO-8601.
        """
        stamp = data["created_at"]
        if not isinstance(stamp, datetime):
            stamp = datetime.fromisoformat(stamp)
        return cls(
            snapshot_id=data["snapshot_id"],
            created_at=stamp,
            schema_name=data["schema_name"],
            entity_count=data["entity_count"],
            entity_evaluations=[
                EntityEvaluation.from_dict(e)
                for e in data.get("entity_evaluations") or []
            ],
            collection_metrics=[
                MetricValue.from_dict(m)
                for m in data.get("collection_metrics") or []
            ],
            metadata=dict(data.get("metadata") or {}),
        )
@dataclass
class ScoreChange:
    """Before/after values of one score dimension of one entity.

    Attributes:
        entity_slug: Slug of the entity the score belongs to.
        dimension: Name of the score dimension.
        before: Value in the earlier snapshot.
        after: Value in the later snapshot.
    """

    entity_slug: str
    dimension: str
    before: float
    after: float

    @property
    def delta(self) -> float:
        """Signed change: positive when the value went up."""
        previous, current = self.before, self.after
        return current - previous
@dataclass
class MetricChange:
    """Before/after values of one collection metric.

    Attributes:
        name: Name of the metric.
        before: Value in the earlier snapshot.
        after: Value in the later snapshot.
    """

    name: str
    before: float
    after: float

    @property
    def delta(self) -> float:
        """Signed change: positive when the value went up."""
        previous, current = self.before, self.after
        return current - previous
@dataclass
class SnapshotDiff:
    """Diff between two evaluation snapshots.

    Attributes:
        before_id: Identifier of the earlier snapshot.
        after_id: Identifier of the later snapshot.
        added_entities: Entity slugs listed as added.
        removed_entities: Entity slugs listed as removed.
        score_changes: Per-entity, per-dimension score deltas.
        metric_changes: Collection-metric deltas.
    """

    before_id: str
    after_id: str
    added_entities: List[str] = field(default_factory=list)
    removed_entities: List[str] = field(default_factory=list)
    score_changes: List[ScoreChange] = field(default_factory=list)
    metric_changes: List[MetricChange] = field(default_factory=list)

    def summary(self) -> str:
        """Render the diff as a newline-joined, human-readable report.

        The first line names both snapshots. One section follows for each
        non-empty category, or a single "No changes" line when every
        category is empty.
        """
        report = [f"Diff: {self.before_id} -> {self.after_id}"]

        if self.added_entities:
            report.append(f" Added entities: {', '.join(self.added_entities)}")

        if self.removed_entities:
            report.append(f" Removed entities: {', '.join(self.removed_entities)}")

        if self.score_changes:
            report.append(f" Score changes: {len(self.score_changes)}")
            report.extend(
                f" {change.entity_slug}/{change.dimension}: "
                f"{change.before} -> {change.after} ({change.delta:+.2f})"
                for change in self.score_changes
            )

        if self.metric_changes:
            report.append(f" Metric changes: {len(self.metric_changes)}")
            report.extend(
                f" {change.name}: {change.before} -> {change.after} ({change.delta:+.2f})"
                for change in self.metric_changes
            )

        # Every non-empty category appended at least one line above, so a
        # report that still holds only the header means nothing changed.
        if len(report) == 1:
            report.append(" No changes")

        return "\n".join(report)