eval history and metrics

This commit is contained in:
2026-05-14 15:35:04 +02:00
parent d0c1f82863
commit 7f54dec585
9 changed files with 870 additions and 16 deletions

View File

@@ -1,5 +1,22 @@
from .errors import InfospaceError
from .evaluation import EntityEvaluation, EvaluationSnapshot, MetricValue, ScoreEntry
from .evaluation_io import (
append_to_history,
read_entity_evaluation,
read_history,
read_snapshot,
write_entity_evaluation,
write_snapshot,
)
from .history import (
find_snapshot,
get_history,
get_latest_snapshot,
metric_trend,
read_metrics_file,
record_check_results,
write_metrics_file,
)
from .lifecycle import add_artifact, create_infospace, load_infospace
from .models import (
DisciplineBinding,
@@ -26,8 +43,21 @@ __all__ = [
"TopicConfig",
"ViabilityThreshold",
"add_artifact",
"append_to_history",
"create_infospace",
"find_snapshot",
"get_history",
"get_latest_snapshot",
"list_entities",
"list_relations",
"load_infospace",
"metric_trend",
"read_entity_evaluation",
"read_history",
"read_metrics_file",
"read_snapshot",
"record_check_results",
"write_entity_evaluation",
"write_metrics_file",
"write_snapshot",
]

View File

@@ -5,7 +5,9 @@ import json
import sys
from pathlib import Path
from .checks import run_collection_checks
from .errors import InfospaceError
from .history import find_snapshot, get_history, metric_trend, record_check_results
from .lifecycle import add_artifact, create_infospace, load_infospace
from .markdown_adapter import validate_infospace_artifacts
from .semantics import list_entities, list_relations
@@ -42,6 +44,24 @@ def build_parser() -> argparse.ArgumentParser:
relations = sub.add_parser("relations", help="List parsed relation artifacts")
relations.add_argument("root")
history = sub.add_parser("history", help="List evaluation snapshot history")
history.add_argument("root")
history.add_argument("--metric", default="")
history_diff = sub.add_parser(
"history-diff",
help="Diff two evaluation snapshots by snapshot ID or date",
)
history_diff.add_argument("root")
history_diff.add_argument("before")
history_diff.add_argument("after")
metrics = sub.add_parser(
"metrics",
help="Run collection checks and persist metrics/history",
)
metrics.add_argument("root")
return parser
@@ -96,6 +116,40 @@ def main(argv: list[str] | None = None) -> int:
]
}
)
elif args.command == "history":
history = get_history(Path(args.root))
if args.metric:
_write_json(
{
"metric": args.metric,
"trend": metric_trend(history, args.metric),
}
)
else:
_write_json({"history": [item.to_dict() for item in history]})
elif args.command == "history-diff":
history = get_history(Path(args.root))
before = find_snapshot(history, args.before)
after = find_snapshot(history, args.after)
if before is None or after is None:
missing = []
if before is None:
missing.append(args.before)
if after is None:
missing.append(args.after)
raise InfospaceError(
"missing_snapshot",
"Could not resolve requested snapshot reference",
{"missing_refs": missing},
)
_write_json({"diff": before.diff(after).to_dict()})
elif args.command == "metrics":
infospace = load_infospace(Path(args.root))
result = record_check_results(
infospace.root,
run_collection_checks(infospace.artifacts),
)
_write_json(result.to_dict())
else:
parser.error(f"Unhandled command: {args.command}")
except InfospaceError as exc:

View File

@@ -40,6 +40,11 @@ class EntityEvaluation:
evaluated_at: datetime
notes: list[str] = field(default_factory=list)
@property
def entity_slug(self) -> str:
    """Expose ``artifact_id`` under its legacy, entity-oriented name."""
    legacy_name = self.artifact_id
    return legacy_name
@property
def overall_score(self) -> float:
if not self.scores:
@@ -102,6 +107,16 @@ class EvaluationSnapshot:
collection_metrics: list[MetricValue] = field(default_factory=list)
metadata: dict[str, Any] = field(default_factory=dict)
@property
def entity_count(self) -> int:
    """Expose ``artifact_count`` under the name old history readers expect."""
    legacy_count = self.artifact_count
    return legacy_count
@property
def entity_evaluations(self) -> list[EntityEvaluation]:
    """Expose ``artifact_evaluations`` under the name old readers expect.

    The same list object is returned; no copy is made.
    """
    legacy_view = self.artifact_evaluations
    return legacy_view
def to_dict(self) -> dict[str, Any]:
return {
"snapshot_id": self.snapshot_id,
@@ -122,11 +137,14 @@ class EvaluationSnapshot:
return cls(
snapshot_id=str(data["snapshot_id"]),
created_at=datetime.fromisoformat(str(data["created_at"])),
schema_name=str(data["schema_name"]),
artifact_count=int(data["artifact_count"]),
schema_name=str(data.get("schema_name") or "default"),
artifact_count=int(data.get("artifact_count", data.get("entity_count", 0))),
artifact_evaluations=[
EntityEvaluation.from_dict(item)
for item in data.get("artifact_evaluations", [])
for item in data.get(
"artifact_evaluations",
data.get("entity_evaluations", []),
)
],
collection_metrics=[
MetricValue.from_dict(item) for item in data.get("collection_metrics", [])
@@ -134,6 +152,9 @@ class EvaluationSnapshot:
metadata=dict(data.get("metadata") or {}),
)
def diff(self, after: "EvaluationSnapshot") -> "SnapshotDiff":
    """Diff this snapshot, taken as the "before" side, against ``after``.

    Delegates to the module-level ``diff_snapshots``.
    """
    comparison = diff_snapshots(self, after)
    return comparison
@dataclass(frozen=True)
class ScoreChange:
@@ -146,6 +167,20 @@ class ScoreChange:
def delta(self) -> float:
return self.after - self.before
@property
def entity_slug(self) -> str:
    """Expose ``artifact_id`` under the name old diff consumers expect."""
    legacy_name = self.artifact_id
    return legacy_name
def to_dict(self) -> dict[str, Any]:
    """Serialise this score change, including the derived ``delta``.

    Returns:
        A mapping with the keys ``artifact_id``, ``dimension``, ``before``,
        ``after`` and ``delta``, in that order.
    """
    payload: dict[str, Any] = dict(
        artifact_id=self.artifact_id,
        dimension=self.dimension,
        before=self.before,
        after=self.after,
        delta=self.delta,
    )
    return payload
@dataclass(frozen=True)
class MetricChange:
@@ -157,6 +192,14 @@ class MetricChange:
def delta(self) -> float:
return self.after - self.before
def to_dict(self) -> dict[str, Any]:
    """Serialise this metric change, including the derived ``delta``.

    Returns:
        A mapping with the keys ``name``, ``before``, ``after`` and
        ``delta``, in that order.
    """
    payload: dict[str, Any] = dict(
        name=self.name,
        before=self.before,
        after=self.after,
        delta=self.delta,
    )
    return payload
@dataclass(frozen=True)
class SnapshotDiff:
@@ -167,6 +210,51 @@ class SnapshotDiff:
score_changes: list[ScoreChange] = field(default_factory=list)
metric_changes: list[MetricChange] = field(default_factory=list)
@property
def added_entities(self) -> list[str]:
    """Expose ``added_artifacts`` under the old history-diff name."""
    legacy_view = self.added_artifacts
    return legacy_view
@property
def removed_entities(self) -> list[str]:
    """Expose ``removed_artifacts`` under the old history-diff name."""
    legacy_view = self.removed_artifacts
    return legacy_view
def to_dict(self) -> dict[str, Any]:
    """Serialise the diff, delegating to each change's own ``to_dict``.

    Returns:
        A mapping with the snapshot IDs, the added/removed artifact lists
        (the same list objects held by this instance), and the serialised
        score and metric changes.
    """
    serialised_scores = [item.to_dict() for item in self.score_changes]
    serialised_metrics = [item.to_dict() for item in self.metric_changes]
    payload: dict[str, Any] = dict(
        before_id=self.before_id,
        after_id=self.after_id,
        added_artifacts=self.added_artifacts,
        removed_artifacts=self.removed_artifacts,
        score_changes=serialised_scores,
        metric_changes=serialised_metrics,
    )
    return payload
def summary(self) -> str:
    """Render the diff as a newline-separated, human-readable report.

    The first line names both snapshot IDs. When nothing changed, a single
    "No changes." line follows; otherwise one line is emitted per added
    artifact, removed artifact, score change and metric change, in that
    order. Deltas are printed signed with four decimals.
    """
    header = f"Snapshot diff: {self.before_id} -> {self.after_id}"
    has_changes = bool(
        self.added_artifacts
        or self.removed_artifacts
        or self.score_changes
        or self.metric_changes
    )
    if not has_changes:
        return "\n".join([header, "No changes."])

    report = [header]
    report.extend(f"Added artifact: {name}" for name in self.added_artifacts)
    report.extend(f"Removed artifact: {name}" for name in self.removed_artifacts)
    report.extend(
        f"Score {item.artifact_id} {item.dimension}: "
        f"{item.before} -> {item.after} ({item.delta:+.4f})"
        for item in self.score_changes
    )
    report.extend(
        f"Metric {item.name}: "
        f"{item.before} -> {item.after} ({item.delta:+.4f})"
        for item in self.metric_changes
    )
    return "\n".join(report)
def diff_snapshots(
before: EvaluationSnapshot,
@@ -174,22 +262,29 @@ def diff_snapshots(
) -> SnapshotDiff:
before_scores = _score_index(before)
after_scores = _score_index(after)
before_artifacts = {artifact_id for artifact_id, _ in before_scores}
after_artifacts = {artifact_id for artifact_id, _ in after_scores}
before_artifacts = {
evaluation.artifact_id for evaluation in before.artifact_evaluations
}
after_artifacts = {evaluation.artifact_id for evaluation in after.artifact_evaluations}
score_changes = [
ScoreChange(artifact_id, dimension, before_scores[key], after_scores[key])
for key in sorted(before_scores.keys() & after_scores.keys())
ScoreChange(
artifact_id,
dimension,
before_scores.get(key, 0.0),
after_scores.get(key, 0.0),
)
for key in sorted(before_scores.keys() | after_scores.keys())
for artifact_id, dimension in [key]
if before_scores[key] != after_scores[key]
if before_scores.get(key) != after_scores.get(key)
]
before_metrics = {metric.name: metric.value for metric in before.collection_metrics}
after_metrics = {metric.name: metric.value for metric in after.collection_metrics}
metric_changes = [
MetricChange(name, before_metrics[name], after_metrics[name])
for name in sorted(before_metrics.keys() & after_metrics.keys())
if before_metrics[name] != after_metrics[name]
MetricChange(name, before_metrics.get(name, 0.0), after_metrics.get(name, 0.0))
for name in sorted(before_metrics.keys() | after_metrics.keys())
if before_metrics.get(name) != after_metrics.get(name)
]
return SnapshotDiff(

View File

@@ -0,0 +1,181 @@
from __future__ import annotations
from datetime import datetime
from pathlib import Path
from typing import Any
import yaml
from .errors import InfospaceError
from .evaluation import (
EntityEvaluation,
EvaluationSnapshot,
ScoreEntry,
diff_snapshots,
)
FRONTMATTER_MARKER = "---"
def write_entity_evaluation(
    evaluation: EntityEvaluation,
    path: str | Path,
) -> None:
    """Write ``evaluation`` to ``path`` as Markdown with YAML frontmatter.

    The frontmatter carries the artifact ID, evaluator, ISO timestamp, the
    overall score rounded to four decimals, the serialised scores and, only
    when non-empty, the notes. The body has one ``##`` section per score
    whose heading is ``<name> - <value> / <max_value>``, followed by the
    rationale when there is one. Parent directories are created as needed.
    """
    destination = Path(path)

    header: dict[str, Any] = {
        "artifact_id": evaluation.artifact_id,
        "evaluator": evaluation.evaluator,
        "evaluated_at": evaluation.evaluated_at.isoformat(),
        "overall_score": round(evaluation.overall_score, 4),
        "scores": [entry.to_dict() for entry in evaluation.scores],
    }
    if evaluation.notes:
        header["notes"] = evaluation.notes

    document = [
        FRONTMATTER_MARKER,
        yaml.safe_dump(header, sort_keys=False).rstrip(),
        FRONTMATTER_MARKER,
        "",
        f"# Evaluation: {evaluation.artifact_id}",
        "",
    ]
    for entry in evaluation.scores:
        document += [f"## {entry.name} - {entry.value} / {entry.max_value}", ""]
        if entry.rationale:
            document += [entry.rationale, ""]

    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text("\n".join(document), encoding="utf-8")
def read_entity_evaluation(path: str | Path) -> EntityEvaluation:
    """Load an entity evaluation from a Markdown file with YAML frontmatter.

    Rationales found in the Markdown body take precedence over the ones
    stored in the frontmatter ``scores`` entries. The legacy ``entity_slug``
    key is accepted when ``artifact_id`` is absent.

    Args:
        path: Location of the evaluation file.

    Returns:
        The reconstructed ``EntityEvaluation``.

    Raises:
        InfospaceError: With code ``invalid_evaluation_file`` when the
            frontmatter is malformed, when ``artifact_id``, ``evaluator`` or
            ``evaluated_at`` is missing, or when ``evaluated_at`` is not an
            ISO-8601 timestamp.
    """
    source = Path(path)
    frontmatter, body = _read_frontmatter_markdown(source)
    rationales = _parse_rationales(body)
    scores = [
        _score_with_body_rationale(ScoreEntry.from_dict(item), rationales)
        # ``scores:`` with no value loads as None; treat it like an empty list.
        for item in frontmatter.get("scores") or []
    ]
    artifact_id = frontmatter.get("artifact_id", frontmatter.get("entity_slug"))
    if not artifact_id:
        raise InfospaceError(
            "invalid_evaluation_file",
            f"Missing artifact_id in evaluation file: {source}",
            {"path": str(source)},
        )
    # Report the other required keys the same way as artifact_id instead of
    # letting a bare KeyError escape past callers that handle InfospaceError.
    missing = [key for key in ("evaluator", "evaluated_at") if key not in frontmatter]
    if missing:
        raise InfospaceError(
            "invalid_evaluation_file",
            f"Missing {', '.join(missing)} in evaluation file: {source}",
            {"path": str(source), "missing_fields": missing},
        )
    try:
        evaluated_at = datetime.fromisoformat(str(frontmatter["evaluated_at"]))
    except ValueError as exc:
        raise InfospaceError(
            "invalid_evaluation_file",
            f"Invalid evaluated_at in evaluation file: {source}",
            {"path": str(source), "evaluated_at": str(frontmatter["evaluated_at"])},
        ) from exc
    return EntityEvaluation(
        artifact_id=str(artifact_id),
        evaluator=str(frontmatter["evaluator"]),
        scores=scores,
        evaluated_at=evaluated_at,
        notes=list(frontmatter.get("notes") or []),
    )
def write_snapshot(snapshot: EvaluationSnapshot, path: str | Path) -> None:
    """Serialise ``snapshot`` to YAML at ``path``, creating parent directories.

    Keys are written in the order produced by ``snapshot.to_dict()``.
    """
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    document = yaml.safe_dump(snapshot.to_dict(), sort_keys=False)
    destination.write_text(document, encoding="utf-8")
def read_snapshot(path: str | Path) -> EvaluationSnapshot:
    """Load a single evaluation snapshot from the YAML file at ``path``.

    Raises:
        InfospaceError: With code ``invalid_snapshot_file`` when the document
            is not a YAML mapping.
    """
    source = Path(path)
    loaded = yaml.safe_load(source.read_text(encoding="utf-8"))
    if isinstance(loaded, dict):
        return EvaluationSnapshot.from_dict(loaded)
    raise InfospaceError(
        "invalid_snapshot_file",
        f"Expected mapping in snapshot file: {source}",
        {"path": str(source)},
    )
def append_to_history(snapshot: EvaluationSnapshot, history_path: str | Path) -> None:
    """Append ``snapshot`` to the YAML history list stored at ``history_path``.

    The existing history is read in full, the new snapshot is added at the
    end, and the whole list is written back. Parent directories are created
    as needed.
    """
    destination = Path(history_path)

    records: list[dict[str, Any]] = []
    if destination.exists():
        records.extend(past.to_dict() for past in read_history(destination))
    records.append(snapshot.to_dict())

    destination.parent.mkdir(parents=True, exist_ok=True)
    document = yaml.safe_dump(records, sort_keys=False)
    destination.write_text(document, encoding="utf-8")
def read_history(history_path: str | Path) -> list[EvaluationSnapshot]:
    """Load every snapshot recorded in the YAML history at ``history_path``.

    Returns:
        The snapshots in file order; an empty list when the path is not a
        regular file or the document is empty.

    Raises:
        InfospaceError: With code ``invalid_history_file`` when the document
            is neither empty nor a YAML list.
    """
    source = Path(history_path)
    if not source.is_file():
        return []

    loaded = yaml.safe_load(source.read_text(encoding="utf-8"))
    if loaded is None:
        return []
    if isinstance(loaded, list):
        return [EvaluationSnapshot.from_dict(entry) for entry in loaded]
    raise InfospaceError(
        "invalid_history_file",
        f"Expected list in history file: {source}",
        {"path": str(source)},
    )
def _read_frontmatter_markdown(path: Path) -> tuple[dict[str, Any], str]:
    """Split a Markdown file into its YAML frontmatter mapping and its body.

    The file must start with a ``---`` line. The frontmatter ends at the next
    line consisting solely of ``---``; that closing line may also be the last
    line of the file with no trailing newline, in which case the body is
    empty.

    Args:
        path: File to read, decoded as UTF-8.

    Returns:
        ``(frontmatter, body)`` where ``frontmatter`` is the parsed mapping
        and ``body`` is everything after the closing marker line.

    Raises:
        InfospaceError: With code ``invalid_evaluation_file`` when the opening
            marker is missing, the frontmatter is never closed, or the
            frontmatter does not parse to a mapping.
    """
    text = path.read_text(encoding="utf-8")
    opener = f"{FRONTMATTER_MARKER}\n"
    closer = f"\n{FRONTMATTER_MARKER}"
    if not text.startswith(opener):
        raise InfospaceError(
            "invalid_evaluation_file",
            f"Missing YAML frontmatter in evaluation file: {path}",
            {"path": str(path)},
        )
    start = len(opener)
    end = text.find(f"{closer}\n", start)
    if end != -1:
        # Skip the closing marker and the newline that terminates it.
        body = text[end + len(closer) + 1 :]
    elif text.endswith(closer):
        # Closing marker is the final line and the file has no trailing
        # newline: the frontmatter is closed, there is simply no body.
        end = len(text) - len(closer)
        body = ""
    else:
        raise InfospaceError(
            "invalid_evaluation_file",
            f"Unclosed YAML frontmatter in evaluation file: {path}",
            {"path": str(path)},
        )
    raw = text[start:end]
    data = yaml.safe_load(raw)
    if not isinstance(data, dict):
        raise InfospaceError(
            "invalid_evaluation_file",
            f"Expected mapping frontmatter in evaluation file: {path}",
            {"path": str(path)},
        )
    return data, body
def _parse_rationales(body: str) -> dict[str, str]:
rationales: dict[str, str] = {}
current_name: str | None = None
current_lines: list[str] = []
for line in body.splitlines():
if line.startswith("## "):
if current_name is not None:
rationales[current_name] = "\n".join(current_lines).strip()
heading = line[3:].strip()
current_name = heading.split(" - ", maxsplit=1)[0].strip()
current_lines = []
elif current_name is not None:
current_lines.append(line)
if current_name is not None:
rationales[current_name] = "\n".join(current_lines).strip()
return rationales
def _score_with_body_rationale(
    score: ScoreEntry,
    rationales: dict[str, str],
) -> ScoreEntry:
    """Return a copy of ``score`` whose rationale comes from ``rationales``.

    When ``rationales`` has an entry for the score's name, that entry wins
    (even if it is empty); otherwise the score keeps its own rationale.
    """
    if score.name in rationales:
        chosen = rationales[score.name]
    else:
        chosen = score.rationale
    return ScoreEntry(
        name=score.name,
        value=score.value,
        max_value=score.max_value,
        rationale=chosen,
    )

View File

@@ -0,0 +1,254 @@
from __future__ import annotations
import uuid
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
import yaml
from .checks import CollectionCheckReport
from .evaluation import EntityEvaluation, EvaluationSnapshot, MetricValue
from .evaluation_io import append_to_history, read_history, write_snapshot
from .lifecycle import load_infospace
from .viability import evaluate_viability
METRICS_PATH = Path("output/metrics/metrics.yaml")
HISTORY_PATH = Path("output/metrics/history.yaml")
VIABILITY_PATH = Path("output/metrics/viability.yaml")
SNAPSHOT_DIR = Path("output/metrics/snapshots")
@dataclass(frozen=True)
class RecordedCheckResult:
    """Bundle of what one ``record_check_results`` call produced."""

    # Snapshot built for this run.
    snapshot: EvaluationSnapshot
    # Metrics mapping returned alongside the snapshot.
    metrics: dict[str, Any]
    # Viability report as a plain mapping, or None when not provided.
    viability: dict[str, Any] | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return a plain mapping with the snapshot serialised via ``to_dict``."""
        payload: dict[str, Any] = {"snapshot": self.snapshot.to_dict()}
        payload["metrics"] = self.metrics
        payload["viability"] = self.viability
        return payload
def snapshot_from_checks(
    check_report: CollectionCheckReport,
    artifact_count: int,
    *,
    schema_name: str = "default",
    metadata: dict[str, Any] | None = None,
    artifact_evaluations: list[EntityEvaluation] | None = None,
) -> EvaluationSnapshot:
    """Build a new snapshot from a collection check report.

    Numeric check metrics come first, sorted by name and tagged with the
    concern returned by ``_concern_for_metric``. Metrics derived from the
    evaluations follow, also sorted by name, tagged ``"evaluation"``. The
    snapshot gets an 8-character ID taken from a random UUID and a UTC
    creation timestamp.
    """
    evaluations = artifact_evaluations or []

    collection_metrics: list[MetricValue] = []
    for name, value in sorted(_numeric_metrics(check_report.metrics).items()):
        collection_metrics.append(
            MetricValue(name=name, value=value, concern=_concern_for_metric(name))
        )
    for name, value in sorted(_evaluation_metrics(evaluations).items()):
        collection_metrics.append(
            MetricValue(name=name, value=value, concern="evaluation")
        )

    # The first eight characters of str(UUID) are its first eight hex digits.
    short_id = uuid.uuid4().hex[:8]
    return EvaluationSnapshot(
        snapshot_id=short_id,
        created_at=datetime.now(timezone.utc),
        schema_name=schema_name,
        artifact_count=artifact_count,
        artifact_evaluations=evaluations,
        collection_metrics=collection_metrics,
        metadata=metadata or {},
    )
def write_metrics_file(metrics: dict[str, Any], path: str | Path) -> None:
    """Write ``metrics`` to ``path`` as YAML with keys in sorted order.

    Each value is passed through ``_normalize_metric_value`` first. Parent
    directories are created as needed.
    """
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    normalized: dict[str, Any] = {}
    for key, value in sorted(metrics.items()):
        normalized[key] = _normalize_metric_value(value)

    document = yaml.safe_dump(normalized, sort_keys=True)
    destination.write_text(document, encoding="utf-8")
def read_metrics_file(path: str | Path) -> dict[str, Any]:
    """Load the metrics mapping stored as YAML at ``path``.

    Returns:
        The parsed mapping, or an empty dict when the path is not a regular
        file or the document is not a mapping.
    """
    source = Path(path)
    if not source.is_file():
        return {}
    loaded = yaml.safe_load(source.read_text(encoding="utf-8"))
    if isinstance(loaded, dict):
        return loaded
    return {}
def record_check_results(
    root: str | Path,
    check_report: CollectionCheckReport,
    *,
    artifact_evaluations: list[EntityEvaluation] | None = None,
    schema_name: str = "default",
    metadata: dict[str, Any] | None = None,
) -> RecordedCheckResult:
    """Persist the outcome of a collection check run under the infospace.

    In order: builds a snapshot, merges the new metrics over the existing
    metrics file and rewrites it, appends the snapshot to the history file,
    writes the snapshot to its own file, then builds and writes the viability
    report from the merged metrics.

    Returns:
        The snapshot, the merged metrics and the viability report.
    """
    infospace = load_infospace(root)
    base_dir = infospace.root

    # Prefer the count reported by the checks; fall back to the loaded artifacts.
    fallback_count = len(infospace.artifacts)
    artifact_count = int(check_report.details.get("artifact_count", fallback_count))

    # Caller metadata may override the default "source" entry.
    snapshot_metadata: dict[str, Any] = {"source": "collection-checks"}
    snapshot_metadata.update(metadata or {})
    snapshot = snapshot_from_checks(
        check_report,
        artifact_count,
        schema_name=schema_name,
        metadata=snapshot_metadata,
        artifact_evaluations=artifact_evaluations,
    )

    # Later sources win: existing file < check metrics < evaluation metrics.
    metrics_file = base_dir / METRICS_PATH
    merged: dict[str, Any] = dict(read_metrics_file(metrics_file))
    merged.update(check_report.metrics)
    merged.update(_evaluation_metrics(artifact_evaluations or []))
    write_metrics_file(merged, metrics_file)

    append_to_history(snapshot, base_dir / HISTORY_PATH)
    snapshot_file = base_dir / SNAPSHOT_DIR / f"{snapshot.snapshot_id}.yaml"
    write_snapshot(snapshot, snapshot_file)

    viability = build_viability_report(base_dir, merged)
    write_viability_report(base_dir, viability)
    return RecordedCheckResult(snapshot=snapshot, metrics=merged, viability=viability)
def get_history(root: str | Path) -> list[EvaluationSnapshot]:
    """Read the snapshot history stored at ``HISTORY_PATH`` under ``root``."""
    history_file = Path(root) / HISTORY_PATH
    return read_history(history_file)
def get_latest_snapshot(root: str | Path) -> EvaluationSnapshot | None:
    """Return the last snapshot in the history under ``root``, or None if empty."""
    snapshots = get_history(root)
    if not snapshots:
        return None
    return snapshots[-1]
def find_snapshot(
    history: list[EvaluationSnapshot],
    ref: str,
) -> EvaluationSnapshot | None:
    """Resolve ``ref`` to a snapshot, by exact ID first and then by date.

    Returns:
        The first snapshot whose ``snapshot_id`` equals ``ref``; otherwise
        whatever ``find_snapshot_by_date`` returns for ``ref``.
    """
    by_id = next(
        (candidate for candidate in history if candidate.snapshot_id == ref),
        None,
    )
    if by_id is not None:
        return by_id
    return find_snapshot_by_date(history, ref)
def find_snapshot_by_date(
    history: list[EvaluationSnapshot],
    date_ref: str,
) -> EvaluationSnapshot | None:
    """Return the snapshot whose creation time is closest to ``date_ref``.

    Args:
        history: Snapshots to search.
        date_ref: ISO-8601 date or timestamp, e.g. ``2026-05-14``,
            ``2026-05-14T15:35:04`` or ``2026-05-14 15:35``. A value without
            an offset is interpreted as UTC.

    Returns:
        The nearest snapshot by absolute time difference (the earliest entry
        in ``history`` wins a tie), or None when ``history`` is empty or
        ``date_ref`` is not a valid ISO-8601 value.
    """
    if not history:
        return None
    try:
        # fromisoformat() accepts bare dates as midnight, so no "T00:00:00"
        # suffix is needed; appending one broke space-separated timestamps.
        target = datetime.fromisoformat(date_ref)
    except ValueError:
        return None
    if target.tzinfo is None:
        target = target.replace(tzinfo=timezone.utc)

    def distance(snapshot: EvaluationSnapshot) -> float:
        created_at = snapshot.created_at
        # Treat naive timestamps as UTC so they can be compared with target.
        if created_at.tzinfo is None:
            created_at = created_at.replace(tzinfo=timezone.utc)
        return abs((created_at - target).total_seconds())

    return min(history, key=distance)
def metric_trend(
    history: list[EvaluationSnapshot],
    metric_name: str,
) -> list[dict[str, Any]]:
    """List the values of ``metric_name`` across ``history``, in order.

    Returns:
        One ``{"date": <ISO creation time>, "value": <metric value>}`` entry
        per snapshot that has the metric. Only the first matching metric in a
        snapshot is used; snapshots without the metric are skipped.
    """
    points: list[dict[str, Any]] = []
    for snapshot in history:
        matching = [
            metric
            for metric in snapshot.collection_metrics
            if metric.name == metric_name
        ]
        if not matching:
            continue
        points.append(
            {"date": snapshot.created_at.isoformat(), "value": matching[0].value}
        )
    return points
def build_viability_report(
    root: str | Path,
    metrics: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Evaluate the infospace's viability thresholds and return a plain mapping.

    Args:
        root: Infospace root passed to ``load_infospace``.
        metrics: Metrics to evaluate; when None they are read from the
            metrics file under the infospace root.

    Returns:
        ``{"passed": ..., "results": {...}}`` with one entry per result
        holding its metric, value, serialised threshold and pass flag.
    """
    infospace = load_infospace(root)
    if metrics is None:
        current = read_metrics_file(infospace.root / METRICS_PATH)
    else:
        current = metrics

    # Only numeric metric values are handed to the viability evaluation.
    report = evaluate_viability(_numeric_metrics(current), infospace.config.viability)

    results: dict[str, Any] = {}
    for name, result in report.results.items():
        results[name] = {
            "metric": result.metric,
            "value": result.value,
            "threshold": result.threshold.to_dict(),
            "passed": result.passed,
        }
    return {"passed": report.passed, "results": results}
def write_viability_report(root: str | Path, report: dict[str, Any]) -> None:
    """Write ``report`` as YAML to ``VIABILITY_PATH`` under ``root``.

    Key order is preserved and parent directories are created as needed.
    """
    destination = Path(root) / VIABILITY_PATH
    destination.parent.mkdir(parents=True, exist_ok=True)
    document = yaml.safe_dump(report, sort_keys=False)
    destination.write_text(document, encoding="utf-8")
def _evaluation_metrics(evaluations: list[EntityEvaluation]) -> dict[str, float | int]:
if not evaluations:
return {}
return {
"evaluated_artifact_count": len(evaluations),
"per_artifact_mean": sum(item.overall_score for item in evaluations)
/ len(evaluations),
}
def _numeric_metrics(metrics: dict[str, Any]) -> dict[str, float]:
return {
str(name): float(value)
for name, value in metrics.items()
if isinstance(value, (int, float)) and not isinstance(value, bool)
}
def _normalize_metric_value(value: Any) -> Any:
if isinstance(value, bool):
return value
if isinstance(value, float):
return round(value, 6)
return value
def _concern_for_metric(name: str) -> str:
mapping = {
"redundancy_ratio": "C1",
"coverage_ratio": "C2",
"coherence_components": "C3",
"consistency_cycles": "C4",
"granularity_entropy": "C5",
}
return mapping.get(name, "")