generated from coulomb/repo-seed
Closes the loop. metrics.py: fleet metrics (infra-overhead share, error rate, schema-thrash, token percentiles, success) + persisted baseline trend. effect.py: before/after per-pattern effectiveness with an improved verdict per metric. measure entrypoint with trend + --since effectiveness + JSON. Recorded pre-fix baseline: 27 sessions, overhead median 11.7%, error rate 0.96, schema-thrash 8. 13 new tests; suite 139/139. Capture->Detect->Curate->Distribute->Measure complete. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
103 lines
3.5 KiB
Python
103 lines
3.5 KiB
Python
"""Fleet metrics + persisted baselines (PRD §6.5 FR-M3; T01).
|
|
|
|
Computes the headline health metrics of the captured corpus — the same quantities
|
|
the friction assessment reported — so they can be tracked over time and compared
|
|
before/after a change. Reuses :func:`detect.signals.tool_bucket` (WP-0005) and the
|
|
digest ``error_snippets`` (WP-0006); no new capture.
|
|
|
|
A **baseline** is a timestamped metrics snapshot appended to a JSONL file, so
|
|
successive runs build a trend the entrypoint (T03) can chart.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import collections
|
|
import json
|
|
import os
|
|
from datetime import datetime, timezone
|
|
|
|
from ..detect.signals import tool_bucket
|
|
|
|
|
|
def _now() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
    moment = datetime.now(timezone.utc)
    # A datetime format spec is handed to strftime, so this yields the same text.
    return f"{moment:%Y-%m-%dT%H:%M:%SZ}"
|
|
|
|
|
|
def _pct(values: list[float], q: float) -> float:
    """Return the ``q``-quantile of ``values``, rounded to three decimals.

    The result is the element of the sorted input at index
    ``int(q * (len(values) - 1))``: the fractional rank is truncated and no
    interpolation between neighbours takes place. An empty input yields ``0.0``.
    """
    if not values:
        return 0.0
    ordered = sorted(values)
    last_index = len(ordered) - 1
    rank = int(q * last_index)
    picked = ordered[rank]
    return round(picked, 3)
|
|
|
|
|
|
def _median(values: list[float]) -> float:
    """Return the 0.5-quantile of ``values`` as computed by :func:`_pct`."""
    midpoint = 0.5
    return _pct(values, midpoint)
|
|
|
|
|
|
def _buckets(digest: dict) -> collections.Counter:
    """Total the digest's ``tool_histogram`` counts per :func:`tool_bucket` key.

    A missing or empty ``tool_histogram`` produces an empty counter.
    """
    histogram = digest.get("tool_histogram") or {}
    totals: collections.Counter = collections.Counter()
    for tool_name in histogram:
        bucket = tool_bucket(tool_name)
        totals[bucket] += histogram[tool_name]
    return totals
|
|
|
|
|
|
def session_metrics(digest: dict) -> dict:
    """Per-session metrics used to build fleet aggregates.

    Args:
        digest: One session digest. The keys read are ``tool_histogram`` (through
            :func:`_buckets`), ``cost``, ``error_snippets`` and ``outcome``. Each
            may be absent or ``None``.

    Returns:
        A dict with these keys:

        * ``infra_overhead_share`` — ``statehub_mcp`` + ``task_mgmt`` +
          ``schema_load`` bucket counts divided by all tool calls (``0.0`` when
          the session made no tool calls).
        * ``tool_calls`` — sum of all bucket counts.
        * ``schema_load`` — count in the ``schema_load`` bucket.
        * ``error_occurrences`` — sum of each error snippet's ``count``
          (a snippet without ``count`` contributes 1).
        * ``has_error`` — whether the digest carries any error snippet.
        * ``tokens`` — ``input_tokens`` + ``output_tokens`` from ``cost``.
        * ``success`` — whether ``outcome`` equals ``"success"``.
    """
    buckets = _buckets(digest)
    tool_calls = sum(buckets.values())
    overhead = buckets["statehub_mcp"] + buckets["task_mgmt"] + buckets["schema_load"]
    # Guard the division only; the reported call count must stay 0 for a
    # session without tool calls instead of inheriting the divisor fallback.
    overhead_share = overhead / tool_calls if tool_calls else 0.0
    # ``or`` (rather than a .get default) also covers keys present with a null
    # value, matching how tool_histogram and error_snippets are read.
    cost = digest.get("cost") or {}
    tokens = (cost.get("input_tokens") or 0) + (cost.get("output_tokens") or 0)
    snippets = digest.get("error_snippets") or []
    return {
        "infra_overhead_share": overhead_share,
        "tool_calls": tool_calls,
        "schema_load": buckets["schema_load"],
        "error_occurrences": sum(s.get("count", 1) for s in snippets),
        "has_error": bool(snippets),
        "tokens": tokens,
        "success": digest.get("outcome") == "success",
    }
|
|
|
|
|
|
def aggregate(digests: list[dict], *, schema_thrash_threshold: int = 5) -> dict:
    """Fleet-level metrics over a set of (already quality-filtered) digests.

    Args:
        digests: Session digests; each is reduced with :func:`session_metrics`.
        schema_thrash_threshold: A session counts as schema-thrashing when its
            ``schema_load`` metric is at least this value.

    Returns:
        ``{"n_sessions": 0}`` when there are no digests. Otherwise a dict with
        the session count, median/p90 infra-overhead share, error rate, summed
        error occurrences, number of schema-thrashing sessions, p50/p90 token
        totals and success rate (rates rounded to three decimals).
    """
    rows = [session_metrics(entry) for entry in digests]
    if not rows:
        return {"n_sessions": 0}
    count = len(rows)

    def rate(key: str) -> float:
        # Fraction of sessions whose boolean ``key`` metric is true.
        return round(sum(row[key] for row in rows) / count, 3)

    overhead_shares = [row["infra_overhead_share"] for row in rows]
    token_totals = [row["tokens"] for row in rows]
    thrashing = [row for row in rows if row["schema_load"] >= schema_thrash_threshold]
    return {
        "n_sessions": count,
        "infra_overhead_share_median": _median(overhead_shares),
        "infra_overhead_share_p90": _pct(overhead_shares, 0.9),
        "error_rate": rate("has_error"),
        "recurring_error_occurrences": sum(row["error_occurrences"] for row in rows),
        "schema_thrash_sessions": len(thrashing),
        "tokens_p50": _pct(token_totals, 0.5),
        "tokens_p90": _pct(token_totals, 0.9),
        "success_rate": rate("success"),
    }
|
|
|
|
|
|
def snapshot(digests: list[dict], *, label: str = "") -> dict:
    """Return :func:`aggregate` metrics stamped with a capture time and label.

    The result carries every key produced by :func:`aggregate` plus
    ``captured_at`` (the string returned by :func:`_now`) and ``label``.
    """
    return {**aggregate(digests), "captured_at": _now(), "label": label}
|
|
|
|
|
|
def save_baseline(metrics: dict, path: str) -> None:
    """Append one metrics snapshot, as a JSON line, to the baseline trend file.

    The parent directory of ``path`` is created when missing. The snapshot is
    serialised with sorted keys and terminated by a newline.
    """
    parent = os.path.dirname(path)
    # A bare filename has no directory part; fall back to the current directory.
    os.makedirs(parent if parent else ".", exist_ok=True)
    with open(path, "a", encoding="utf-8") as out:
        out.write(json.dumps(metrics, sort_keys=True) + "\n")
|
|
|
|
|
|
def load_baselines(path: str) -> list[dict]:
    """Read every snapshot from the baseline JSONL file at ``path``.

    Returns an empty list when the file does not exist. Lines that are empty or
    whitespace-only are skipped; every other line is parsed as JSON.
    """
    if not os.path.exists(path):
        return []
    records: list[dict] = []
    with open(path, encoding="utf-8") as source:
        for raw in source:
            if raw.strip():
                records.append(json.loads(raw))
    return records
|