generated from coulomb/repo-seed
session-memory: Phase 4 Measure — baseline, effectiveness, trend (WP-0009)
Closes the loop. metrics.py: fleet metrics (infra-overhead share, error rate, schema-thrash, token percentiles, success) + persisted baseline trend. effect.py: before/after per-pattern effectiveness with an improved verdict per metric. measure entrypoint with trend + --since effectiveness + JSON. Recorded pre-fix baseline: 27 sessions, overhead median 11.7%, error rate 0.96, schema-thrash 8. 13 new tests; suite 139/139. Capture->Detect->Curate->Distribute->Measure complete. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
102
session_memory/measure/metrics.py
Normal file
102
session_memory/measure/metrics.py
Normal file
@@ -0,0 +1,102 @@
|
||||
"""Fleet metrics + persisted baselines (PRD §6.5 FR-M3; T01).
|
||||
|
||||
Computes the headline health metrics of the captured corpus — the same quantities
|
||||
the friction assessment reported — so they can be tracked over time and compared
|
||||
before/after a change. Reuses :func:`detect.signals.tool_bucket` (WP-0005) and the
|
||||
digest ``error_snippets`` (WP-0006); no new capture.
|
||||
|
||||
A **baseline** is a timestamped metrics snapshot appended to a JSONL file, so
|
||||
successive runs build a trend the entrypoint (T03) can chart.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import collections
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from ..detect.signals import tool_bucket
|
||||
|
||||
|
||||
def _now() -> str:
|
||||
return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||
|
||||
|
||||
def _pct(values: list[float], q: float) -> float:
|
||||
if not values:
|
||||
return 0.0
|
||||
s = sorted(values)
|
||||
return round(s[int(q * (len(s) - 1))], 3)
|
||||
|
||||
|
||||
def _median(values: list[float]) -> float:
|
||||
return _pct(values, 0.5)
|
||||
|
||||
|
||||
def _buckets(digest: dict) -> collections.Counter:
|
||||
b: collections.Counter = collections.Counter()
|
||||
for tool, n in (digest.get("tool_histogram") or {}).items():
|
||||
b[tool_bucket(tool)] += n
|
||||
return b
|
||||
|
||||
|
||||
def session_metrics(digest: dict) -> dict:
|
||||
"""Per-session metrics used to build fleet aggregates."""
|
||||
b = _buckets(digest)
|
||||
total = sum(b.values()) or 1
|
||||
overhead = b["statehub_mcp"] + b["task_mgmt"] + b["schema_load"]
|
||||
cost = digest.get("cost", {})
|
||||
tokens = cost.get("input_tokens", 0) + cost.get("output_tokens", 0)
|
||||
return {
|
||||
"infra_overhead_share": overhead / total,
|
||||
"tool_calls": total,
|
||||
"schema_load": b["schema_load"],
|
||||
"error_occurrences": sum(s.get("count", 1) for s in (digest.get("error_snippets") or [])),
|
||||
"has_error": bool(digest.get("error_snippets")),
|
||||
"tokens": tokens,
|
||||
"success": digest.get("outcome") == "success",
|
||||
}
|
||||
|
||||
|
||||
def aggregate(digests: list[dict], *, schema_thrash_threshold: int = 5) -> dict:
|
||||
"""Fleet-level metrics over a set of (already quality-filtered) digests."""
|
||||
per = [session_metrics(d) for d in digests]
|
||||
n = len(per)
|
||||
if n == 0:
|
||||
return {"n_sessions": 0}
|
||||
shares = [m["infra_overhead_share"] for m in per]
|
||||
tokens = [m["tokens"] for m in per]
|
||||
return {
|
||||
"n_sessions": n,
|
||||
"infra_overhead_share_median": _median(shares),
|
||||
"infra_overhead_share_p90": _pct(shares, 0.9),
|
||||
"error_rate": round(sum(m["has_error"] for m in per) / n, 3),
|
||||
"recurring_error_occurrences": sum(m["error_occurrences"] for m in per),
|
||||
"schema_thrash_sessions": sum(1 for m in per if m["schema_load"] >= schema_thrash_threshold),
|
||||
"tokens_p50": _pct(tokens, 0.5),
|
||||
"tokens_p90": _pct(tokens, 0.9),
|
||||
"success_rate": round(sum(m["success"] for m in per) / n, 3),
|
||||
}
|
||||
|
||||
|
||||
def snapshot(digests: list[dict], *, label: str = "") -> dict:
|
||||
m = aggregate(digests)
|
||||
m["captured_at"] = _now()
|
||||
m["label"] = label
|
||||
return m
|
||||
|
||||
|
||||
def save_baseline(metrics: dict, path: str) -> None:
|
||||
"""Append a metrics snapshot to the baseline JSONL trend file."""
|
||||
os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
|
||||
with open(path, "a", encoding="utf-8") as fh:
|
||||
fh.write(json.dumps(metrics, sort_keys=True))
|
||||
fh.write("\n")
|
||||
|
||||
|
||||
def load_baselines(path: str) -> list[dict]:
|
||||
if not os.path.exists(path):
|
||||
return []
|
||||
with open(path, encoding="utf-8") as fh:
|
||||
return [json.loads(line) for line in fh if line.strip()]
|
||||
Reference in New Issue
Block a user