Files
agentic-resources/session_memory/measure/metrics.py
tegwick 4f28cd67cf session-memory: Phase 4 Measure — baseline, effectiveness, trend (WP-0009)
Closes the loop. metrics.py: fleet metrics (infra-overhead share, error rate,
schema-thrash, token percentiles, success) + persisted baseline trend. effect.py:
before/after per-pattern effectiveness with an improved verdict per metric.
measure entrypoint with trend + --since effectiveness + JSON. Recorded pre-fix
baseline: 27 sessions, overhead median 11.7%, error rate 0.96, schema-thrash 8.
13 new tests; suite 139/139. Capture->Detect->Curate->Distribute->Measure complete.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-07 15:49:22 +02:00

103 lines
3.5 KiB
Python

"""Fleet metrics + persisted baselines (PRD §6.5 FR-M3; T01).
Computes the headline health metrics of the captured corpus — the same quantities
the friction assessment reported — so they can be tracked over time and compared
before/after a change. Reuses :func:`detect.signals.tool_bucket` (WP-0005) and the
digest ``error_snippets`` (WP-0006); no new capture.
A **baseline** is a timestamped metrics snapshot appended to a JSONL file, so
successive runs build a trend the entrypoint (T03) can chart.
"""
from __future__ import annotations
import collections
import json
import os
from datetime import datetime, timezone
from ..detect.signals import tool_bucket
def _now() -> str:
    """Return the current UTC time formatted as ``YYYY-MM-DDTHH:MM:SSZ``."""
    utc_now = datetime.now(timezone.utc)
    return utc_now.strftime("%Y-%m-%dT%H:%M:%SZ")
def _pct(values: list[float], q: float) -> float:
    """Return the ``q``-quantile of ``values``, rounded to three decimals.

    The quantile is picked without interpolation: the values are sorted and
    the element at index ``int(q * (len - 1))`` is returned, so the index is
    truncated toward the lower neighbour. An empty input yields ``0.0``.
    """
    if values:
        ranked = sorted(values)
        position = int(q * (len(ranked) - 1))
        return round(ranked[position], 3)
    return 0.0
def _median(values: list[float]) -> float:
    """Return the 0.5-quantile of ``values`` as computed by :func:`_pct`."""
    return _pct(values, q=0.5)
def _buckets(digest: dict) -> collections.Counter:
    """Sum a digest's ``tool_histogram`` counts per ``tool_bucket`` category.

    A missing or null ``tool_histogram`` produces an empty counter.
    """
    histogram = digest.get("tool_histogram") or {}
    totals: collections.Counter = collections.Counter()
    for tool_name, calls in histogram.items():
        totals[tool_bucket(tool_name)] += calls
    return totals
def session_metrics(digest: dict) -> dict:
    """Per-session metrics used to build fleet aggregates.

    Args:
        digest: One session digest. The keys read are ``tool_histogram``
            (through ``_buckets``), ``cost``, ``error_snippets`` and
            ``outcome``. Each may be absent or explicitly null.

    Returns:
        A dict with the keys ``infra_overhead_share``, ``tool_calls``,
        ``schema_load``, ``error_occurrences``, ``has_error``, ``tokens``
        and ``success``.
    """
    b = _buckets(digest)
    # Floor the denominator at 1 so a session without tool calls gets a share
    # of 0.0 instead of a ZeroDivisionError. NOTE: "tool_calls" reports this
    # floored value, i.e. 1 for a session with no recorded tool calls.
    total = sum(b.values()) or 1
    # Infra overhead is the sum of these three buckets.
    overhead = b["statehub_mcp"] + b["task_mgmt"] + b["schema_load"]
    # Tolerate an explicit null for "cost" and for either token field, the
    # same way "tool_histogram" and "error_snippets" are tolerated.
    cost = digest.get("cost") or {}
    tokens = (cost.get("input_tokens") or 0) + (cost.get("output_tokens") or 0)
    snippets = digest.get("error_snippets") or []
    return {
        "infra_overhead_share": overhead / total,
        "tool_calls": total,
        "schema_load": b["schema_load"],
        # A snippet without a "count" is counted as a single occurrence.
        "error_occurrences": sum(s.get("count", 1) for s in snippets),
        "has_error": bool(snippets),
        "tokens": tokens,
        "success": digest.get("outcome") == "success",
    }
def aggregate(digests: list[dict], *, schema_thrash_threshold: int = 5) -> dict:
    """Fleet-level metrics over a set of (already quality-filtered) digests.

    Each digest is reduced with ``session_metrics`` and the per-session rows
    are folded into medians, percentiles, rates and counts. A session counts
    toward ``schema_thrash_sessions`` when its ``schema_load`` is at least
    ``schema_thrash_threshold``. With no digests, only ``{"n_sessions": 0}``
    is returned.
    """
    rows = [session_metrics(digest) for digest in digests]
    if not rows:
        return {"n_sessions": 0}

    session_count = len(rows)
    overhead_shares = [row["infra_overhead_share"] for row in rows]
    token_totals = [row["tokens"] for row in rows]
    errored = sum(1 for row in rows if row["has_error"])
    succeeded = sum(1 for row in rows if row["success"])
    thrashing = [row for row in rows if row["schema_load"] >= schema_thrash_threshold]

    return {
        "n_sessions": session_count,
        "infra_overhead_share_median": _median(overhead_shares),
        "infra_overhead_share_p90": _pct(overhead_shares, 0.9),
        "error_rate": round(errored / session_count, 3),
        "recurring_error_occurrences": sum(row["error_occurrences"] for row in rows),
        "schema_thrash_sessions": len(thrashing),
        "tokens_p50": _pct(token_totals, 0.5),
        "tokens_p90": _pct(token_totals, 0.9),
        "success_rate": round(succeeded / session_count, 3),
    }
def snapshot(digests: list[dict], *, label: str = "") -> dict:
    """Return ``aggregate(digests)`` stamped with ``captured_at`` and ``label``.

    ``captured_at`` is the current UTC timestamp from ``_now``; ``label`` is
    stored as given.
    """
    stamped = aggregate(digests)
    stamped.update(captured_at=_now(), label=label)
    return stamped
def save_baseline(metrics: dict, path: str) -> None:
    """Append a metrics snapshot to the baseline JSONL trend file.

    The parent directory of ``path`` is created when missing. The snapshot is
    written as one JSON object with sorted keys, followed by a newline.
    """
    parent_dir = os.path.dirname(path) or "."
    os.makedirs(parent_dir, exist_ok=True)
    with open(path, "a", encoding="utf-8") as sink:
        sink.write(json.dumps(metrics, sort_keys=True) + "\n")
def load_baselines(path: str) -> list[dict]:
    """Load every snapshot from the baseline JSONL trend file.

    Args:
        path: Location of the JSONL file written by ``save_baseline``.

    Returns:
        The parsed snapshots in file order, or an empty list when the file
        does not exist. Blank and whitespace-only lines are skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Open first and handle absence, rather than checking existence up front:
    # a file removed between a check and the open would otherwise raise
    # instead of yielding the empty trend.
    try:
        fh = open(path, encoding="utf-8")
    except FileNotFoundError:
        return []
    with fh:
        return [json.loads(line) for line in fh if line.strip()]