session-memory: Phase 4 Measure — baseline, effectiveness, trend (WP-0009)

Closes the loop. metrics.py: fleet metrics (infra-overhead share, error rate, schema-thrash, token percentiles, success) + persisted baseline trend. effect.py: before/after per-pattern effectiveness with an improved verdict per metric. measure entrypoint with trend + --since effectiveness + JSON. Recorded pre-fix baseline: 27 sessions, overhead median 11.7%, error rate 0.96, schema-thrash 8. 13 new tests; suite 139/139. Capture->Detect->Curate->Distribute->Measure complete. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-07 15:49:22 +02:00
parent 035c7a20d3
commit 4f28cd67cf
11 changed files with 497 additions and 5 deletions
--- a/session_memory/measure/metrics.py
+++ b/session_memory/measure/metrics.py
@@ -0,0 +1,102 @@
+"""Fleet metrics + persisted baselines (PRD §6.5 FR-M3; T01).
+
+Computes the headline health metrics of the captured corpus — the same quantities
+the friction assessment reported — so they can be tracked over time and compared
+before/after a change. Reuses :func:`detect.signals.tool_bucket` (WP-0005) and the
+digest ``error_snippets`` (WP-0006); no new capture.
+
+A **baseline** is a timestamped metrics snapshot appended to a JSONL file, so
+successive runs build a trend the entrypoint (T03) can chart.
+"""
+
+from __future__ import annotations
+
+import collections
+import json
+import os
+from datetime import datetime, timezone
+
+from ..detect.signals import tool_bucket
+
+
+def _now() -> str:
+    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+
+def _pct(values: list[float], q: float) -> float:
+    if not values:
+        return 0.0
+    s = sorted(values)
+    return round(s[int(q * (len(s) - 1))], 3)
+
+
+def _median(values: list[float]) -> float:
+    return _pct(values, 0.5)
+
+
+def _buckets(digest: dict) -> collections.Counter:
+    b: collections.Counter = collections.Counter()
+    for tool, n in (digest.get("tool_histogram") or {}).items():
+        b[tool_bucket(tool)] += n
+    return b
+
+
+def session_metrics(digest: dict) -> dict:
+    """Per-session metrics used to build fleet aggregates."""
+    b = _buckets(digest)
+    total = sum(b.values()) or 1
+    overhead = b["statehub_mcp"] + b["task_mgmt"] + b["schema_load"]
+    cost = digest.get("cost", {})
+    tokens = cost.get("input_tokens", 0) + cost.get("output_tokens", 0)
+    return {
+        "infra_overhead_share": overhead / total,
+        "tool_calls": total,
+        "schema_load": b["schema_load"],
+        "error_occurrences": sum(s.get("count", 1) for s in (digest.get("error_snippets") or [])),
+        "has_error": bool(digest.get("error_snippets")),
+        "tokens": tokens,
+        "success": digest.get("outcome") == "success",
+    }
+
+
+def aggregate(digests: list[dict], *, schema_thrash_threshold: int = 5) -> dict:
+    """Fleet-level metrics over a set of (already quality-filtered) digests."""
+    per = [session_metrics(d) for d in digests]
+    n = len(per)
+    if n == 0:
+        return {"n_sessions": 0}
+    shares = [m["infra_overhead_share"] for m in per]
+    tokens = [m["tokens"] for m in per]
+    return {
+        "n_sessions": n,
+        "infra_overhead_share_median": _median(shares),
+        "infra_overhead_share_p90": _pct(shares, 0.9),
+        "error_rate": round(sum(m["has_error"] for m in per) / n, 3),
+        "recurring_error_occurrences": sum(m["error_occurrences"] for m in per),
+        "schema_thrash_sessions": sum(1 for m in per if m["schema_load"] >= schema_thrash_threshold),
+        "tokens_p50": _pct(tokens, 0.5),
+        "tokens_p90": _pct(tokens, 0.9),
+        "success_rate": round(sum(m["success"] for m in per) / n, 3),
+    }
+
+
+def snapshot(digests: list[dict], *, label: str = "") -> dict:
+    m = aggregate(digests)
+    m["captured_at"] = _now()
+    m["label"] = label
+    return m
+
+
+def save_baseline(metrics: dict, path: str) -> None:
+    """Append a metrics snapshot to the baseline JSONL trend file."""
+    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
+    with open(path, "a", encoding="utf-8") as fh:
+        fh.write(json.dumps(metrics, sort_keys=True))
+        fh.write("\n")
+
+
+def load_baselines(path: str) -> list[dict]:
+    if not os.path.exists(path):
+        return []
+    with open(path, encoding="utf-8") as fh:
+        return [json.loads(line) for line in fh if line.strip()]