"""Fleet metrics + persisted baselines (PRD §6.5 FR-M3; T01). Computes the headline health metrics of the captured corpus — the same quantities the friction assessment reported — so they can be tracked over time and compared before/after a change. Reuses :func:`detect.signals.tool_bucket` (WP-0005) and the digest ``error_snippets`` (WP-0006); no new capture. A **baseline** is a timestamped metrics snapshot appended to a JSONL file, so successive runs build a trend the entrypoint (T03) can chart. """ from __future__ import annotations import collections import json import os from datetime import datetime, timezone from ..detect.signals import tool_bucket def _now() -> str: return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") def _pct(values: list[float], q: float) -> float: if not values: return 0.0 s = sorted(values) return round(s[int(q * (len(s) - 1))], 3) def _median(values: list[float]) -> float: return _pct(values, 0.5) def _buckets(digest: dict) -> collections.Counter: b: collections.Counter = collections.Counter() for tool, n in (digest.get("tool_histogram") or {}).items(): b[tool_bucket(tool)] += n return b def session_metrics(digest: dict) -> dict: """Per-session metrics used to build fleet aggregates.""" b = _buckets(digest) total = sum(b.values()) or 1 overhead = b["statehub_mcp"] + b["task_mgmt"] + b["schema_load"] cost = digest.get("cost", {}) tokens = cost.get("input_tokens", 0) + cost.get("output_tokens", 0) return { "infra_overhead_share": overhead / total, "tool_calls": total, "schema_load": b["schema_load"], "error_occurrences": sum(s.get("count", 1) for s in (digest.get("error_snippets") or [])), "has_error": bool(digest.get("error_snippets")), "tokens": tokens, "success": digest.get("outcome") == "success", } def aggregate(digests: list[dict], *, schema_thrash_threshold: int = 5) -> dict: """Fleet-level metrics over a set of (already quality-filtered) digests.""" per = [session_metrics(d) for d in digests] n = len(per) if n == 0: return {"n_sessions": 0} shares = [m["infra_overhead_share"] for m in per] tokens = [m["tokens"] for m in per] return { "n_sessions": n, "infra_overhead_share_median": _median(shares), "infra_overhead_share_p90": _pct(shares, 0.9), "error_rate": round(sum(m["has_error"] for m in per) / n, 3), "recurring_error_occurrences": sum(m["error_occurrences"] for m in per), "schema_thrash_sessions": sum(1 for m in per if m["schema_load"] >= schema_thrash_threshold), "tokens_p50": _pct(tokens, 0.5), "tokens_p90": _pct(tokens, 0.9), "success_rate": round(sum(m["success"] for m in per) / n, 3), } def snapshot(digests: list[dict], *, label: str = "") -> dict: m = aggregate(digests) m["captured_at"] = _now() m["label"] = label return m def save_baseline(metrics: dict, path: str) -> None: """Append a metrics snapshot to the baseline JSONL trend file.""" os.makedirs(os.path.dirname(path) or ".", exist_ok=True) with open(path, "a", encoding="utf-8") as fh: fh.write(json.dumps(metrics, sort_keys=True)) fh.write("\n") def load_baselines(path: str) -> list[dict]: if not os.path.exists(path): return [] with open(path, encoding="utf-8") as fh: return [json.loads(line) for line in fh if line.strip()]