# agentic-resources/session_memory/measure/metrics.py

"""Fleet metrics + persisted baselines (PRD §6.5 FR-M3; T01).

Computes the headline health metrics of the captured corpus — the same quantities
the friction assessment reported — so they can be tracked over time and compared
before/after a change. Reuses :func:`detect.signals.tool_bucket` (WP-0005) and the
digest ``error_snippets`` (WP-0006); no new capture.

A **baseline** is a timestamped metrics snapshot appended to a JSONL file, so
successive runs build a trend the entrypoint (T03) can chart.
"""

from __future__ import annotations

import collections
import json
import os
from datetime import datetime, timezone

from ..detect.signals import tool_bucket


def _now() -> str:
    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")


def _pct(values: list[float], q: float) -> float:
    if not values:
        return 0.0
    s = sorted(values)
    return round(s[int(q * (len(s) - 1))], 3)


def _median(values: list[float]) -> float:
    """Return the 0.5-quantile of *values* as computed by :func:`_pct`.

    Because ``_pct`` truncates the rank, an even-length input yields the lower
    of its two middle values; an empty input yields ``0.0``.
    """
    midpoint = 0.5
    return _pct(values, midpoint)


def _buckets(digest: dict) -> collections.Counter:
    """Total the digest's ``tool_histogram`` counts per tool bucket.

    Each tool name is mapped to a bucket with ``tool_bucket`` and its count is
    added to that bucket. A missing or falsy ``tool_histogram`` produces an
    empty counter.
    """
    counts: collections.Counter = collections.Counter()
    histogram = digest.get("tool_histogram") or {}
    for tool_name in histogram:
        counts[tool_bucket(tool_name)] += histogram[tool_name]
    return counts


def session_metrics(digest: dict) -> dict:
    """Per-session metrics used to build fleet aggregates.

    Reads ``tool_histogram`` (through :func:`_buckets`), ``error_snippets``,
    ``cost`` and ``outcome`` from *digest*. ``tool_histogram``,
    ``error_snippets`` and ``cost`` may each be missing or ``None``.

    Returns a dict with the keys:

    * ``infra_overhead_share`` -- share of tool calls falling in the
      ``statehub_mcp``, ``task_mgmt`` and ``schema_load`` buckets; ``0.0`` when
      the session made no tool calls.
    * ``tool_calls`` -- total tool calls across all buckets (``0`` when none).
    * ``schema_load`` -- number of calls in the ``schema_load`` bucket.
    * ``error_occurrences`` -- sum of each error snippet's ``count`` (a snippet
      without a ``count`` contributes 1).
    * ``has_error`` -- whether the digest carries any error snippets.
    * ``tokens`` -- ``input_tokens`` plus ``output_tokens`` from ``cost``.
    * ``success`` -- whether ``outcome`` equals ``"success"``.
    """
    buckets = _buckets(digest)
    tool_calls = sum(buckets.values())
    overhead = buckets["statehub_mcp"] + buckets["task_mgmt"] + buckets["schema_load"]

    # ``or`` (not a ``.get`` default) so an explicit ``None`` is treated like a
    # missing key, matching how ``tool_histogram``/``error_snippets`` are read.
    cost = digest.get("cost") or {}
    tokens = (cost.get("input_tokens") or 0) + (cost.get("output_tokens") or 0)
    snippets = digest.get("error_snippets") or []

    return {
        # Guard the division only; the reported call count stays truthful.
        "infra_overhead_share": overhead / tool_calls if tool_calls else 0.0,
        "tool_calls": tool_calls,
        "schema_load": buckets["schema_load"],
        "error_occurrences": sum(s.get("count", 1) for s in snippets),
        "has_error": bool(snippets),
        "tokens": tokens,
        "success": digest.get("outcome") == "success",
    }


def aggregate(digests: list[dict], *, schema_thrash_threshold: int = 5) -> dict:
    """Summarise a set of session digests into one fleet-level metrics dict.

    Callers are expected to pass digests that were already quality-filtered;
    no filtering happens here. Each digest is reduced with
    :func:`session_metrics` and the per-session values are combined.

    Args:
        digests: Session digests to aggregate.
        schema_thrash_threshold: A session counts as schema-thrashing when its
            ``schema_load`` call count is at least this value.

    Returns:
        ``{"n_sessions": 0}`` for an empty input; otherwise a dict holding the
        session count, median/p90 infra-overhead share, error and success
        rates (rounded to 3 decimals), total error occurrences, the number of
        schema-thrashing sessions, and p50/p90 token counts.
    """
    rows = [session_metrics(digest) for digest in digests]
    if not rows:
        return {"n_sessions": 0}

    count = len(rows)
    overhead_shares = [row["infra_overhead_share"] for row in rows]
    token_totals = [row["tokens"] for row in rows]
    errored = sum(row["has_error"] for row in rows)
    succeeded = sum(row["success"] for row in rows)
    thrashing = [row for row in rows if row["schema_load"] >= schema_thrash_threshold]

    summary: dict = {"n_sessions": count}
    summary["infra_overhead_share_median"] = _median(overhead_shares)
    summary["infra_overhead_share_p90"] = _pct(overhead_shares, 0.9)
    summary["error_rate"] = round(errored / count, 3)
    summary["recurring_error_occurrences"] = sum(row["error_occurrences"] for row in rows)
    summary["schema_thrash_sessions"] = len(thrashing)
    summary["tokens_p50"] = _pct(token_totals, 0.5)
    summary["tokens_p90"] = _pct(token_totals, 0.9)
    summary["success_rate"] = round(succeeded / count, 3)
    return summary


def snapshot(digests: list[dict], *, label: str = "") -> dict:
    """Aggregate *digests* and stamp the result for the baseline trend.

    The dict returned by :func:`aggregate` gains two keys: ``captured_at``
    (the current UTC timestamp from :func:`_now`) and ``label`` (the caller's
    free-form tag, empty by default).
    """
    record = aggregate(digests)
    record.update(captured_at=_now(), label=label)
    return record


def save_baseline(metrics: dict, path: str) -> None:
    """Append *metrics* as one key-sorted JSON line to the JSONL file at *path*.

    The parent directory of *path* is created when it does not exist yet; the
    file itself is opened in append mode, so earlier snapshots are kept.
    """
    parent = os.path.dirname(path)
    if not parent:
        # A bare filename has no directory component; target the current one.
        parent = "."
    os.makedirs(parent, exist_ok=True)
    with open(path, "a", encoding="utf-8") as out:
        line = json.dumps(metrics, sort_keys=True)
        out.write(line + "\n")


def load_baselines(path: str) -> list[dict]:
    """Load every snapshot from the baseline JSONL file at *path*.

    Returns the parsed JSON value of each non-blank line, in file order. A
    missing file yields an empty list. A line that is not valid JSON raises
    ``json.JSONDecodeError``.
    """
    # Open first and handle absence, rather than checking existence up front:
    # the file could disappear between an ``exists`` check and the ``open``.
    try:
        fh = open(path, encoding="utf-8")
    except FileNotFoundError:
        return []
    with fh:
        return [json.loads(line) for line in fh if line.strip()]