# agentic-resources/tests/test_measure_metrics.py

"""Fleet metrics + baseline tests (WP-0009 T01)."""

import os
import sys

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from session_memory.measure.metrics import (  # noqa: E402
    aggregate,
    load_baselines,
    save_baseline,
    session_metrics,
    snapshot,
)


def _digest(tools=None, errors=0, tokens=100, outcome="success"):
    """Build a minimal session digest dict for the metrics tests.

    Args:
        tools: Mapping of tool name to call count, stored as
            ``tool_histogram``. ``None`` selects the default histogram
            ``{"Bash": 10, "Edit": 5}``; an explicitly passed empty mapping
            is kept as-is.
        errors: Number of synthetic error snippets to generate. Each gets a
            distinct fingerprint (``e0``, ``e1``, ...) and a count of 1.
        tokens: Value stored as ``cost["input_tokens"]``; ``output_tokens``
            is always 0.
        outcome: Value stored under ``"outcome"``.

    Returns:
        A new dict with the keys ``outcome``, ``cost``, ``tool_histogram``
        and ``error_snippets``.
    """
    # Compare against None instead of relying on truthiness, so an explicit
    # empty histogram is not silently swapped for the default one.
    histogram = {"Bash": 10, "Edit": 5} if tools is None else tools
    return {
        "outcome": outcome,
        "cost": {"input_tokens": tokens, "output_tokens": 0},
        "tool_histogram": histogram,
        "error_snippets": [{"fingerprint": f"e{i}", "count": 1} for i in range(errors)],
    }


def test_session_metrics_overhead_and_errors():
    """Check the overhead share and error fields computed for one digest."""
    histogram = {"mcp__state-hub__create_task": 6, "Bash": 4}
    metrics = session_metrics(_digest(tools=histogram, errors=2))

    # 6 of the 10 tool calls are mcp__state-hub calls; the expected share is 0.6.
    overhead_delta = metrics["infra_overhead_share"] - 0.6
    assert abs(overhead_delta) < 1e-9
    assert metrics["error_occurrences"] == 2
    assert metrics["has_error"] is True


def test_aggregate_rates_and_percentiles():
    """Check session count, rates and thrash count over three digests."""
    # 80% overhead, one error, successful.
    overhead_heavy = _digest(
        tools={"mcp__state-hub__x": 8, "Bash": 2}, errors=1, tokens=50
    )
    # 0% overhead, no errors, successful.
    plain = _digest(tools={"Bash": 9, "Edit": 1}, errors=0, tokens=200)
    # Failed session with six ToolSearch calls.
    thrashing = _digest(
        tools={"ToolSearch": 6, "Bash": 4}, errors=0, tokens=100, outcome="fail"
    )

    summary = aggregate([overhead_heavy, plain, thrashing])

    assert summary["n_sessions"] == 3
    assert summary["error_rate"] == round(1 / 3, 3)
    assert summary["success_rate"] == round(2 / 3, 3)
    # Only the ToolSearch=6 session is expected to count as schema thrash.
    assert summary["schema_thrash_sessions"] == 1
    median_share = summary["infra_overhead_share_median"]
    assert 0 <= median_share <= 1


def test_aggregate_empty():
    """Aggregating no digests yields only a zero session count."""
    expected = {"n_sessions": 0}
    assert aggregate([]) == expected


def test_snapshot_has_timestamp_and_label():
    """A snapshot carries its label, a ``captured_at`` key and the session count."""
    snap = snapshot([_digest()], label="baseline")
    assert snap["label"] == "baseline"
    assert "captured_at" in snap
    assert snap["n_sessions"] == 1


def test_baseline_roundtrip_appends(tmp_path):
    """Two baselines saved to the same file are loaded back in save order."""
    path = str(tmp_path / "baselines.jsonl")

    # Save snapshot "a" over one digest, then "b" over two, to the same path.
    for label, n_digests in (("a", 1), ("b", 2)):
        digests = [_digest() for _ in range(n_digests)]
        save_baseline(snapshot(digests, label=label), path)

    loaded = load_baselines(path)
    labels = [row["label"] for row in loaded]
    assert labels == ["a", "b"]
    assert loaded[1]["n_sessions"] == 2