generated from coulomb/repo-seed
Closes the loop. metrics.py: fleet metrics (infra-overhead share, error rate, schema-thrash, token percentiles, success) + persisted baseline trend. effect.py: before/after per-pattern effectiveness with an improved verdict per metric. measure entrypoint with trend + --since effectiveness + JSON. Recorded pre-fix baseline: 27 sessions, overhead median 11.7%, error rate 0.96, schema-thrash 8. 13 new tests; suite 139/139. Capture->Detect->Curate->Distribute->Measure complete. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
64 lines
2.1 KiB
Python
64 lines
2.1 KiB
Python
"""Fleet metrics + baseline tests (WP-0009 T01)."""
|
|
|
|
import os
|
|
import sys
|
|
|
|
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
|
|
from session_memory.measure.metrics import ( # noqa: E402
|
|
aggregate,
|
|
load_baselines,
|
|
save_baseline,
|
|
session_metrics,
|
|
snapshot,
|
|
)
|
|
|
|
|
|
def _digest(tools=None, errors=0, tokens=100, outcome="success"):
|
|
return {
|
|
"outcome": outcome,
|
|
"cost": {"input_tokens": tokens, "output_tokens": 0},
|
|
"tool_histogram": tools or {"Bash": 10, "Edit": 5},
|
|
"error_snippets": [{"fingerprint": f"e{i}", "count": 1} for i in range(errors)],
|
|
}
|
|
|
|
|
|
def test_session_metrics_overhead_and_errors():
    """Check the overhead share and error fields of a single-session result."""
    histogram = {"mcp__state-hub__create_task": 6, "Bash": 4}
    metrics = session_metrics(_digest(tools=histogram, errors=2))

    # The expected share is 6 of the 10 tool calls; compare with a tolerance
    # because the value is a float.
    share_delta = metrics["infra_overhead_share"] - 0.6
    assert abs(share_delta) < 1e-9
    assert metrics["error_occurrences"] == 2
    assert metrics["has_error"] is True
|
|
|
|
|
|
def test_aggregate_rates_and_percentiles():
    """Check fleet-level rates and counts over three contrasting sessions."""
    # 80% overhead, the only session that carries an error snippet.
    overhead_heavy = _digest(
        tools={"mcp__state-hub__x": 8, "Bash": 2}, errors=1, tokens=50
    )
    # 0% overhead.
    overhead_free = _digest(tools={"Bash": 9, "Edit": 1}, errors=0, tokens=200)
    # The only non-success session, and the one with ToolSearch=6.
    thrashing = _digest(
        tools={"ToolSearch": 6, "Bash": 4}, errors=0, tokens=100, outcome="fail"
    )

    summary = aggregate([overhead_heavy, overhead_free, thrashing])

    assert summary["n_sessions"] == 3
    assert summary["error_rate"] == round(1 / 3, 3)
    assert summary["success_rate"] == round(2 / 3, 3)
    assert summary["schema_thrash_sessions"] == 1  # the ToolSearch=6 session
    median_share = summary["infra_overhead_share_median"]
    assert 0 <= median_share <= 1
|
|
|
|
|
|
def test_aggregate_empty():
    """Aggregating no digests yields only a zero session count."""
    summary = aggregate([])
    assert summary == {"n_sessions": 0}
|
|
|
|
|
|
def test_snapshot_has_timestamp_and_label():
    """A snapshot carries its label, a capture timestamp and the session count."""
    snap = snapshot([_digest()], label="baseline")

    assert snap["label"] == "baseline"
    assert "captured_at" in snap
    assert snap["n_sessions"] == 1
|
|
|
|
|
|
def test_baseline_roundtrip_appends(tmp_path):
    """Saving two baselines to one file keeps both, in the order written."""
    target = str(tmp_path / "baselines.jsonl")

    # Write label "a" with one session, then label "b" with two sessions.
    for label, session_count in (("a", 1), ("b", 2)):
        digests = [_digest() for _ in range(session_count)]
        save_baseline(snapshot(digests, label=label), target)

    rows = load_baselines(target)
    labels = [row["label"] for row in rows]
    assert labels == ["a", "b"]
    assert rows[1]["n_sessions"] == 2
|