generated from coulomb/repo-seed
session-memory: Phase 4 Measure — baseline, effectiveness, trend (WP-0009)
Closes the loop. metrics.py: fleet metrics (infra-overhead share, error rate, schema-thrash, token percentiles, success) + persisted baseline trend. effect.py: before/after per-pattern effectiveness with an improved verdict per metric. measure entrypoint with trend + --since effectiveness + JSON. Recorded pre-fix baseline: 27 sessions, overhead median 11.7%, error rate 0.96, schema-thrash 8. 13 new tests; suite 139/139. Capture->Detect->Curate->Distribute->Measure complete. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
49
tests/test_measure_effect.py
Normal file
49
tests/test_measure_effect.py
Normal file
@@ -0,0 +1,49 @@
|
||||
"""Before/after effectiveness tests (WP-0009 T02)."""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from session_memory.measure.effect import effectiveness, split_by_date # noqa: E402
|
||||
|
||||
|
||||
def _digest(ts, tools=None, errors=0, outcome="success"):
    """Build a minimal session-digest dict for the effectiveness tests.

    Args:
        ts: Value stored under ``started_at`` (the tests pass ISO date
            strings such as ``"2026-06-01"``).
        tools: Mapping stored under ``tool_histogram``. Any falsy value
            (``None`` or an empty dict) falls back to ``{"Bash": 10}``.
        errors: Number of error snippets to synthesise.
        outcome: Value stored under ``outcome``.

    Returns:
        A new dict with the keys ``started_at``, ``outcome``, ``cost``,
        ``tool_histogram`` and ``error_snippets``. ``cost`` is always
        100 input tokens and 0 output tokens.
    """
    return {
        "started_at": ts,
        "outcome": outcome,
        "cost": {"input_tokens": 100, "output_tokens": 0},
        "tool_histogram": tools or {"Bash": 10},
        # One snippet per requested error, each with a distinct
        # fingerprint ("e0", "e1", ...) and a count of 1.
        "error_snippets": [
            {"fingerprint": f"e{i}", "count": 1} for i in range(errors)
        ],
    }
|
||||
|
||||
|
||||
def test_split_by_date():
    """A digest dated exactly on the cut-off belongs to the "after" group."""
    digs = [_digest("2026-06-01"), _digest("2026-06-05"), _digest("2026-06-10")]
    before, after = split_by_date(digs, "2026-06-05")
    # Separate asserts so a failure shows which side of the split is wrong.
    assert len(before) == 1
    assert len(after) == 2  # >= applied_at goes to after
|
||||
|
||||
|
||||
def test_effectiveness_detects_improvement():
    """Error rate and infra-overhead share are flagged as improved."""
    # before: lots of errors + hub overhead; after: clean
    before = [
        _digest("2026-06-01", tools={"mcp__state-hub__x": 8, "Bash": 2}, errors=3)
        for _ in range(3)
    ]
    after = [_digest("2026-06-10", tools={"Bash": 10}, errors=0) for _ in range(3)]

    e = effectiveness(before + after, "2026-06-05", label="read-before-edit")

    assert not e["insufficient_data"]
    # Separate asserts so a failure shows which group was miscounted.
    assert e["n_before"] == 3
    assert e["n_after"] == 3
    assert e["deltas"]["error_rate"]["improved"] is True
    assert e["deltas"]["infra_overhead_share_median"]["improved"] is True
    # The error rate dropped, so the reported change must be negative.
    assert e["deltas"]["error_rate"]["change"] < 0
|
||||
|
||||
|
||||
def test_effectiveness_insufficient_data():
    """With no digests after the cut-off, no deltas are reported."""
    # The only digest is dated before the cut-off, so "after" is empty.
    e = effectiveness([_digest("2026-06-01")], "2026-06-05")
    assert e["insufficient_data"] is True
    assert e["deltas"] == {}
|
||||
|
||||
|
||||
def test_success_rate_higher_is_better():
    """Going from failing to succeeding sessions counts as improved."""
    before = [_digest("2026-06-01", outcome="fail") for _ in range(2)]
    after = [_digest("2026-06-10", outcome="success") for _ in range(2)]
    e = effectiveness(before + after, "2026-06-05")
    assert e["deltas"]["success_rate"]["improved"] is True
|
||||
Reference in New Issue
Block a user