generated from coulomb/repo-seed
session-memory: Phase 4 Measure — baseline, effectiveness, trend (WP-0009)
Closes the loop. metrics.py: fleet metrics (infra-overhead share, error rate, schema-thrash, token percentiles, success) + persisted baseline trend. effect.py: before/after per-pattern effectiveness with an improved verdict per metric. measure entrypoint with trend + --since effectiveness + JSON. Recorded pre-fix baseline: 27 sessions, overhead median 11.7%, error rate 0.96, schema-thrash 8. 13 new tests; suite 139/139. Capture->Detect->Curate->Distribute->Measure complete. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
60
session_memory/measure/effect.py
Normal file
60
session_memory/measure/effect.py
Normal file
@@ -0,0 +1,60 @@
|
||||
"""Before/after per-pattern effectiveness (PRD §6.5 FR-M1/FR-M2; T02).
|
||||
|
||||
Given a change/pattern with an ``applied_at`` date, split sessions into *before*
|
||||
and *after* by their start time, aggregate each side, and diff the headline
|
||||
metrics — so we can say whether a distributed pattern (e.g. the Read-before-Edit
|
||||
reflex, or the State Hub skill) actually moved the numbers, and retire it if not.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from .metrics import aggregate
|
||||
|
||||
# Direction table for the headline metrics compared before/after a change.
# For every metric listed here, a drop after the change counts as improvement.
_LOWER_IS_BETTER = {
    "infra_overhead_share_median",
    "infra_overhead_share_p90",
    "error_rate",
    "recurring_error_occurrences",
    "schema_thrash_sessions",
    "tokens_p50",
    "tokens_p90",
}

# For every metric listed here, a rise after the change counts as improvement.
_HIGHER_IS_BETTER = {
    "success_rate",
}
|
||||
|
||||
|
||||
def split_by_date(digests: list[dict], applied_at: str) -> tuple[list[dict], list[dict]]:
    """Split session digests into those started before vs. on/after ``applied_at``.

    Each digest's ``started_at`` value is compared to ``applied_at`` with ``>=``
    (lexicographic when both are strings). A digest whose ``started_at`` is
    missing or falsy cannot be placed in time and is put in the *before* group.

    Args:
        digests: Session digest dicts; only the ``started_at`` key is read.
        applied_at: Cut-over timestamp; a digest equal to it counts as *after*.

    Returns:
        A ``(before, after)`` tuple. Input order is preserved within each list.
    """
    earlier: list[dict] = []
    later: list[dict] = []
    for digest in digests:
        stamp = digest.get("started_at") or ""
        if stamp and stamp >= applied_at:
            later.append(digest)
        else:
            earlier.append(digest)
    return earlier, later
|
||||
|
||||
|
||||
def _delta(metric: str, before: float, after: float) -> dict:
    """Describe how ``metric`` moved from ``before`` to ``after``.

    Returns:
        A dict with the two inputs unchanged, ``change`` (``after - before``
        rounded to 3 decimals) and ``improved``. ``improved`` is a bool when
        the metric appears in one of the module's direction sets and ``None``
        when it appears in neither. A change of exactly zero (after rounding)
        is reported as not improved.
    """
    change = round(after - before, 3)

    # Stay undecided unless the metric has a known direction; the
    # lower-is-better set is consulted first, as in the membership order below.
    improved = None
    if metric in _LOWER_IS_BETTER:
        improved = change < 0
    elif metric in _HIGHER_IS_BETTER:
        improved = change > 0

    return dict(before=before, after=after, change=change, improved=improved)
|
||||
|
||||
|
||||
def effectiveness(digests: list[dict], applied_at: str, *, label: str = "") -> dict:
    """Compare fleet metrics after ``applied_at`` against the prior period.

    The digests are partitioned with ``split_by_date`` and each side is passed
    to ``aggregate`` (imported from ``.metrics``). When both sides are
    non-empty, every metric named in the module's direction sets gets a
    ``_delta`` entry; a metric absent from an aggregate is read as ``0.0``.

    Args:
        digests: Session digest dicts.
        applied_at: Cut-over timestamp separating *before* from *after*.
        label: Free-form name of the change being evaluated; echoed back as-is.

    Returns:
        A dict with ``label``, ``applied_at``, the group sizes ``n_before`` /
        ``n_after``, the two aggregates ``before`` / ``after``, the per-metric
        ``deltas`` (empty when either side has no sessions) and
        ``insufficient_data`` (``True`` when either side is empty).
    """
    before, after = split_by_date(digests, applied_at)
    b_agg, a_agg = aggregate(before), aggregate(after)
    comparable = bool(before and after)

    deltas = {}
    if comparable:
        # Iterate in sorted order: a set of strings has no stable iteration
        # order across interpreter runs (hash randomization), which would make
        # the key order of ``deltas`` -- and any JSON dumped from it -- vary
        # from one invocation to the next.
        for m in sorted(_LOWER_IS_BETTER | _HIGHER_IS_BETTER):
            deltas[m] = _delta(m, b_agg.get(m, 0.0), a_agg.get(m, 0.0))

    return {
        "label": label,
        "applied_at": applied_at,
        "n_before": len(before),
        "n_after": len(after),
        "before": b_agg,
        "after": a_agg,
        "deltas": deltas,
        "insufficient_data": not comparable,
    }
|
||||
Reference in New Issue
Block a user