generated from coulomb/repo-seed
Closes the loop. metrics.py: fleet metrics (infra-overhead share, error rate, schema-thrash, token percentiles, success) + persisted baseline trend. effect.py: before/after per-pattern effectiveness with an improved verdict per metric. measure entrypoint with trend + --since effectiveness + JSON. Recorded pre-fix baseline: 27 sessions, overhead median 11.7%, error rate 0.96, schema-thrash 8. 13 new tests; suite 139/139. Capture->Detect->Curate->Distribute->Measure complete. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
61 lines
2.2 KiB
Python
61 lines
2.2 KiB
Python
"""Before/after per-pattern effectiveness (PRD §6.5 FR-M1/FR-M2; T02).
|
|
|
|
Given a change/pattern with an ``applied_at`` date, split sessions into *before*
|
|
and *after* by their start time, aggregate each side, and diff the headline
|
|
metrics — so we can say whether a distributed pattern (e.g. the Read-before-Edit
|
|
reflex, or the State Hub skill) actually moved the numbers, and retire it if not.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from .metrics import aggregate
|
|
|
|
# Headline metrics for which a drop after the change counts as improvement.
_LOWER_IS_BETTER = {
    "infra_overhead_share_median",
    "infra_overhead_share_p90",
    "error_rate",
    "recurring_error_occurrences",
    "schema_thrash_sessions",
    "tokens_p50",
    "tokens_p90",
}

# Headline metrics for which a rise after the change counts as improvement.
_HIGHER_IS_BETTER = {"success_rate"}
|
|
|
|
|
|
def split_by_date(digests: list[dict], applied_at: str) -> tuple[list[dict], list[dict]]:
    """Split ``digests`` into ``(before, after)`` around ``applied_at``.

    A digest goes to *after* when its ``started_at`` value is truthy and
    compares ``>=`` to ``applied_at``. The raw values are compared directly,
    so string timestamps are ordered as plain strings. Every other digest,
    including one whose ``started_at`` is missing, ``None`` or empty, goes to
    *before*. Input order is kept within each list.
    """
    earlier: list[dict] = []
    later: list[dict] = []
    for digest in digests:
        started = digest.get("started_at") or ""
        if started and started >= applied_at:
            later.append(digest)
        else:
            earlier.append(digest)
    return earlier, later
|
|
|
|
|
|
def _delta(metric: str, before: float | None, after: float | None) -> dict:
    """Describe how ``metric`` moved between the two periods.

    Args:
        metric: Metric name. Membership in ``_LOWER_IS_BETTER`` or
            ``_HIGHER_IS_BETTER`` decides which direction counts as improvement.
        before: Aggregated value for the period before the change.
        after: Aggregated value for the period after the change.

    Returns:
        A dict with ``before`` and ``after`` echoed back, ``change``
        (``after - before`` rounded to 3 decimals) and ``improved``.
        ``improved`` is judged on the rounded change and is ``None`` for a
        metric listed in neither direction set. When either value is
        ``None``, both ``change`` and ``improved`` are ``None``.
    """
    if before is None or after is None:
        # Nothing to subtract: report "unknown" instead of letting a
        # TypeError from ``None - float`` abort the caller's whole report.
        return {"before": before, "after": after, "change": None, "improved": None}

    change = round(after - before, 3)
    if change == 0:
        # A tiny negative difference rounds to -0.0; abs() folds it into a
        # plain zero (keeping int/float type) so output never shows "-0.0".
        change = abs(change)

    if metric in _LOWER_IS_BETTER:
        improved = change < 0
    elif metric in _HIGHER_IS_BETTER:
        improved = change > 0
    else:
        # Direction unknown for this metric, so no verdict is given.
        improved = None
    return {"before": before, "after": after, "change": change, "improved": improved}
|
|
|
|
|
|
def effectiveness(digests: list[dict], applied_at: str, *, label: str = "") -> dict:
    """Compare fleet metrics after ``applied_at`` against the prior period.

    Args:
        digests: Session digests, partitioned with ``split_by_date``.
        applied_at: Cut-over value each digest's ``started_at`` is compared to.
        label: Free-form name echoed back in the result.

    Returns:
        A dict with ``label``, ``applied_at``, the sizes ``n_before`` and
        ``n_after``, the two aggregates ``before`` and ``after``, the
        per-metric ``deltas`` and ``insufficient_data``.
        ``insufficient_data`` is ``True`` when either side has no digests;
        in that case ``deltas`` is left empty.
    """
    before, after = split_by_date(digests, applied_at)
    # Both sides are aggregated even when one is empty, so the result always
    # carries a "before" and an "after" entry.
    # NOTE(review): assumes aggregate() returns a mapping with .get(); confirm.
    b_agg, a_agg = aggregate(before), aggregate(after)
    insufficient = not (before and after)

    deltas: dict[str, dict] = {}
    if not insufficient:
        # A set of strings iterates in hash order, which differs between
        # interpreter runs; sorting keeps the key order of "deltas" (and of
        # any JSON written from it) reproducible.
        for name in sorted(_LOWER_IS_BETTER | _HIGHER_IS_BETTER):
            # A metric absent from an aggregate is diffed as 0.0.
            deltas[name] = _delta(name, b_agg.get(name, 0.0), a_agg.get(name, 0.0))

    return {
        "label": label,
        "applied_at": applied_at,
        "n_before": len(before),
        "n_after": len(after),
        "before": b_agg,
        "after": a_agg,
        "deltas": deltas,
        "insufficient_data": insufficient,
    }
|