generated from coulomb/repo-seed
session-memory: Phase 4 Measure — baseline, effectiveness, trend (WP-0009)
Closes the loop. metrics.py: fleet metrics (infra-overhead share, error rate, schema-thrash, token percentiles, success) + persisted baseline trend. effect.py: before/after per-pattern effectiveness with an improved verdict per metric. measure entrypoint with trend + --since effectiveness + JSON. Recorded pre-fix baseline: 27 sessions, overhead median 11.7%, error rate 0.96, schema-thrash 8. 13 new tests; suite 139/139. Capture->Detect->Curate->Distribute->Measure complete. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
101
session_memory/measure/__main__.py
Normal file
101
session_memory/measure/__main__.py
Normal file
@@ -0,0 +1,101 @@
|
||||
"""Measure entrypoint (T03): fleet trend + per-pattern effectiveness.
|
||||
|
||||
python -m session_memory.measure [--config PATH] [--label L] [--since DATE]
|
||||
[--no-save] [--json]
|
||||
|
||||
Computes current fleet metrics over the real (quality-filtered) sessions, appends
|
||||
them to the baseline trend, and reports whether the fleet is getting cheaper /
|
||||
more reliable over time (FR-M3). With ``--since DATE`` it also reports before/after
|
||||
effectiveness around a change (FR-M1/FR-M2).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
|
||||
from ..core.store import Store
|
||||
from ..detect.quality import filter_real, quality_config
|
||||
from ..ingest import _expand, load_config
|
||||
from .effect import effectiveness
|
||||
from .metrics import load_baselines, save_baseline, snapshot
|
||||
|
||||
_TREND_KEYS = ("infra_overhead_share_median", "error_rate", "schema_thrash_sessions",
|
||||
"tokens_p50", "success_rate")
|
||||
|
||||
|
||||
def real_digests(config: dict) -> list[dict]:
    """Return the stored session digests that pass the quality filter.

    Opens the store described by ``config["store"]`` (its ``db_path`` and
    ``blob_dir`` entries, each passed through ``_expand``), lists every digest
    and keeps those accepted by ``filter_real`` under ``quality_config(config)``.

    The store is closed even if listing or filtering raises, so a failure in
    the middle does not leave the store open.

    Raises:
        KeyError: if the ``store`` section has no ``db_path`` or ``blob_dir``.
    """
    store_cfg = config.get("store", {})
    store = Store(_expand(store_cfg["db_path"]), _expand(store_cfg["blob_dir"]))
    try:
        return filter_real(store.list_digests(), quality_config(config))
    finally:
        # Runs on both the success and the error path.
        store.close()
|
||||
|
||||
|
||||
def _fmt_trend(baselines: list[dict]) -> str:
|
||||
if not baselines:
|
||||
return " (no prior snapshots)"
|
||||
lines = []
|
||||
recent = baselines[-5:]
|
||||
for b in recent:
|
||||
when = (b.get("captured_at") or "")[:10]
|
||||
lbl = f" {b['label']}" if b.get("label") else ""
|
||||
lines.append(f" {when}{lbl}: overhead_med={b.get('infra_overhead_share_median')} "
|
||||
f"err_rate={b.get('error_rate')} schema_thrash={b.get('schema_thrash_sessions')} "
|
||||
f"tok_p50={b.get('tokens_p50')} success={b.get('success_rate')} "
|
||||
f"(n={b.get('n_sessions')})")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
def _report(current: dict, baselines: list[dict], eff: dict | None) -> str:
    """Render the plain-text measure report.

    The report has up to three sections: the current value of every key in
    ``_TREND_KEYS``, the recent snapshot trend (via ``_fmt_trend``) and, when
    ``eff`` is given, the before/after effectiveness per metric.

    Args:
        current: the current metrics snapshot; missing keys print as ``None``.
        baselines: snapshots to show in the trend section.
        eff: effectiveness result, or ``None`` to omit that section. When
            present it must provide ``applied_at``, ``n_before``, ``n_after``
            and ``insufficient_data``, plus ``deltas`` when data is sufficient.

    Returns:
        The report as a single newline-joined string.
    """
    lines = [f"# Fleet metrics (n={current.get('n_sessions')} real sessions)"]
    lines.extend(f" {key} = {current.get(key)}" for key in _TREND_KEYS)
    lines.append("\n## Trend (recent snapshots)")
    lines.append(_fmt_trend(baselines))

    if eff is not None:
        lines.append(f"\n## Effectiveness since {eff['applied_at']} "
                     f"(before={eff['n_before']}, after={eff['n_after']})")
        if eff["insufficient_data"]:
            lines.append(" insufficient data on one side of the date")
        else:
            verdicts = {True: "improved", False: "worse", None: "—"}
            for key in _TREND_KEYS:
                delta = eff["deltas"].get(key, {})
                change = delta.get("change")
                # A metric absent from the deltas (or with no computed change)
                # yields None, and formatting None with "+" raises TypeError.
                change_txt = "n/a" if change is None else f"{change:+}"
                lines.append(f" {key}: {delta.get('before')} -> {delta.get('after')} "
                             f"({change_txt}) {verdicts[delta.get('improved')]}")
    return "\n".join(lines)
|
||||
|
||||
|
||||
def main(argv=None) -> int:
    """Run the measure command and print its report.

    Parses ``argv`` (``--config``, ``--label``, ``--since``, ``--no-save``,
    ``--json``), snapshots the quality-filtered digests, and prints either the
    text report or a JSON document. The default config file is ``config.toml``
    in the parent of this file's directory.

    Prior baselines are loaded before the optional save, and the current
    snapshot is appended to them in memory for display. Effectiveness is only
    computed when ``--since`` is given.

    Returns:
        Always 0.
    """
    package_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

    parser = argparse.ArgumentParser(description="Measure fleet metrics + per-pattern effectiveness.")
    parser.add_argument("--config", default=os.path.join(package_root, "config.toml"))
    parser.add_argument("--label", default="")
    parser.add_argument("--since", default=None, help="ISO date for before/after effectiveness")
    parser.add_argument("--no-save", action="store_true", help="don't append to the baseline trend")
    parser.add_argument("--json", action="store_true")
    opts = parser.parse_args(argv)

    config = load_config(opts.config)
    digests = real_digests(config)
    current = snapshot(digests, label=opts.label)

    measure_cfg = config.get("measure", {})
    baselines_path = _expand(measure_cfg.get("baselines", "session_memory/measure/baselines.jsonl"))
    prior = load_baselines(baselines_path)
    if not opts.no_save:
        save_baseline(current, baselines_path)

    eff = None
    if opts.since:
        eff = effectiveness(digests, opts.since, label=opts.label)

    # The displayed trend always ends with the current snapshot, saved or not.
    trend = prior + [current]
    if opts.json:
        payload = {"current": current, "trend": trend, "effectiveness": eff}
        print(json.dumps(payload, indent=2))
    else:
        print(_report(current, trend, eff))
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the process exit status.
    exit_status = main()
    raise SystemExit(exit_status)
|
||||
Reference in New Issue
Block a user