generated from coulomb/repo-seed
IB-WP-0019-T04: plan-vs-actual variance and surfacing
After every generate run, compute variance between the executing plan
snapshot and the just-recorded usage rollup, persist it to
output/budget/summary.yaml (overwrite-on-run), and surface it both in
the generate status JSON (new budget_summary field) and as a "Plan
variance" line in reports/generation-summary.md.
Variance fields: calls / prompt_tokens / total_tokens each carry
{estimated, actual, delta, ratio}; cost_usd carries {estimated,
actual_known, actual_estimated_from_rates, actual_total, delta, ratio};
per_workflow rolls the per-bucket usage up to the same workflow_id grain
the plan reports. Runs whose snapshot_id cannot be resolved (no prior
plan, or pruned from the retention window) still record a variance row
with null comparison fields and snapshot_resolved=false, so the
consumer always sees a current summary.
Reordered run_generation so usage and variance are written before the
generation report, allowing the report to embed the variance line on
the same pass.
110 tests pass.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -382,6 +382,113 @@ def test_record_run_usage_fills_estimated_cost_via_resolver(tmp_path: Path) -> N
|
||||
assert entry["rollup"]["total_cost_usd_estimated"] == round(0.0009, 6)
|
||||
|
||||
|
||||
def test_record_run_variance_computes_plan_vs_actual(tmp_path: Path) -> None:
    """Check the variance summary recorded when no plan snapshot exists.

    The run entry references a snapshot id that was never persisted, so the
    summary is expected to report ``snapshot_resolved`` as ``False``, leave the
    estimated side empty, and still carry the actual usage figures.
    """
    infospace_root = _build_infospace(tmp_path)
    from infospace_bench.budget import record_run_variance

    usage_rollup = {
        "total_calls": 10,
        "total_prompt_tokens": 1500,
        "total_completion_tokens": 500,
        "total_tokens": 2000,
        "total_cost_usd_known": 0.1,
        "total_cost_usd_estimated": 0.05,
    }
    usage_buckets = [
        {
            "workflow_id": "generic-source-entities",
            "calls": 6,
            "prompt_tokens": 1200,
            "completion_tokens": 400,
        },
        {
            "workflow_id": "generic-source-summary",
            "calls": 4,
            "prompt_tokens": 300,
            "completion_tokens": 100,
        },
    ]
    entry = {
        "run_index": 1,
        "snapshot_id": "abc123",
        "rollup": usage_rollup,
        "per_bucket": usage_buckets,
        "duration_seconds": 3.5,
    }

    # Nothing has been persisted under "abc123", so the comparison fields
    # fall back to null while the actuals are still reported.
    variance = record_run_variance(infospace_root, entry)

    assert variance["snapshot_id"] == "abc123"
    assert variance["snapshot_resolved"] is False

    call_stats = variance["calls"]
    assert call_stats["estimated"] is None
    assert call_stats["actual"] == 10

    cost_stats = variance["cost_usd"]
    assert cost_stats["actual_known"] == 0.1
    assert cost_stats["actual_estimated_from_rates"] == 0.05
    assert cost_stats["actual_total"] == round(0.15, 6)
|
||||
|
||||
|
||||
def test_record_run_variance_resolves_snapshot_and_computes_ratios(tmp_path: Path) -> None:
    """Check delta and ratio fields when the run's plan snapshot is resolvable.

    A plan summary is generated without persisting, its estimate totals are
    overwritten with round numbers, and it is then recorded as a snapshot. The
    run entry points at that snapshot so the variance can be compared against
    known estimates.
    """
    from infospace_bench.budget import record_plan_snapshot, record_run_variance

    infospace_root = _build_infospace(tmp_path)
    plan = plan_generation(infospace_root, cost_per_1k_tokens=0.5, persist=False)

    # Pin the estimates so the expected deltas and ratios below are exact.
    pinned_estimates = {
        "total_provider_calls_estimate": 8,
        "total_prompt_tokens_estimate": 1000,
        "estimated_cost_usd": 0.5,
    }
    for field, value in pinned_estimates.items():
        plan[field] = value
    recorded_id = record_plan_snapshot(infospace_root, plan)

    usage_rollup = {
        "total_calls": 10,
        "total_prompt_tokens": 1500,
        "total_completion_tokens": 500,
        "total_tokens": 2000,
        "total_cost_usd_known": 0.0,
        "total_cost_usd_estimated": 0.625,
    }
    entry = {
        "run_index": 1,
        "snapshot_id": recorded_id,
        "rollup": usage_rollup,
        "per_bucket": [],
    }

    variance = record_run_variance(infospace_root, entry)

    assert variance["snapshot_resolved"] is True

    call_stats = variance["calls"]
    assert call_stats["estimated"] == 8
    assert call_stats["actual"] == 10
    assert call_stats["delta"] == 2
    assert call_stats["ratio"] == 1.25

    assert variance["prompt_tokens"]["delta"] == 500

    cost_stats = variance["cost_usd"]
    assert cost_stats["estimated"] == 0.5
    assert cost_stats["actual_total"] == 0.625
    assert cost_stats["delta"] == 0.125
    assert cost_stats["ratio"] == 1.25
|
||||
|
||||
|
||||
def test_run_generation_persists_variance_and_status_surfaces_it(tmp_path: Path) -> None:
    """Check that a generate run writes the summary file and status exposes it.

    After planning and then running generation against a fixture, the summary
    file must exist under the infospace root and the status payload's
    ``budget_summary`` must reference the snapshot id returned by the plan.
    """
    from infospace_bench.budget import SUMMARY_FILE
    from infospace_bench.generator import run_generation, status_generation

    infospace_root = _build_infospace(tmp_path)
    responses_path = tmp_path / "responses.yaml"
    _write_minimal_fixture(responses_path)
    plan = plan_generation(infospace_root)

    run_generation(infospace_root, fixture_responses=responses_path)
    status_payload = status_generation(infospace_root)

    summary_path = infospace_root / SUMMARY_FILE
    assert summary_path.is_file()

    assert status_payload["budget_summary"] is not None
    budget_summary = status_payload["budget_summary"]
    assert budget_summary["snapshot_id"] == plan["snapshot_id"]
    assert budget_summary["snapshot_resolved"] is True

    # Per-workflow variance rows are identified by workflow_id (fixture runs
    # report zero known cost, so only the presence of the row is checked).
    workflow_ids = {row["workflow_id"] for row in budget_summary["per_workflow"]}
    assert "generic-source-entities" in workflow_ids
|
||||
|
||||
|
||||
def test_generation_report_includes_variance_line(tmp_path: Path) -> None:
    """Check that the generation report contains the plan variance heading.

    Plans and runs generation against a minimal fixture, then reads
    ``reports/generation-summary.md`` and looks for the ``## Plan variance``
    heading and a case-insensitive mention of "calls".
    """
    from infospace_bench.generator import run_generation

    infospace_root = _build_infospace(tmp_path)
    responses_path = tmp_path / "responses.yaml"
    _write_minimal_fixture(responses_path)

    # A plan is created first; the run that follows writes the report.
    plan_generation(infospace_root)
    run_generation(infospace_root, fixture_responses=responses_path)

    report_path = infospace_root / "reports" / "generation-summary.md"
    report_text = report_path.read_text(encoding="utf-8")

    assert "## Plan variance" in report_text
    assert "calls" in report_text.lower()
|
||||
|
||||
|
||||
def test_plan_cli_writes_snapshot(tmp_path: Path) -> None:
|
||||
root = _build_infospace(tmp_path)
|
||||
env = os.environ.copy()
|
||||
|
||||
Reference in New Issue
Block a user