IB-WP-0019-T04: plan-vs-actual variance and surfacing

After every generate run, compute variance between the executing plan
snapshot and the just-recorded usage rollup, persist it to
output/budget/summary.yaml (overwrite-on-run), and surface it both in
the generate status JSON (new budget_summary field) and as a "Plan
variance" line in reports/generation-summary.md.

Variance fields: calls / prompt_tokens / total_tokens each carry
{estimated, actual, delta, ratio}; cost_usd carries {estimated,
actual_known, actual_estimated_from_rates, actual_total, delta, ratio};
per_workflow rolls the per-bucket usage up to the same workflow_id grain
the plan reports. Runs whose snapshot_id cannot be resolved (no prior
plan, or pruned from the retention window) still record a variance row
with null comparison fields and snapshot_resolved=false, so the
consumer always sees a current summary.

Reordered run_generation so usage and variance are written before the
generation report, allowing the report to embed the variance line on
the same pass.

110 tests pass.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-05-17 20:06:19 +02:00
parent a4dde53fc3
commit d4c9c56f5c
4 changed files with 312 additions and 27 deletions

View File

@@ -382,6 +382,113 @@ def test_record_run_usage_fills_estimated_cost_via_resolver(tmp_path: Path) -> N
assert entry["rollup"]["total_cost_usd_estimated"] == round(0.0009, 6)
def test_record_run_variance_computes_plan_vs_actual(tmp_path: Path) -> None:
    """Check the variance summary for a run whose plan snapshot is not on disk.

    The run entry references snapshot ``abc123`` but no snapshot has been
    recorded, so the summary must flag ``snapshot_resolved`` as False and
    leave the estimated call count null while still reporting actual usage
    and the known / rate-estimated / total cost split.
    """
    root = _build_infospace(tmp_path)
    from infospace_bench.budget import record_run_variance

    usage_rollup = {
        "total_calls": 10,
        "total_prompt_tokens": 1500,
        "total_completion_tokens": 500,
        "total_tokens": 2000,
        "total_cost_usd_known": 0.1,
        "total_cost_usd_estimated": 0.05,
    }
    buckets = [
        {
            "workflow_id": "generic-source-entities",
            "calls": 6,
            "prompt_tokens": 1200,
            "completion_tokens": 400,
        },
        {
            "workflow_id": "generic-source-summary",
            "calls": 4,
            "prompt_tokens": 300,
            "completion_tokens": 100,
        },
    ]
    run_entry = {
        "run_index": 1,
        "snapshot_id": "abc123",
        "rollup": usage_rollup,
        "per_bucket": buckets,
        "duration_seconds": 3.5,
    }

    # No snapshot persisted yet — variance fields fall back to null
    summary = record_run_variance(root, run_entry)

    assert summary["snapshot_id"] == "abc123"
    assert summary["snapshot_resolved"] is False

    call_variance = summary["calls"]
    assert call_variance["estimated"] is None
    assert call_variance["actual"] == 10

    cost_variance = summary["cost_usd"]
    assert cost_variance["actual_known"] == 0.1
    assert cost_variance["actual_estimated_from_rates"] == 0.05
    assert cost_variance["actual_total"] == round(0.15, 6)
def test_record_run_variance_resolves_snapshot_and_computes_ratios(tmp_path: Path) -> None:
    """Check delta and ratio values once the referenced snapshot is recorded.

    A plan summary is generated without persisting, its totals are overridden
    with fixed estimates, and it is then recorded as a snapshot. The run entry
    points at that snapshot id, so the variance summary is expected to resolve
    it and report estimated-vs-actual figures for calls, prompt tokens and
    cost.
    """
    from infospace_bench.budget import record_plan_snapshot, record_run_variance

    root = _build_infospace(tmp_path)
    plan_summary = plan_generation(root, cost_per_1k_tokens=0.5, persist=False)

    # Pin the plan-side estimates so the expected deltas/ratios are fixed.
    pinned_estimates = {
        "total_provider_calls_estimate": 8,
        "total_prompt_tokens_estimate": 1000,
        "estimated_cost_usd": 0.5,
    }
    for field, value in pinned_estimates.items():
        plan_summary[field] = value
    snapshot_id = record_plan_snapshot(root, plan_summary)

    usage_rollup = {
        "total_calls": 10,
        "total_prompt_tokens": 1500,
        "total_completion_tokens": 500,
        "total_tokens": 2000,
        "total_cost_usd_known": 0.0,
        "total_cost_usd_estimated": 0.625,
    }
    run_entry = {
        "run_index": 1,
        "snapshot_id": snapshot_id,
        "rollup": usage_rollup,
        "per_bucket": [],
    }

    summary = record_run_variance(root, run_entry)

    assert summary["snapshot_resolved"] is True

    call_variance = summary["calls"]
    assert call_variance["estimated"] == 8
    assert call_variance["actual"] == 10
    assert call_variance["delta"] == 2
    assert call_variance["ratio"] == 1.25

    assert summary["prompt_tokens"]["delta"] == 500

    cost_variance = summary["cost_usd"]
    assert cost_variance["estimated"] == 0.5
    assert cost_variance["actual_total"] == 0.625
    assert cost_variance["delta"] == 0.125
    assert cost_variance["ratio"] == 1.25
def test_run_generation_persists_variance_and_status_surfaces_it(tmp_path: Path) -> None:
    """Check that a generate run writes the summary file and status exposes it.

    After planning and running generation against a fixture response file,
    the summary file must exist under the infospace root and the status
    payload's ``budget_summary`` must reference the planned snapshot, be
    marked resolved, and list the ``generic-source-entities`` workflow.
    """
    from infospace_bench.budget import SUMMARY_FILE
    from infospace_bench.generator import run_generation, status_generation

    root = _build_infospace(tmp_path)
    fixture = tmp_path / "responses.yaml"
    _write_minimal_fixture(fixture)

    plan_payload = plan_generation(root)
    run_generation(root, fixture_responses=fixture)
    status = status_generation(root)

    summary_path = root / SUMMARY_FILE
    assert summary_path.is_file()
    assert status["budget_summary"] is not None
    assert status["budget_summary"]["snapshot_id"] == plan_payload["snapshot_id"]
    assert status["budget_summary"]["snapshot_resolved"] is True

    # Fixture runs report zero known cost; per_workflow variance is keyed by workflow_id
    workflow_ids = {
        item["workflow_id"] for item in status["budget_summary"]["per_workflow"]
    }
    assert "generic-source-entities" in workflow_ids
def test_generation_report_includes_variance_line(tmp_path: Path) -> None:
    """Check that the generation report contains a plan-variance section.

    Plans and runs generation against a fixture response file, then reads
    ``reports/generation-summary.md`` and looks for the ``## Plan variance``
    heading and a case-insensitive mention of "calls".
    """
    from infospace_bench.generator import run_generation

    root = _build_infospace(tmp_path)
    fixture = tmp_path / "responses.yaml"
    _write_minimal_fixture(fixture)

    plan_generation(root)
    run_generation(root, fixture_responses=fixture)

    report_path = root / "reports" / "generation-summary.md"
    report = report_path.read_text(encoding="utf-8")

    assert "## Plan variance" in report
    assert "calls" in report.lower()
def test_plan_cli_writes_snapshot(tmp_path: Path) -> None:
root = _build_infospace(tmp_path)
env = os.environ.copy()