IB-WP-0019-T04: plan-vs-actual variance and surfacing

After every generate run, compute variance between the executing plan snapshot and the just-recorded usage rollup, persist it to output/budget/summary.yaml (overwrite-on-run), and surface it both in the generate status JSON (new budget_summary field) and as a "Plan variance" line in reports/generation-summary.md. Variance fields: calls / prompt_tokens / total_tokens each carry {estimated, actual, delta, ratio}; cost_usd carries {estimated, actual_known, actual_estimated_from_rates, actual_total, delta, ratio}; per_workflow rolls the per-bucket usage up to the same workflow_id grain the plan reports. Runs whose snapshot_id cannot be resolved (no prior plan, or pruned from the retention window) still record a variance row with null comparison fields and snapshot_resolved=false, so the consumer always sees a current summary. Reordered run_generation so usage and variance are written before the generation report, allowing the report to embed the variance line on the same pass. 110 tests pass. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-17 20:06:19 +02:00
parent a4dde53fc3
commit d4c9c56f5c
4 changed files with 312 additions and 27 deletions
--- a/tests/test_budget_registry.py
+++ b/tests/test_budget_registry.py
@@ -382,6 +382,113 @@ def test_record_run_usage_fills_estimated_cost_via_resolver(tmp_path: Path) -> N
    assert entry["rollup"]["total_cost_usd_estimated"] == round(0.0009, 6)


+def test_record_run_variance_computes_plan_vs_actual(tmp_path: Path) -> None:  # NOTE: despite the name, this exercises only the unresolved-snapshot fallback (snapshot_resolved is asserted False)
+    root = _build_infospace(tmp_path)
+    from infospace_bench.budget import record_run_variance
+
+    run_entry = {
+        "run_index": 1,
+        "snapshot_id": "abc123",
+        "rollup": {
+            "total_calls": 10,
+            "total_prompt_tokens": 1500,
+            "total_completion_tokens": 500,
+            "total_tokens": 2000,
+            "total_cost_usd_known": 0.1,
+            "total_cost_usd_estimated": 0.05,
+        },
+        "per_bucket": [
+            {"workflow_id": "generic-source-entities", "calls": 6, "prompt_tokens": 1200, "completion_tokens": 400},
+            {"workflow_id": "generic-source-summary", "calls": 4, "prompt_tokens": 300, "completion_tokens": 100},
+        ],
+        "duration_seconds": 3.5,
+    }
+
+    # This test records no plan snapshot, so "abc123" is expected to be unresolvable: estimated fields come back as None while actuals still mirror the rollup (assumes _build_infospace persists no snapshot - confirm)
+    summary = record_run_variance(root, run_entry)
+
+    assert summary["snapshot_id"] == "abc123"
+    assert summary["snapshot_resolved"] is False
+    assert summary["calls"]["estimated"] is None
+    assert summary["calls"]["actual"] == 10
+    assert summary["cost_usd"]["actual_known"] == 0.1
+    assert summary["cost_usd"]["actual_estimated_from_rates"] == 0.05
+    assert summary["cost_usd"]["actual_total"] == round(0.15, 6)  # expected total = known (0.1) + estimated-from-rates (0.05), compared at 6 decimal places
+
+
+def test_record_run_variance_resolves_snapshot_and_computes_ratios(tmp_path: Path) -> None:  # resolved-snapshot path: estimates come from a recorded plan snapshot
+    from infospace_bench.budget import record_plan_snapshot, record_run_variance
+
+    root = _build_infospace(tmp_path)
+    plan_summary = plan_generation(root, cost_per_1k_tokens=0.5, persist=False)  # persist=False: the snapshot is recorded explicitly below, after the estimates are overridden
+    plan_summary["total_provider_calls_estimate"] = 8  # overrides pin the planner's estimates so the expected deltas/ratios below are exact
+    plan_summary["total_prompt_tokens_estimate"] = 1000
+    plan_summary["estimated_cost_usd"] = 0.5
+    snapshot_id = record_plan_snapshot(root, plan_summary)
+
+    run_entry = {
+        "run_index": 1,
+        "snapshot_id": snapshot_id,
+        "rollup": {
+            "total_calls": 10,
+            "total_prompt_tokens": 1500,
+            "total_completion_tokens": 500,
+            "total_tokens": 2000,
+            "total_cost_usd_known": 0.0,
+            "total_cost_usd_estimated": 0.625,
+        },
+        "per_bucket": [],
+    }
+
+    summary = record_run_variance(root, run_entry)
+
+    assert summary["snapshot_resolved"] is True
+    assert summary["calls"]["estimated"] == 8
+    assert summary["calls"]["actual"] == 10
+    assert summary["calls"]["delta"] == 2  # expected: actual - estimated = 10 - 8
+    assert summary["calls"]["ratio"] == 1.25  # expected: actual / estimated = 10 / 8
+    assert summary["prompt_tokens"]["delta"] == 500  # expected: 1500 - 1000
+    assert summary["cost_usd"]["estimated"] == 0.5
+    assert summary["cost_usd"]["actual_total"] == 0.625  # known (0.0) + estimated-from-rates (0.625)
+    assert summary["cost_usd"]["delta"] == 0.125  # expected: 0.625 - 0.5 (exactly representable in binary floating point)
+    assert summary["cost_usd"]["ratio"] == 1.25  # expected: 0.625 / 0.5
+
+
+def test_run_generation_persists_variance_and_status_surfaces_it(tmp_path: Path) -> None:  # end-to-end: plan -> run -> status, checking the summary file and the budget_summary status field
+    from infospace_bench.budget import SUMMARY_FILE
+    from infospace_bench.generator import run_generation, status_generation
+
+    root = _build_infospace(tmp_path)
+    fixture = tmp_path / "responses.yaml"
+    _write_minimal_fixture(fixture)
+    plan_payload = plan_generation(root)  # the returned payload must carry "snapshot_id" (read below); presumably the plan is persisted by default - confirm
+
+    run_generation(root, fixture_responses=fixture)
+    status = status_generation(root)
+
+    assert (root / SUMMARY_FILE).is_file()
+    assert status["budget_summary"] is not None
+    assert status["budget_summary"]["snapshot_id"] == plan_payload["snapshot_id"]
+    assert status["budget_summary"]["snapshot_resolved"] is True
+    # per_workflow is iterated as a list of dicts carrying "workflow_id" and indexed by that key here; fixture runs are said to report zero known cost, but that is not asserted in this test
+    per_workflow = {item["workflow_id"]: item for item in status["budget_summary"]["per_workflow"]}
+    assert "generic-source-entities" in per_workflow
+
+
+def test_generation_report_includes_variance_line(tmp_path: Path) -> None:  # checks that the generation report gains a "## Plan variance" section after a run
+    from infospace_bench.generator import run_generation
+
+    root = _build_infospace(tmp_path)
+    fixture = tmp_path / "responses.yaml"
+    _write_minimal_fixture(fixture)
+    plan_generation(root)  # plan first so the run has a snapshot to compare against (assumes the default call persists it - confirm)
+    run_generation(root, fixture_responses=fixture)
+
+    report = (root / "reports" / "generation-summary.md").read_text(encoding="utf-8")
+    assert "## Plan variance" in report
+    assert "calls" in report.lower()  # loose check: passes if "calls" appears anywhere in the report, not necessarily inside the variance section
+
+
 def test_plan_cli_writes_snapshot(tmp_path: Path) -> None:
    root = _build_infospace(tmp_path)
    env = os.environ.copy()