IB-WP-0019-T02: usage rollup from run records

Every completed generate run now aggregates per-call adapter usage from the workflow-engine run records into output/budget/usage.yaml. Per-call data is bucketed by (workflow_id, stage_id, provider, model) with running totals for calls, prompt_tokens, completion_tokens, total_tokens, and cost_usd_known (sum of adapter-reported cost when the provider returns it; usually zero today). A run-level entry captures run_index, started_at, completed_at, duration_seconds, the executing plan snapshot_id (resolved from the latest plans.yaml entry), and the workflow-level run_id / stage_count summaries. cost_usd_estimated is left as None for this task; T03 wires the rate-table resolver so the same bucket gets a model-priced fallback when the adapter does not return cost directly. Fixture-mode runs are recorded with provider='fixture', zero tokens, and cost_status='unknown' rather than silently skipped, so the rollup honestly reflects which stages actually ran. 102 tests pass. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-17 19:46:40 +02:00
parent 37bbaf9fab
commit 678508226a
4 changed files with 315 additions and 2 deletions
--- a/tests/test_budget_registry.py
+++ b/tests/test_budget_registry.py
@@ -61,6 +61,46 @@ def _write_three_chapter_epub(path: Path) -> None:
            )


+def _write_minimal_fixture(path: Path) -> None:  # Write a YAML file at `path` holding four canned stage responses.
+    data = {
+        "responses": [  # one entry per stage_id: summarize-source, extract-entities, extract-relations, evaluate-entity
+            {
+                "stage_id": "summarize-source",
+                "input_artifact_id": "*",  # presumably a wildcard matching any input artifact -- confirm against the fixture loader
+                "markdown": "# Source Summary\n\nA stub summary.\n",
+            },
+            {
+                "stage_id": "extract-entities",
+                "input_artifact_id": "*",
+                "markdown": (
+                    "# Stub Entity\n\n## Definition\n\nA stub.\n\n## Context\n\nFor a budget test.\n"
+                ),
+            },
+            {
+                "stage_id": "extract-relations",
+                "input_artifact_id": "*",
+                "markdown": (
+                    "# Stub Entity Practices Something\n\n## Subject\n\nStub Entity\n\n"
+                    "## Predicate\n\npractices\n\n## Object\n\nSomething\n\n## Relation Type\n\nsupport\n\n"
+                    "## Evidence\n\nA stub.\n"
+                ),
+            },
+            {
+                "stage_id": "evaluate-entity",
+                "input_artifact_id": "*",
+                "markdown": (
+                    "---\nartifact_id: entity/stub-entity.md\nevaluator: fixture\n"  # opens a '---' delimited front-matter block
+                    "evaluated_at: '2026-05-17T00:00:00'\n"
+                    "scores:\n  - name: groundedness\n    value: 4.0\n    max_value: 5.0\n"
+                    "  - name: usefulness\n    value: 4.0\n    max_value: 5.0\n---\n\n"  # closing '---' ends the front matter
+                    "# Evaluation: entity/stub-entity.md\n"
+                ),
+            },
+        ]
+    }
+    path.write_text(yaml.safe_dump(data, sort_keys=False), encoding="utf-8")  # sort_keys=False keeps the key order written above
+
+
 def _build_infospace(tmp_path: Path) -> Path:
    book = tmp_path / "book.epub"
    _write_three_chapter_epub(book)
@@ -144,6 +184,103 @@ def test_plan_snapshot_retention_prunes_old_entries(tmp_path: Path) -> None:
    assert data["pruned_count"] >= 1


+def test_record_run_usage_aggregates_by_workflow_stage_provider_model(tmp_path: Path) -> None:  # Two calls sharing one bucket key must collapse into a single bucket.
+    root = _build_infospace(tmp_path)
+    from infospace_bench.budget import record_run_usage, read_usage_runs
+
+    workflow_results = [
+        {
+            "run_id": "run-1",
+            "workflow_id": "generic-source-entities",
+            "status": "completed",
+            "stages": [
+                {
+                    "stage_id": "extract-entities",
+                    "provider": "openrouter",
+                    "metadata": {
+                        "model": "openai/gpt-4o-mini",
+                        "usage": {"prompt_tokens": 1000, "completion_tokens": 200, "total_tokens": 1200},  # no cost reported for this call
+                    },
+                },
+                {
+                    "stage_id": "extract-entities",
+                    "provider": "openrouter",
+                    "metadata": {
+                        "model": "openai/gpt-4o-mini",
+                        "usage": {"prompt_tokens": 800, "completion_tokens": 150, "cost": 0.0012},  # no total_tokens here; carries a cost instead
+                    },
+                },
+                {"stage_id": "split-entities", "message": "split 3 entities"},  # no provider/metadata; assertions below expect it not to count as a call
+            ],
+        }
+    ]
+
+    entry = record_run_usage(root, workflow_results, snapshot_id="abc123", duration_seconds=4.2)
+
+    assert entry["rollup"]["total_calls"] == 2  # only the two extract-entities records
+    assert entry["rollup"]["total_prompt_tokens"] == 1800  # 1000 + 800
+    assert entry["rollup"]["total_completion_tokens"] == 350  # 200 + 150
+    assert entry["rollup"]["total_cost_usd_known"] == 0.0012  # only the second call supplies a cost
+    assert entry["snapshot_id"] == "abc123"
+    assert entry["duration_seconds"] == 4.2
+    assert len(entry["per_bucket"]) == 1  # both calls share workflow_id, stage_id, provider and model
+    bucket = entry["per_bucket"][0]
+    assert bucket["workflow_id"] == "generic-source-entities"
+    assert bucket["stage_id"] == "extract-entities"
+    assert bucket["provider"] == "openrouter"
+    assert bucket["model"] == "openai/gpt-4o-mini"
+    assert bucket["calls"] == 2
+
+    runs = read_usage_runs(root)  # read back what record_run_usage stored under root
+    assert len(runs) == 1
+    assert runs[0]["run_index"] == 1
+
+
+def test_record_run_usage_handles_fixture_runs_without_aborting(tmp_path: Path) -> None:  # Fixture-provider stages without usage data must still be counted.
+    root = _build_infospace(tmp_path)
+    from infospace_bench.budget import record_run_usage
+
+    workflow_results = [
+        {
+            "run_id": "fix-1",
+            "workflow_id": "generic-source-summary",
+            "stages": [
+                {"stage_id": "summarize-source", "provider": "fixture"},  # no metadata/usage supplied
+                {"stage_id": "summarize-source", "provider": "fixture"},
+            ],
+        }
+    ]
+
+    entry = record_run_usage(root, workflow_results)  # snapshot_id and duration_seconds are not passed here
+
+    fixture_bucket = next(b for b in entry["per_bucket"] if b["provider"] == "fixture")  # raises StopIteration if no fixture bucket exists
+    assert fixture_bucket["calls"] == 2
+    assert fixture_bucket["prompt_tokens"] == 0
+    assert fixture_bucket["cost_status"] == "unknown"
+    assert entry["rollup"]["total_cost_usd_known"] == 0.0
+
+
+def test_run_generation_writes_usage_yaml_with_plan_snapshot_id(tmp_path: Path) -> None:  # run_generation with fixture responses should record exactly one usage run.
+    root = _build_infospace(tmp_path)
+    from infospace_bench.budget import USAGE_FILE, read_usage_runs
+    from infospace_bench.generator import run_generation
+
+    fixture = tmp_path / "responses.yaml"
+    _write_minimal_fixture(fixture)
+
+    plan_payload = plan_generation(root)  # plan first so its snapshot_id can be compared with the recorded run
+    run_generation(root, fixture_responses=fixture)
+
+    runs = read_usage_runs(root)
+    assert (root / USAGE_FILE).is_file()
+    assert len(runs) == 1
+    assert runs[0]["snapshot_id"] == plan_payload["snapshot_id"]
+    assert runs[0]["duration_seconds"] is not None and runs[0]["duration_seconds"] >= 0
+    assert runs[0]["rollup"]["total_calls"] >= 0  # NOTE(review): holds for any non-negative count; tighten if fixture calls are expected to be counted
+    # Fixture mode runs should not claim any known cost
+    assert runs[0]["rollup"]["total_cost_usd_known"] == 0.0
+
+
 def test_plan_cli_writes_snapshot(tmp_path: Path) -> None:
    root = _build_infospace(tmp_path)
    env = os.environ.copy()