diff --git a/docs/generic-source-generator.md b/docs/generic-source-generator.md index f71b58d..dbb09a3 100644 --- a/docs/generic-source-generator.md +++ b/docs/generic-source-generator.md @@ -94,6 +94,42 @@ skipped unless both `OPENROUTER_API_KEY` and chapter through the same path and asserts the provider metadata plumb-through. +### Live runs with `--provider routing` + +When the routing CLI is what you want to exercise live, swap +`--provider openrouter --model ...` for the routing pair: + +```bash +infospace-bench generate from-source ./LEFEVRE.epub \ + --workspace ./infospaces \ + --slug reminiscences-routed \ + --name "Reminiscences (Routed)" \ + --profile trading-literature \ + --provider routing \ + --routing-config ./examples/routing/trading-literature.yaml \ + --chapter I \ + --apply +``` + +`examples/routing/trading-literature.yaml` is a checked-in starting +config: cheap candidates for summaries, a separate judge for evaluation, +smart candidates for extraction and synthesis, a `claude_code` baseline rule +for future shadow sampling, and a workspace-relative `output/routing/quality.jsonl` +ledger so adaptive observations stay with the workspace. + +`--quality-floor <value>` on the same command overrides the config's +`default_quality_floor` for a single invocation — useful for +tightening the bar for a specific run without editing the file. The +ledger fills up as the `AdaptiveRoutingPolicy` records each +observation; later runs against the same workspace get the benefit +without re-grading from scratch. + +The parallel live-smoke test +(`test_provider_routing_one_chapter_live_smoke`) is also gated on +`INFOSPACE_BENCH_ENABLE_LIVE_OPENROUTER=1` + `OPENROUTER_API_KEY` and +asserts the per-stage adapter-choices report section names the routed +model. 
+ ### Budget and usage registry Every `generate plan` invocation appends a compact snapshot to diff --git a/examples/routing/trading-literature.yaml b/examples/routing/trading-literature.yaml new file mode 100644 index 0000000..b0c469e --- /dev/null +++ b/examples/routing/trading-literature.yaml @@ -0,0 +1,81 @@ +# Example routing config for a trading-literature Lefevre-style run. +# +# Captures the IB-WP-0018 task-type taxonomy from docs/routing-task-types.md: +# summarize-source → cheap model (volume-heavy, recoverable downstream) +# extract-entities → smart model (durable output; be strict) +# extract-relations → smart model (depends on entities) +# evaluate-entity → judge model (different family from extraction) +# synthesize-report → smart model (volume-of-one: quality matters, one call costs little) +# +# Quality floors are the recommended starting points from +# docs/routing-task-types.md. With a ledger configured, AdaptiveRoutingPolicy +# will pick the cheapest *qualifying* adapter per task type as observations +# accumulate; until then it falls back to the static prefer/fallback order. +# +# Refresh the model rates in src/infospace_bench/model_rates.yaml before any +# full-book run — list prices drift, and the rough USD estimate in the budget +# log depends on them. + +schema_version: 1 + +# Workspace-relative ledger so QualityLedger observations from this workspace +# stay with this workspace. Drop this line to run pure static routing. +ledger_path: output/routing/quality.jsonl + +# Floors apply when --quality-floor is not passed at the call site. The CLI +# flag wins, then the per-task quality_floor below, then this default. 
+default_quality_floor: 0.80 + +stage_to_task_type: + summarize-source: cheap + extract-entities: smart + extract-relations: smart + evaluate-entity: judge + synthesize-report: smart + +task_types: + + cheap: + quality_floor: 0.70 + candidates: + - id: openrouter:gpt-4o-mini + provider: openrouter + model: openai/gpt-4o-mini + api_key_env: OPENROUTER_API_KEY + max_cost_per_1k: 0.001 + - id: openrouter:claude-3.5-haiku + provider: openrouter + model: anthropic/claude-3.5-haiku + api_key_env: OPENROUTER_API_KEY + max_cost_per_1k: 0.003 + + smart: + quality_floor: 0.85 + candidates: + - id: openrouter:claude-3.5-haiku + provider: openrouter + model: anthropic/claude-3.5-haiku + api_key_env: OPENROUTER_API_KEY + - id: openrouter:claude-3.5-sonnet + provider: openrouter + model: anthropic/claude-3.5-sonnet + api_key_env: OPENROUTER_API_KEY + + judge: + quality_floor: 0.80 + candidates: + # Evaluation goes through a different family than extraction to limit + # self-preference bias. + - id: openrouter:gpt-4o-mini + provider: openrouter + model: openai/gpt-4o-mini + api_key_env: OPENROUTER_API_KEY + + # Baseline is wired here so a follow-up T05 ShadowingAdapter step can + # reference `claude-code` as the grading oracle without editing the + # task_types stanza. 
+ baseline: + candidates: + - id: claude-code + provider: claude_code + model: claude-opus-4-7 diff --git a/tests/test_openrouter_live.py b/tests/test_openrouter_live.py index a8c0f97..060c7cf 100644 --- a/tests/test_openrouter_live.py +++ b/tests/test_openrouter_live.py @@ -208,3 +208,87 @@ def test_openrouter_one_chapter_smoke(tmp_path: Path) -> None: and item.get("provenance", {}).get("provider_metadata", {}).get("request_id") ] assert generated_with_metadata, "generated artifacts should carry provider_metadata.request_id" + + +_LIVE_ROUTING_REASON = ( + "set INFOSPACE_BENCH_ENABLE_LIVE_OPENROUTER=1 and OPENROUTER_API_KEY to run " + "the optional one-chapter routing smoke against OpenRouter" +) + + +@pytest.mark.skipif(not (_LIVE_OPT_IN and _LIVE_API_KEY), reason=_LIVE_ROUTING_REASON) +def test_provider_routing_one_chapter_live_smoke(tmp_path: Path) -> None: + """Live smoke: one chapter through --provider routing against OpenRouter. + + Uses a minimal routing config (one task type, one candidate) so the test + spends roughly the same as the static OpenRouter smoke. Asserts the run + completes, at least one artifact carries adapter_id on its + provider_metadata, and the report contains the per-stage adapter-choices + section and names the routed model. 
+ """ + book = _build_fixture_epub(tmp_path / "lefevre.epub") + model = os.environ.get("INFOSPACE_BENCH_LIVE_MODEL", "openai/gpt-4o-mini") + + routing_config = tmp_path / "routing.yaml" + routing_config.write_text( + yaml.safe_dump( + { + "schema_version": 1, + "stage_to_task_type": { + "summarize-source": "cheap", + "extract-entities": "cheap", + "extract-relations": "cheap", + "evaluate-entity": "cheap", + "synthesize-report": "cheap", + }, + "task_types": { + "cheap": { + "candidates": [ + { + "id": f"openrouter:{model}", + "provider": "openrouter", + "model": model, + "api_key_env": "OPENROUTER_API_KEY", + }, + ], + }, + }, + }, + sort_keys=False, + ), + encoding="utf-8", + ) + + infospace = init_generation_infospace( + tmp_path, + book, + "lefevre-live-routing", + name="Lefevre Live Routing", + profile="trading-literature", + chapter_filter=["I"], + ) + plan_generation(infospace.root, cost_per_1k_tokens=0.5) + result = run_generation( + infospace.root, + provider="routing", + routing_config=routing_config, + ) + status = status_generation(infospace.root) + + assert result.status == "completed" + assert status["source_chunk_count"] == 1 + assert status["entity_count"] >= 1 + + report = (infospace.root / "reports" / "generation-summary.md").read_text(encoding="utf-8") + assert "## Per-stage adapter choices" in report + assert model in report, "report should name the routed model" + + # The routing bridge writes adapter_id + task_type onto provider_metadata. 
+ index = yaml.safe_load((infospace.root / "artifacts" / "index.yaml").read_text(encoding="utf-8")) + routed_artifacts = [ + item + for item in index["artifacts"] + if item["kind"] in {"entity", "relation", "generated"} + and ((item.get("provenance") or {}).get("provider_metadata") or {}).get("adapter_id") + ] + assert routed_artifacts, "routed artifacts must carry adapter_id provenance" diff --git a/tests/test_routing_config.py b/tests/test_routing_config.py index 69e2ddc..62e505a 100644 --- a/tests/test_routing_config.py +++ b/tests/test_routing_config.py @@ -412,6 +412,25 @@ def test_build_routing_policy_claude_code_needs_no_api_key() -> None: assert isinstance(policy.rules[0].prefer, ClaudeCodeAdapter) +def test_example_trading_literature_config_parses() -> None: + """Regression: the shipped example config parses and is internally consistent.""" + from infospace_bench.routing_config import load_routing_config + + example_path = Path(__file__).resolve().parent.parent / "examples" / "routing" / "trading-literature.yaml" + + config = load_routing_config(example_path) + + task_type_names = {task.task_type for task in config.task_types} + assert {"cheap", "smart", "judge", "baseline"} <= task_type_names + assert config.default_quality_floor == 0.80 + # Each shipped stage maps to a task type the config actually declares. + for stage, task_type in config.stage_to_task_type.items(): + assert task_type in task_type_names, f"stage {stage!r} maps to undeclared task type {task_type!r}" + # baseline is included so a T05 ShadowingAdapter wiring can reference it. 
+ baseline = next(t for t in config.task_types if t.task_type == "baseline") + assert baseline.candidates[0].provider == "claude_code" + + def test_build_routing_policy_honours_custom_api_key_env() -> None: from infospace_bench.routing_config import build_routing_policy_from_config from llm_connect.openrouter import OpenRouterAdapter diff --git a/workplans/IB-WP-0020-provider-routing-cli.md b/workplans/IB-WP-0020-provider-routing-cli.md index f7f8fac..670fc01 100644 --- a/workplans/IB-WP-0020-provider-routing-cli.md +++ b/workplans/IB-WP-0020-provider-routing-cli.md @@ -138,7 +138,7 @@ state_hub_task_id: "fe5888e0-da33-413a-b026-71ed811b8c73" ```task id: IB-WP-0020-T04 -status: todo +status: done priority: medium state_hub_task_id: "69288131-f265-4db5-a4b0-b0c8a6f55dd8" ```