generated from coulomb/repo-seed
IB-WP-0020-T05: shadow-mode CLI flags; close IB-WP-0020
Add --shadow-baseline <id> and --shadow-rate <float> opt-in flags to
generate run, generate resume, and generate from-source. When
--shadow-baseline names a candidate id from the routing config,
build_routing_policy_from_config wraps every other candidate in an
llm-connect ShadowingAdapter using that baseline plus a
PairedGrader(ExactMatchJudge()) and the workspace-resolved
QualityLedger. The baseline candidate itself is never wrapped — that
would shadow it against itself. --shadow-rate defaults to 0.1 when
--shadow-baseline is set; passing --shadow-rate without
--shadow-baseline fails fast with shadow_rate_without_baseline.
Setting --shadow-baseline without a ledger_path in the config fails
with missing_routing_ledger_for_shadow so observations have a place to
land before any call goes out.
run_generation grew shadow_baseline + shadow_rate kwargs and
_adapter_for("routing", ...) plumbs them into
build_routing_policy_from_config. The wrapped ShadowingAdapter slots
into the policy's prefer/fallback per task type via a
(candidate_id, task_type) reverse lookup, and adapters_by_id on the
adaptive policy gets the string-keyed entries.
Five new tests cover: shadow_rate without baseline fails fast, shadow
mode without a ledger fails fast, unknown shadow baseline id fails
fast, structural assertion that ShadowingAdapter wraps non-baseline
candidates and leaves the baseline raw, and a behavioural check that
shadow_rate=1.0 calls the baseline on every call while shadow_rate=0.0
skips it entirely. The test forces async_shadow=False so the call counter
is deterministic.
Closes IB-WP-0020: T01-T05 all done. Workplan status flips from active
to finished. 179 tests pass, 2 skipped (both live OpenRouter smokes).
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -463,6 +463,187 @@ def test_build_routing_policy_honours_custom_api_key_env() -> None:
|
||||
assert isinstance(policy.rules[0].prefer, OpenRouterAdapter)
|
||||
|
||||
|
||||
def test_shadow_rate_without_baseline_fails_fast() -> None:
    """Passing ``shadow_rate`` with no ``shadow_baseline_id`` must be rejected.

    The builder is expected to raise ``InfospaceError`` whose ``code`` is
    ``shadow_rate_without_baseline``.
    """
    from infospace_bench.routing_config import build_routing_policy_from_config

    routing_config = parse_routing_config(MINIMAL)

    with pytest.raises(InfospaceError) as raised:
        build_routing_policy_from_config(
            routing_config,
            shadow_rate=0.5,
            adapter_factory=_fake_adapter_factory_record([]),
        )

    error = raised.value
    assert error.code == "shadow_rate_without_baseline"
|
||||
|
||||
|
||||
def test_shadow_baseline_without_ledger_path_fails_fast() -> None:
    """Shadow mode requires ``ledger_path``; a config without one is rejected.

    ``MINIMAL`` is parsed as-is and a shadow baseline id is supplied; the
    builder is expected to raise ``InfospaceError`` with code
    ``missing_routing_ledger_for_shadow``.
    """
    from infospace_bench.routing_config import build_routing_policy_from_config

    routing_config = parse_routing_config(MINIMAL)

    with pytest.raises(InfospaceError) as raised:
        build_routing_policy_from_config(
            routing_config,
            shadow_baseline_id="openrouter:gpt-4o-mini",
            adapter_factory=_fake_adapter_factory_record([]),
        )

    error = raised.value
    assert error.code == "missing_routing_ledger_for_shadow"
|
||||
|
||||
|
||||
def test_shadow_baseline_not_in_config_fails_fast(tmp_path: Path) -> None:
    """A shadow baseline id that no candidate declares must be rejected.

    The config is ``MINIMAL`` plus a ``ledger_path``; the id
    ``not-in-config`` is passed as the baseline and the builder is expected
    to raise ``InfospaceError`` with code ``missing_shadow_baseline``.
    """
    from infospace_bench.routing_config import build_routing_policy_from_config

    payload = {**MINIMAL, "ledger_path": "quality.jsonl"}
    routing_config = parse_routing_config(payload)

    with pytest.raises(InfospaceError) as raised:
        build_routing_policy_from_config(
            routing_config,
            workspace=tmp_path,
            shadow_baseline_id="not-in-config",
            adapter_factory=_fake_adapter_factory_record([]),
        )

    error = raised.value
    assert error.code == "missing_shadow_baseline"
|
||||
|
||||
|
||||
def test_shadow_wraps_candidates_excluding_baseline(tmp_path: Path) -> None:
    """Check which policy slots are wrapped when a shadow baseline is named.

    With ``baseline-x`` given as ``shadow_baseline_id``, the first rule's
    ``prefer`` slot must be a ``ShadowingAdapter`` whose candidate is the
    ``candidate-a`` stub and whose baseline is the ``baseline-x`` stub, while
    the ``fallback`` slot must be the raw ``baseline-x`` stub itself.
    """
    from llm_connect.adapter import LLMAdapter
    from llm_connect.models import LLMResponse
    from llm_connect.shadowing import ShadowingAdapter
    from infospace_bench.routing_config import build_routing_policy_from_config

    data = {
        "schema_version": 1,
        "ledger_path": "quality.jsonl",
        "task_types": {
            "extract-entities": {
                "candidates": [
                    {"id": "candidate-a", "provider": "openrouter", "model": "openai/gpt-4o-mini"},
                    {"id": "baseline-x", "provider": "claude_code", "model": "claude-opus-4-7"},
                ],
            },
        },
    }
    config = parse_routing_config(data)

    class _Stub(LLMAdapter):
        """Minimal adapter double that records how often it was invoked."""

        def __init__(self, name):
            self.name = name
            self.calls = 0

        def execute_prompt(self, prompt, config):
            self.calls += 1
            return LLMResponse(content="match", model=self.name, usage={"prompt_tokens": 1, "completion_tokens": 1})

        def validate_config(self, config):
            return True

    # Stubs are keyed by candidate id so the assertions below can check
    # object identity against whatever the policy ended up holding.
    stubs: dict[str, _Stub] = {}

    def factory(candidate, env):
        stubs[candidate.id] = _Stub(candidate.id)
        return stubs[candidate.id]

    policy = build_routing_policy_from_config(
        config,
        workspace=tmp_path,
        adapter_factory=factory,
        shadow_baseline_id="baseline-x",
        shadow_rate=1.0,
    )

    rule = policy.rules[0]
    # The prefer slot is now a ShadowingAdapter wrapping candidate-a.
    assert isinstance(rule.prefer, ShadowingAdapter)
    assert rule.prefer.candidate_adapter is stubs["candidate-a"]
    assert rule.prefer.baseline_adapter is stubs["baseline-x"]
    assert rule.prefer.task_type == "extract-entities"
    # The baseline candidate (fallback) is NOT wrapped.
    assert rule.fallback is stubs["baseline-x"]
|
||||
|
||||
|
||||
def test_shadow_rate_one_fires_per_call_and_zero_skips(tmp_path: Path) -> None:
    """Spot-check that ``shadow_rate`` is wired through to the shadow adapter.

    Two policies are built from the same candidates: one with
    ``shadow_rate=1.0`` and one with ``shadow_rate=0.0``. Each policy's
    ``prefer`` adapter is called three times with ``async_shadow`` switched
    off. The baseline stub must record three calls in the first case and
    none in the second; the candidate stub records three calls in both.
    """
    from llm_connect.adapter import LLMAdapter
    from llm_connect.models import LLMResponse, RunConfig
    from infospace_bench.routing_config import build_routing_policy_from_config

    data = {
        "schema_version": 1,
        "ledger_path": "quality.jsonl",
        "task_types": {
            "extract-entities": {
                "candidates": [
                    {"id": "candidate-a", "provider": "openrouter", "model": "openai/gpt-4o-mini"},
                    {"id": "baseline-x", "provider": "claude_code", "model": "claude-opus-4-7"},
                ],
            },
        },
    }
    config = parse_routing_config(data)

    class _Counter(LLMAdapter):
        """Adapter double that counts ``execute_prompt`` invocations."""

        def __init__(self, name):
            self.name = name
            self.calls = 0

        def execute_prompt(self, prompt, config):
            self.calls += 1
            return LLMResponse(content="match", model=self.name, usage={"prompt_tokens": 1, "completion_tokens": 1})

        def validate_config(self, config):
            return True

    def make_factory():
        """Return a fresh ``(factory, registry)`` pair with zeroed counters."""
        registry: dict[str, _Counter] = {}

        def factory(candidate, env):
            adapter = _Counter(candidate.id)
            registry[candidate.id] = adapter
            return adapter

        return factory, registry

    def drive_prefer_adapter(policy):
        """Call the first rule's prefer adapter three times, shadowing inline."""
        adapter = policy.rules[0].prefer
        # Turn async shadowing off so the counters are final once each
        # execute_prompt call returns.
        adapter.async_shadow = False
        for _ in range(3):
            adapter.execute_prompt("hello", RunConfig(model_name="x"))

    # --- rate = 1.0 ------------------------------------------------------
    full_factory, full_stubs = make_factory()
    policy_full = build_routing_policy_from_config(
        config,
        workspace=tmp_path,
        adapter_factory=full_factory,
        shadow_baseline_id="baseline-x",
        shadow_rate=1.0,
    )
    drive_prefer_adapter(policy_full)
    assert full_stubs["candidate-a"].calls == 3
    assert full_stubs["baseline-x"].calls == 3, "rate=1.0 should call baseline on every call"

    # --- rate = 0.0 ------------------------------------------------------
    # New factory and stubs so the counters start from zero again.
    zero_factory, zero_stubs = make_factory()
    # A separate ledger file keeps the two policies from sharing state.
    (tmp_path / "subdir").mkdir(exist_ok=True)
    zero_data = {**data, "ledger_path": "subdir/quality.jsonl"}
    zero_config = parse_routing_config(zero_data)
    policy_zero = build_routing_policy_from_config(
        zero_config,
        workspace=tmp_path,
        adapter_factory=zero_factory,
        shadow_baseline_id="baseline-x",
        shadow_rate=0.0,
    )
    drive_prefer_adapter(policy_zero)
    assert zero_stubs["candidate-a"].calls == 3
    assert zero_stubs["baseline-x"].calls == 0, "rate=0.0 should skip baseline entirely"
|
||||
|
||||
|
||||
def test_rejects_non_string_ledger_path() -> None:
|
||||
payload = {
|
||||
"schema_version": 1,
|
||||
|
||||
Reference in New Issue
Block a user