generated from coulomb/repo-seed
IB-WP-0020-T05: shadow-mode CLI flags; close IB-WP-0020
Add --shadow-baseline <id> and --shadow-rate <float> opt-in flags to
generate run, generate resume, and generate from-source. When
--shadow-baseline names a candidate id from the routing config,
build_routing_policy_from_config wraps every other candidate in an
llm-connect ShadowingAdapter using that baseline plus a
PairedGrader(ExactMatchJudge()) and the workspace-resolved
QualityLedger. The baseline candidate itself is never wrapped — that
would shadow it against itself. --shadow-rate defaults to 0.1 when
--shadow-baseline is set; passing --shadow-rate without
--shadow-baseline fails fast with shadow_rate_without_baseline.
Setting --shadow-baseline without a ledger_path in the config fails
with missing_routing_ledger_for_shadow so observations have a place to
land before any call goes out.
run_generation grew shadow_baseline + shadow_rate kwargs and
_adapter_for("routing", ...) plumbs them into
build_routing_policy_from_config. The wrapped ShadowingAdapter slots
into the policy's prefer/fallback per task type via a
(candidate_id, task_type) reverse lookup, and adapters_by_id on the
adaptive policy receives only the string-keyed entries (the tuple-keyed
reverse-lookup entries are filtered out before the policy is built).
Five new tests cover: shadow_rate without baseline fails fast, shadow
mode without a ledger fails fast, unknown shadow baseline id fails
fast, structural assertion that ShadowingAdapter wraps non-baseline
candidates and leaves the baseline raw, and a behavioural check that
shadow_rate=1.0 calls the baseline on every call while shadow_rate=0.0
skips the baseline entirely. The behavioural test forces async_shadow=False
so the call counter is deterministic.
Closes IB-WP-0020: T01-T05 all done. Workplan status flips from active
to finished. 179 tests pass, 2 skipped (both live OpenRouter smokes).
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -208,6 +208,8 @@ def build_parser() -> argparse.ArgumentParser:
|
||||
generate_run.add_argument("--fixture-responses", default="")
|
||||
generate_run.add_argument("--routing-config", default="", help="YAML routing config (required with --provider routing)")
|
||||
generate_run.add_argument("--quality-floor", type=float, default=None, help="Override the config's default_quality_floor for this run")
|
||||
generate_run.add_argument("--shadow-baseline", default="", help="Candidate id from the routing config to use as the shadow-grading baseline")
|
||||
generate_run.add_argument("--shadow-rate", type=float, default=None, help="Shadow sampling rate 0..1 (default 0.1 when --shadow-baseline is set)")
|
||||
generate_run.add_argument("--resume", action="store_true")
|
||||
generate_run.add_argument("--force", action="store_true")
|
||||
|
||||
@@ -222,6 +224,8 @@ def build_parser() -> argparse.ArgumentParser:
|
||||
generate_resume.add_argument("--fixture-responses", default="")
|
||||
generate_resume.add_argument("--routing-config", default="")
|
||||
generate_resume.add_argument("--quality-floor", type=float, default=None)
|
||||
generate_resume.add_argument("--shadow-baseline", default="")
|
||||
generate_resume.add_argument("--shadow-rate", type=float, default=None)
|
||||
generate_resume.add_argument("--force", action="store_true")
|
||||
|
||||
generate_status = generate_sub.add_parser(
|
||||
@@ -245,6 +249,8 @@ def build_parser() -> argparse.ArgumentParser:
|
||||
generate_from_source.add_argument("--fixture-responses", default="")
|
||||
generate_from_source.add_argument("--routing-config", default="", help="YAML routing config (required with --provider routing)")
|
||||
generate_from_source.add_argument("--quality-floor", type=float, default=None)
|
||||
generate_from_source.add_argument("--shadow-baseline", default="")
|
||||
generate_from_source.add_argument("--shadow-rate", type=float, default=None)
|
||||
generate_from_source.add_argument("--max-chunks", type=int, default=0)
|
||||
generate_from_source.add_argument(
|
||||
"--chapter",
|
||||
@@ -559,6 +565,8 @@ def main(argv: list[str] | None = None) -> int:
|
||||
fixture_responses=args.fixture_responses or None,
|
||||
routing_config=args.routing_config or None,
|
||||
quality_floor=args.quality_floor,
|
||||
shadow_baseline=args.shadow_baseline or None,
|
||||
shadow_rate=args.shadow_rate,
|
||||
resume=args.resume,
|
||||
force=args.force,
|
||||
).to_dict()
|
||||
@@ -573,6 +581,8 @@ def main(argv: list[str] | None = None) -> int:
|
||||
fixture_responses=args.fixture_responses or None,
|
||||
routing_config=args.routing_config or None,
|
||||
quality_floor=args.quality_floor,
|
||||
shadow_baseline=args.shadow_baseline or None,
|
||||
shadow_rate=args.shadow_rate,
|
||||
resume=True,
|
||||
force=args.force,
|
||||
).to_dict()
|
||||
@@ -601,6 +611,8 @@ def main(argv: list[str] | None = None) -> int:
|
||||
fixture_responses=args.fixture_responses or None,
|
||||
routing_config=args.routing_config or None,
|
||||
quality_floor=args.quality_floor,
|
||||
shadow_baseline=args.shadow_baseline or None,
|
||||
shadow_rate=args.shadow_rate,
|
||||
)
|
||||
_write_json(result.to_dict())
|
||||
else:
|
||||
|
||||
@@ -429,6 +429,8 @@ def run_generation(
|
||||
fixture_responses: str | Path | None = None,
|
||||
routing_config: str | Path | None = None,
|
||||
quality_floor: float | None = None,
|
||||
shadow_baseline: str | None = None,
|
||||
shadow_rate: float | None = None,
|
||||
resume: bool = False,
|
||||
force: bool = False,
|
||||
) -> GenerationRunResult:
|
||||
@@ -457,6 +459,8 @@ def run_generation(
|
||||
fixture_responses=fixture_responses,
|
||||
routing_config=routing_config,
|
||||
quality_floor=quality_floor,
|
||||
shadow_baseline=shadow_baseline,
|
||||
shadow_rate=shadow_rate,
|
||||
workspace=_workspace_for(root_path),
|
||||
)
|
||||
if workflow_ids
|
||||
@@ -562,6 +566,8 @@ def _adapter_for(
|
||||
fixture_responses: str | Path | None,
|
||||
routing_config: str | Path | None = None,
|
||||
quality_floor: float | None = None,
|
||||
shadow_baseline: str | None = None,
|
||||
shadow_rate: float | None = None,
|
||||
workspace: Path | None = None,
|
||||
) -> AssistedGenerationAdapter:
|
||||
if fixture_responses:
|
||||
@@ -582,7 +588,12 @@ def _adapter_for(
|
||||
)
|
||||
|
||||
config = load_routing_config(routing_config)
|
||||
policy = build_routing_policy_from_config(config, workspace=workspace)
|
||||
policy = build_routing_policy_from_config(
|
||||
config,
|
||||
workspace=workspace,
|
||||
shadow_baseline_id=shadow_baseline,
|
||||
shadow_rate=shadow_rate,
|
||||
)
|
||||
effective_floor = (
|
||||
quality_floor
|
||||
if quality_floor is not None
|
||||
|
||||
@@ -287,6 +287,8 @@ def build_routing_policy_from_config(
|
||||
workspace: str | Path | None = None,
|
||||
env: Mapping[str, str] | None = None,
|
||||
adapter_factory: AdapterFactory | None = None,
|
||||
shadow_baseline_id: str | None = None,
|
||||
shadow_rate: float | None = None,
|
||||
) -> Any:
|
||||
"""Materialise a parsed config into a live llm-connect routing policy.
|
||||
|
||||
@@ -302,20 +304,92 @@ def build_routing_policy_from_config(
|
||||
|
||||
Fails fast (before any network call) when a candidate's required API
|
||||
key env var is missing from ``env``.
|
||||
|
||||
When ``shadow_baseline_id`` is set, every non-baseline candidate is
|
||||
wrapped in an llm-connect ``ShadowingAdapter`` using the named
|
||||
baseline candidate plus a PairedGrader(ExactMatchJudge()) and the
|
||||
QualityLedger from ``config.ledger_path``. ``shadow_rate`` controls
|
||||
the sampling fraction (defaults to 0.1). The baseline candidate
|
||||
itself is never wrapped — that would shadow it against itself.
|
||||
"""
|
||||
from llm_connect.routing import AdaptiveRoutingPolicy, RoutingPolicy, RoutingRule
|
||||
|
||||
environment: Mapping[str, str] = env if env is not None else os.environ
|
||||
factory: AdapterFactory = adapter_factory or _default_adapter_factory
|
||||
|
||||
if shadow_rate is not None and shadow_baseline_id is None:
|
||||
raise InfospaceError(
|
||||
"shadow_rate_without_baseline",
|
||||
"shadow_rate requires shadow_baseline_id; pass --shadow-baseline with --shadow-rate",
|
||||
{"shadow_rate": shadow_rate},
|
||||
)
|
||||
|
||||
use_adaptive = (
|
||||
config.default_quality_floor is not None
|
||||
or any(task.quality_floor is not None for task in config.task_types)
|
||||
or config.ledger_path is not None
|
||||
or shadow_baseline_id is not None
|
||||
)
|
||||
|
||||
ledger = _resolve_ledger(config, workspace, required=shadow_baseline_id is not None)
|
||||
|
||||
raw_adapters: dict[str, Any] = {}
|
||||
for task in config.task_types:
|
||||
for candidate in task.candidates:
|
||||
if candidate.id not in raw_adapters:
|
||||
raw_adapters[candidate.id] = factory(candidate, environment)
|
||||
|
||||
baseline_adapter = None
|
||||
if shadow_baseline_id is not None:
|
||||
if shadow_baseline_id not in raw_adapters:
|
||||
raise InfospaceError(
|
||||
"missing_shadow_baseline",
|
||||
f"shadow_baseline_id {shadow_baseline_id!r} not declared as a candidate in the routing config",
|
||||
{"shadow_baseline_id": shadow_baseline_id},
|
||||
)
|
||||
baseline_adapter = raw_adapters[shadow_baseline_id]
|
||||
|
||||
adapters_by_id: dict[str, Any] = {}
|
||||
if shadow_baseline_id is None:
|
||||
adapters_by_id = dict(raw_adapters)
|
||||
else:
|
||||
# Wrap each candidate (per task) in a ShadowingAdapter unless it *is* the baseline.
|
||||
from .routing import wrap_with_shadow_sampling
|
||||
from llm_connect.grading import ExactMatchJudge, PairedGrader
|
||||
|
||||
assert ledger is not None # _resolve_ledger raised if required and missing
|
||||
grader = PairedGrader(judge=ExactMatchJudge())
|
||||
effective_rate = shadow_rate if shadow_rate is not None else 0.1
|
||||
for task in config.task_types:
|
||||
for candidate in task.candidates:
|
||||
key = (candidate.id, task.task_type)
|
||||
if candidate.id == shadow_baseline_id:
|
||||
adapters_by_id[candidate.id] = raw_adapters[candidate.id]
|
||||
continue
|
||||
# One ShadowingAdapter per (candidate, task_type) pair so the
|
||||
# task_type tagged on observations matches the rule it serves.
|
||||
shadow_id = f"shadow:{candidate.id}@{task.task_type}"
|
||||
adapters_by_id[shadow_id] = wrap_with_shadow_sampling(
|
||||
candidate=raw_adapters[candidate.id],
|
||||
baseline=baseline_adapter,
|
||||
grader=grader,
|
||||
ledger=ledger,
|
||||
task_type=task.task_type,
|
||||
adapter_id=candidate.id,
|
||||
baseline_adapter_id=shadow_baseline_id,
|
||||
shadow_rate=effective_rate,
|
||||
async_shadow=True,
|
||||
)
|
||||
adapters_by_id[key] = adapters_by_id[shadow_id] # task-keyed reverse lookup
|
||||
|
||||
rules: list[RoutingRule] = []
|
||||
for task in config.task_types:
|
||||
candidates: list[Any] = []
|
||||
candidates = []
|
||||
for candidate in task.candidates:
|
||||
if candidate.id not in adapters_by_id:
|
||||
adapters_by_id[candidate.id] = factory(candidate, environment)
|
||||
candidates.append(adapters_by_id[candidate.id])
|
||||
if shadow_baseline_id is not None and candidate.id != shadow_baseline_id:
|
||||
candidates.append(adapters_by_id[(candidate.id, task.task_type)])
|
||||
else:
|
||||
candidates.append(adapters_by_id[candidate.id])
|
||||
prefer = candidates[0]
|
||||
prefer_candidate = task.candidates[0]
|
||||
fallback = candidates[1] if len(candidates) > 1 else None
|
||||
@@ -328,30 +402,38 @@ def build_routing_policy_from_config(
|
||||
)
|
||||
)
|
||||
|
||||
use_adaptive = (
|
||||
config.default_quality_floor is not None
|
||||
or any(task.quality_floor is not None for task in config.task_types)
|
||||
or config.ledger_path is not None
|
||||
)
|
||||
if not use_adaptive:
|
||||
return RoutingPolicy(rules=rules)
|
||||
|
||||
from llm_connect.quality import QualityLedger
|
||||
|
||||
ledger: QualityLedger | None = None
|
||||
if config.ledger_path:
|
||||
ledger_path = Path(config.ledger_path)
|
||||
if not ledger_path.is_absolute() and workspace is not None:
|
||||
ledger_path = Path(workspace) / ledger_path
|
||||
ledger_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
ledger = QualityLedger(path=ledger_path)
|
||||
# Clean adapters_by_id for AdaptiveRoutingPolicy: keep stable string keys only.
|
||||
string_keyed = {key: value for key, value in adapters_by_id.items() if isinstance(key, str)}
|
||||
return AdaptiveRoutingPolicy(
|
||||
rules=rules,
|
||||
ledger=ledger,
|
||||
adapters_by_id=dict(adapters_by_id),
|
||||
adapters_by_id=string_keyed,
|
||||
)
|
||||
|
||||
|
||||
def _resolve_ledger(
    config: RoutingConfig,
    workspace: str | Path | None,
    *,
    required: bool,
) -> Any:
    """Build the QualityLedger named by ``config.ledger_path``, if any.

    Args:
        config: Parsed routing config; only ``ledger_path`` is read here.
        workspace: Base directory that a relative ``ledger_path`` is joined
            onto. When ``None``, a relative path is used as given.
        required: When true, a config without ``ledger_path`` is an error
            rather than a ``None`` result.

    Returns:
        A ``QualityLedger`` constructed with the resolved path, or ``None``
        when no ``ledger_path`` is configured and ``required`` is false.

    Raises:
        InfospaceError: ``missing_routing_ledger_for_shadow`` when
            ``required`` is true and the config has no ``ledger_path``.
    """
    # Imported up front, exactly as before, so a missing llm_connect surfaces
    # regardless of which branch below is taken.
    from llm_connect.quality import QualityLedger

    configured = config.ledger_path
    if configured:
        target = Path(configured)
        if workspace is not None and not target.is_absolute():
            target = Path(workspace) / target
        # Make sure the directory that will hold the ledger file exists.
        target.parent.mkdir(parents=True, exist_ok=True)
        return QualityLedger(path=target)

    if not required:
        return None

    raise InfospaceError(
        "missing_routing_ledger_for_shadow",
        "Shadow sampling requires a ledger_path in the routing config",
        {"config_ledger_path": configured},
    )
|
||||
|
||||
|
||||
def _default_adapter_factory(
|
||||
candidate: RoutingCandidateConfig, env: Mapping[str, str]
|
||||
) -> Any:
|
||||
|
||||
Reference in New Issue
Block a user