generated from coulomb/repo-seed
IB-WP-0018-T03+T04: shadow sampling + report/CLI surfacing; close IB-WP-0018
T03 — wrap_with_shadow_sampling() helper in routing.py: builds an llm-connect ShadowingAdapter around any candidate LLMAdapter with a caller-supplied baseline, grader, and QualityLedger. async_shadow=True by default so production load is not doubled; on_shadow_error escape hatch keeps caller logs informed when a baseline outage swallows the shadow path. The returned adapter is still an LLMAdapter so it slots into a RoutingPolicy rule without further code change.

T04 — generation report enrichment plus a small CLI helper:
- _collect_adapter_choices walks artifact provenance, groups by (stage_id, adapter_id), and surfaces calls + prompt/completion tokens per (stage, adapter) pair in a new "## Per-stage adapter choices" section. Runs that did not go through the bridge have no provider_metadata.adapter_id and emit an empty list, so fixture-only reports stay terse.
- summarise_quality_ledger() rolls an llm-connect QualityLedger up by (task_type, adapter_id) with mean quality, mean cost, observations, and cumulative tokens.
- `infospace-bench routing ledger <path>` CLI prints the rollup as JSON.

Five new tests cover shadow happy-path, shadow failure isolation, ledger rollup, the routing CLI, and the report's adapter-choice aggregation.

Closes IB-WP-0018: T01-T05 are all done and the workplan status flips from blocked to done now that LLM-WP-0004's primitives have shipped. 144 tests pass, 1 skipped (the OpenRouter live smoke, gated as before).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -791,6 +791,15 @@ def _write_generation_report(root: Path, metrics: dict[str, Any], snapshot_id: s
|
||||
"",
|
||||
]
|
||||
)
|
||||
if review.get("adapter_choices"):
|
||||
lines.extend(["## Per-stage adapter choices", ""])
|
||||
for row in review["adapter_choices"]:
|
||||
lines.append(
|
||||
f"- `{row['stage_id']}` ({row['task_type']}) -> "
|
||||
f"`{row['adapter_id']}` · {row['calls']} call(s) · "
|
||||
f"{row['prompt_tokens']} prompt + {row['completion_tokens']} completion tokens"
|
||||
)
|
||||
lines.append("")
|
||||
text = "\n".join(lines)
|
||||
path = root / "reports" / "generation-summary.md"
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
@@ -872,15 +881,55 @@ def _collect_review_report(root: Path) -> dict[str, Any]:
|
||||
entity_titles = sorted(
|
||||
{item.title for item in infospace.artifacts if item.kind == "entity" and item.title}
|
||||
)
|
||||
adapter_choices = _collect_adapter_choices(generated)
|
||||
return {
|
||||
"chapter_coverage": chapter_coverage,
|
||||
"entity_titles": entity_titles,
|
||||
"unmapped_sources": unmapped,
|
||||
"page_anchor_total": len(anchors),
|
||||
"page_anchor_sample": anchors[:6],
|
||||
"adapter_choices": adapter_choices,
|
||||
}
|
||||
|
||||
|
||||
def _collect_adapter_choices(generated: list[Any]) -> list[dict[str, Any]]:
    """Roll up which adapter ran each stage when the routing bridge was used.

    Each item in ``generated`` is expected to expose a ``provenance``
    attribute. An item contributes to the rollup only when its provenance
    is a dict carrying a dict ``provider_metadata`` from which both an
    adapter id (``adapter_id``, falling back to ``model``) and a stage id
    (``provider_metadata["stage_id"]``, falling back to
    ``provenance["stage_id"]``) can be read. Anything else is skipped, so
    fixture-only runs produce an empty list rather than a noisy section.

    Args:
        generated: Objects with a ``provenance`` attribute (dict or falsy).

    Returns:
        One row per ``(stage_id, adapter_id)`` pair, sorted by stage then
        adapter, with the keys ``stage_id``, ``adapter_id``, ``task_type``,
        ``calls``, ``prompt_tokens`` and ``completion_tokens``. The
        ``task_type`` is taken from the first item seen for the pair and
        defaults to the stage id.
    """

    def _token_count(usage: dict[str, Any], field: str) -> int:
        # A malformed token value must not abort the whole report; an
        # unreadable count contributes nothing to the running total.
        try:
            return int(usage.get(field) or 0)
        except (TypeError, ValueError, OverflowError):
            return 0

    buckets: dict[tuple[str, str], dict[str, Any]] = {}
    for item in generated:
        provenance = item.provenance or {}
        if not isinstance(provenance, dict):
            continue
        metadata = provenance.get("provider_metadata") or {}
        if not isinstance(metadata, dict):
            continue
        adapter_id = str(metadata.get("adapter_id") or metadata.get("model") or "")
        stage_id = str(metadata.get("stage_id") or provenance.get("stage_id") or "")
        if not adapter_id or not stage_id:
            continue

        # The call is still counted when usage is absent or malformed; only
        # the token totals are left unchanged.
        usage = metadata.get("usage")
        if not isinstance(usage, dict):
            usage = {}

        key = (stage_id, adapter_id)
        bucket = buckets.get(key)
        if bucket is None:
            bucket = buckets[key] = {
                "stage_id": stage_id,
                "adapter_id": adapter_id,
                "task_type": metadata.get("task_type") or stage_id,
                "calls": 0,
                "prompt_tokens": 0,
                "completion_tokens": 0,
            }
        bucket["calls"] += 1
        bucket["prompt_tokens"] += _token_count(usage, "prompt_tokens")
        bucket["completion_tokens"] += _token_count(usage, "completion_tokens")

    return sorted(buckets.values(), key=lambda row: (row["stage_id"], row["adapter_id"]))
|
||||
|
||||
|
||||
def _workflow_ids_for_stage(stage: str) -> list[str]:
|
||||
normalized = stage.strip().lower()
|
||||
if normalized == "intake":
|
||||
|
||||
Reference in New Issue
Block a user