From 3ca891de4a55df1c7107864a1967310a9a6214be Mon Sep 17 00:00:00 2001
From: tegwick <bernd.worsch@gmail.com>
Date: Tue, 19 May 2026 04:30:33 +0200
Subject: [PATCH] fix: review findings from Lefevre live smoke

Two small fixes informed by the 2026-05-18 live OpenRouter chapter-I run.

1. extract-entities templates (trading-literature and general-knowledge):
   the # Entity Title placeholder was interpreted by gpt-4o-mini as a
   literal heading prefix, so every entity came back as "# Entity Title:
   Bucket Shop" etc. The instruction now spells the placeholder out
   with concrete examples and an explicit "not the literal string"
   note, so smaller models hit the intended shape.

2. generate plan grows --model <id>. When supplied and the model is
   listed in the bundled model_rates.yaml, the cost estimate uses that
   model's prompt-token and completion-token rates instead of
   multiplying a single blended --cost-per-1k value across all tokens.
   A model that is not in the rate table falls back to --cost-per-1k.
   Completion tokens are estimated as a flat 20% of prompt tokens. The
   summary now also returns a separate estimated_completion_tokens
   field, the model id, and a cost_source tag
   ("rate_table:<model>" | "cost_per_1k_blended" | None).

The rate-table lookup in item 2 is a stopgap. LLM-WP-0005 (proposed in
llm-connect this round) will move the rate registry and token-shape
problem classes upstream so consumers stop re-implementing them.

The live smoke ran 28k prompt tokens / 7.5k completion / $0.0088
actual. With --model openai/gpt-4o-mini the plan estimate now lands at
$0.0076 (within 14% of actual) versus the prior $8.40 estimate at
--cost-per-1k 0.30.

181 tests pass, 2 skipped (both live OpenRouter smokes).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 src/infospace_bench/cli.py                    |  6 ++-
 src/infospace_bench/generator.py              | 28 ++++++++++++-
 .../templates/extract-entities.md             |  8 ++--
 .../templates/extract-entities.md             |  6 ++-
 tests/test_plan_scale.py                      | 39 +++++++++++++++++++
 5 files changed, 80 insertions(+), 7 deletions(-)
diff --git a/src/infospace_bench/cli.py b/src/infospace_bench/cli.py
index db78c5c..dbfffe5 100644
--- a/src/infospace_bench/cli.py
+++ b/src/infospace_bench/cli.py
@@ -186,7 +186,10 @@ def build_parser() -> argparse.ArgumentParser:
     generate_plan.add_argument("--max-calls", type=int, default=None)
     generate_plan.add_argument("--cost-cap", type=float, default=None)
     generate_plan.add_argument(
-        "--cost-per-1k", type=float, default=0.0, help="USD per 1k prompt tokens for rough cost estimate"
+        "--cost-per-1k", type=float, default=0.0, help="USD per 1k prompt tokens for rough cost estimate (override; rate-table lookup via --model wins when present)"
+    )
+    generate_plan.add_argument(
+        "--model", default="", help="Model id (e.g. openai/gpt-4o-mini); when set, the bundled rate table replaces --cost-per-1k for the estimate"
     )
     generate_plan.add_argument(
         "--entities-per-chunk", type=int, default=2, help="Estimate of entities each chunk yields"
@@ -551,6 +554,7 @@ def main(argv: list[str] | None = None) -> int:
                         max_calls=args.max_calls,
                         cost_cap=args.cost_cap,
                         cost_per_1k_tokens=args.cost_per_1k,
+                        model=args.model or None,
                         entities_per_chunk=args.entities_per_chunk,
                         full=args.full,
                     )
diff --git a/src/infospace_bench/generator.py b/src/infospace_bench/generator.py
index 354eaf4..8786f19 100644
--- a/src/infospace_bench/generator.py
+++ b/src/infospace_bench/generator.py
@@ -144,6 +144,7 @@ def plan_generation(
     max_calls: int | None = None,
     cost_cap: float | None = None,
     cost_per_1k_tokens: float = 0.0,
+    model: str | None = None,
     words_per_token: float = WORDS_PER_TOKEN_DEFAULT,
     entities_per_chunk: int = ENTITIES_PER_CHUNK_ESTIMATE,
     full: bool = False,
@@ -161,6 +162,7 @@ def plan_generation(
         max_calls=max_calls,
         cost_cap=cost_cap,
         cost_per_1k_tokens=cost_per_1k_tokens,
+        model=model,
         words_per_token=words_per_token,
         entities_per_chunk=entities_per_chunk,
     )
@@ -203,6 +205,7 @@ def plan_generation_summary(
     max_calls: int | None = None,
     cost_cap: float | None = None,
     cost_per_1k_tokens: float = 0.0,
+    model: str | None = None,
     words_per_token: float = WORDS_PER_TOKEN_DEFAULT,
     entities_per_chunk: int = ENTITIES_PER_CHUNK_ESTIMATE,
 ) -> dict[str, Any]:
@@ -247,9 +250,29 @@ def plan_generation_summary(
         total_calls += calls
         total_prompt_words += prompt_words
     total_tokens = int(round(total_prompt_words / words_per_token)) if words_per_token > 0 else 0
+    # Estimate completion tokens as a rough fraction of prompt — most workflows
+    # write structured output that's ~20% of the prompt size. T03 of the
+    # cost-estimator workplan will replace this with problem-class estimators
+    # from llm-connect.
+    estimated_completion_tokens = int(round(total_tokens * 0.2))
     cost: float | None = None
-    if cost_per_1k_tokens > 0:
+    cost_source: str | None = None
+    rate_table_entry: dict[str, float] | None = None
+    if model:
+        from .budget import load_rate_table
+
+        rates = load_rate_table(_workspace_for(root_path))
+        rate_table_entry = rates.get(model)
+    if rate_table_entry is not None:
+        cost = round(
+            (total_tokens / 1000.0) * rate_table_entry["prompt_per_1k"]
+            + (estimated_completion_tokens / 1000.0) * rate_table_entry["completion_per_1k"],
+            6,
+        )
+        cost_source = f"rate_table:{model}"
+    elif cost_per_1k_tokens > 0:
         cost = round((total_tokens / 1000.0) * cost_per_1k_tokens, 4)
+        cost_source = "cost_per_1k_blended"
     chapter_numbers = sorted(
         {
             int(item.provenance.get("chapter_number"))
@@ -267,7 +290,10 @@ def plan_generation_summary(
         "total_provider_calls_estimate": total_calls,
         "total_prompt_words_estimate": total_prompt_words,
         "total_prompt_tokens_estimate": total_tokens,
+        "estimated_completion_tokens": estimated_completion_tokens,
         "estimated_cost_usd": cost,
+        "cost_source": cost_source,
+        "model": model,
         "cost_per_1k_tokens": cost_per_1k_tokens or None,
         "words_per_token": words_per_token,
         "entities_per_chunk_estimate": entities_per_chunk,
diff --git a/src/infospace_bench/profiles/general-knowledge/templates/extract-entities.md b/src/infospace_bench/profiles/general-knowledge/templates/extract-entities.md
index 33434f6..af28643 100644
--- a/src/infospace_bench/profiles/general-knowledge/templates/extract-entities.md
+++ b/src/infospace_bench/profiles/general-knowledge/templates/extract-entities.md
@@ -3,9 +3,11 @@
 Profile: {{ macros.profile }}
 
 Extract reusable infospace entities from the source chunk. Return one Markdown
-bundle where each entity starts with `# Entity Title` and contains at least a
-`## Definition` section. Prefer durable concepts, claims, named methods,
-people, places, works, and objects over sentence fragments.
+bundle where each entity starts with a level-1 heading that is the entity's
+own name (e.g. `# Knowledge Artifact`, `# Source Claim` — **not** the literal
+string "Entity Title"). Each entity contains at least a `## Definition`
+section. Prefer durable concepts, claims, named methods, people, places,
+works, and objects over sentence fragments.
 
 Source title: {{ input.title }}
 Source artifact: {{ input.artifact_id }}
diff --git a/src/infospace_bench/profiles/trading-literature/templates/extract-entities.md b/src/infospace_bench/profiles/trading-literature/templates/extract-entities.md
index a3f7524..11dc825 100644
--- a/src/infospace_bench/profiles/trading-literature/templates/extract-entities.md
+++ b/src/infospace_bench/profiles/trading-literature/templates/extract-entities.md
@@ -3,8 +3,10 @@
 Profile: {{ macros.profile }}
 
 Extract reusable infospace entities from the source chunk. Return one
-Markdown bundle where each entity starts with `# Entity Title` and has a
-`## Definition` section, plus a `## Category` line drawn from the list
+Markdown bundle where each entity starts with a level-1 heading that is the
+entity's name (e.g. `# Bucket Shop`, `# Tape Reading`,
+`# Larry Livingston` — **not** the literal string "Entity Title"). Each entity
+has a `## Definition` section and a `## Category` line drawn from the list
 below. Add `## Context` and `## Source Evidence` when the chunk gives
 enough material; leave them out rather than inventing detail.
 
diff --git a/tests/test_plan_scale.py b/tests/test_plan_scale.py
index b2959ce..27fbc5b 100644
--- a/tests/test_plan_scale.py
+++ b/tests/test_plan_scale.py
@@ -115,6 +115,45 @@ def test_plan_caps_flag_when_estimate_exceeds_budget(tmp_path: Path) -> None:
     assert summary["exceeds_cost_cap"] is True
 
 
+def test_plan_with_model_uses_rate_table_instead_of_blended_per_1k(tmp_path: Path) -> None:
+    """--model openai/gpt-4o-mini should pull from bundled rate table.
+
+    Stopgap until LLM-WP-0005 lands a proper cost model in llm-connect.
+    """
+    root = _build_plan_infospace(tmp_path)
+
+    # Same plan priced both ways so the two estimates can be compared.
+    blended = plan_generation_summary(root, cost_per_1k_tokens=0.30)
+    rate_table = plan_generation_summary(root, model="openai/gpt-4o-mini")
+
+    # gpt-4o-mini list price is ~0.00015/1k prompt + ~0.0006/1k completion,
+    # so the rate-table cost must be far below the $0.30/1k blended figure.
+    assert blended["cost_source"] == "cost_per_1k_blended"
+    assert rate_table["cost_source"] == "rate_table:openai/gpt-4o-mini"
+    assert rate_table["estimated_cost_usd"] is not None
+    assert rate_table["estimated_cost_usd"] < blended["estimated_cost_usd"]
+    assert rate_table["estimated_cost_usd"] < 0.10, (
+        "rate-table estimate must be far below a $0.30/1k blended rate"
+    )
+    # The estimator now also returns a completion-token estimate.
+    assert rate_table["estimated_completion_tokens"] > 0
+    assert rate_table["model"] == "openai/gpt-4o-mini"
+
+
+def test_plan_with_unknown_model_falls_back_to_blended_or_unknown(tmp_path: Path) -> None:
+    """A model missing from the rate table uses the blended rate, or no estimate."""
+    root = _build_plan_infospace(tmp_path)
+    unknown_model = "acme/not-in-rate-table"
+
+    without_rate = plan_generation_summary(root, model=unknown_model)
+    with_rate = plan_generation_summary(root, model=unknown_model, cost_per_1k_tokens=0.5)
+
+    assert without_rate["estimated_cost_usd"] is None
+    assert without_rate["cost_source"] is None
+    assert with_rate["estimated_cost_usd"] is not None
+    assert with_rate["cost_source"] == "cost_per_1k_blended"
+
+
 def test_plan_full_mode_includes_workflow_plans(tmp_path: Path) -> None:
     root = _build_plan_infospace(tmp_path)