feat(infospace,llm): stabilize free-tier eval workflow

Five improvements that eliminate most of the agent-in-the-loop friction observed while closing out the 988-entity WoN evaluation (C.1):

1. Gemini adapter now retries on 429 + 5xx with exponential backoff (same pattern already used by OpenRouter/OpenAI). Removes the need for shell-level retry wrappers when hitting free-tier rate limits.
2. evaluate CLI prints the underlying error ("ERROR — HTTP 503 …") instead of a bare "ERROR", so agents don't have to drop into Python to diagnose transient failures.
3. --entity/--chapter now respect existing evaluation files by default (previously only the full-collection pass did). New --force flag opts into re-evaluation. Stops silently burning free-tier quota on re-runs of the same slug.
4. --entity accepts hyphenated slugs (matching entity filenames) and normalizes them to the underscore form used on disk. On a miss the CLI suggests near matches instead of a bare "not found".
5. eval-summary --update-metrics is no longer destructive: read_metrics_file/write_metrics_file preserve structured values (type_distribution) and don't flatten ints to floats. Fixes a silent data loss observed on every run.

Bonus: the evaluator field in written evaluation frontmatter now falls back from run_config.model_name to the adapter's resolved model (or the model echoed back in the API response), so rows no longer show `evaluator: null` when --model is omitted.

Tests: new tests/unit/llm/test_gemini.py covers retry behavior; tests/unit/infospace/test_history.py gains a round-trip test that pins the type_distribution / int-preservation invariants.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-22 00:51:00 +02:00
parent 965508ec06
commit c0615c2d50
6 changed files with 210 additions and 27 deletions
--- a/markitect/infospace/cli.py
+++ b/markitect/infospace/cli.py
@@ -237,7 +237,9 @@ def _entities_by_type(cfg, root: "Path", entity_list: list) -> None:
@click.option("--model", default=None, help="LLM model name.")
@click.option("--entity", "entity_slug", default=None, help="Evaluate a single entity by slug.")
@click.option("--chapter", default=None, help="Evaluate entities from a specific chapter.")
-def evaluate(config_path, provider, model, entity_slug, chapter):
+@click.option("--force", is_flag=True, default=False,
+              help="Re-evaluate entities whose evaluation file already exists.")
+def evaluate(config_path, provider, model, entity_slug, chapter, force):
    """Evaluate entities using LLM-based quality assessment."""
    cfg, cfg_path = _load_config_or_exit(config_path)
    root = cfg_path.parent
@@ -252,32 +254,44 @@ def evaluate(config_path, provider, model, entity_slug, chapter):
        click.echo("No entities to evaluate.")
        return

-    # Filter
+    # Filter. Accept hyphenated input for --entity by normalizing to the
+    # underscore slug format produced by parse_entity_directory.
    if entity_slug:
-        entity_list = [e for e in entity_list if e.slug == entity_slug]
-        if not entity_list:
-            click.echo(f"Error: Entity '{entity_slug}' not found.", err=True)
+        normalized = entity_slug.replace("-", "_")
+        matches = [e for e in entity_list if e.slug == normalized]
+        if not matches:
+            # Build a short "did you mean…" list from entities sharing a stem.
+            stem = normalized.split("_", 1)[0]
+            near = sorted(e.slug for e in entity_list if e.slug.startswith(stem))[:5]
+            msg = f"Error: Entity '{entity_slug}' not found."
+            if near:
+                msg += f" Did you mean: {', '.join(near)} ?"
+            click.echo(msg, err=True)
            raise SystemExit(1)
+        entity_list = matches
    elif chapter:
        entity_list = [e for e in entity_list if chapter in e.source_chapter]
        if not entity_list:
            click.echo(f"No entities found for chapter '{chapter}'.")
            return

-    # Skip entities that already have evaluation files (incremental resume)
+    # Skip entities that already have evaluation files (incremental resume).
+    # Applies uniformly to full-pass, --entity, and --chapter runs unless
+    # --force is set.
    from markitect.infospace.evaluate import run_entity_evaluation
    output_dir = root / cfg.evaluations_dir
-    if not entity_slug and not chapter and output_dir.is_dir():
-        previous_digests = {
-            p.stem: ""  # non-empty sentinel → triggers skip in BatchEvaluator
-            for p in output_dir.glob("*.md")
-        }
-        entity_list = [e for e in entity_list if e.slug not in previous_digests]
+    if not force and output_dir.is_dir():
+        existing = {p.stem for p in output_dir.glob("*.md")}
+        before = len(entity_list)
+        entity_list = [e for e in entity_list if e.slug not in existing]
+        skipped = before - len(entity_list)
        if not entity_list:
-            click.echo("All entities already evaluated. Nothing to do.")
+            click.echo("All selected entities already evaluated. "
+                       "Re-run with --force to overwrite.")
            return
-        if previous_digests:
-            click.echo(f"Skipping {len(previous_digests)} already-evaluated entities.")
+        if skipped:
+            click.echo(f"Skipping {skipped} already-evaluated entities. "
+                       "Use --force to re-evaluate.")

    # Create adapter
    from markitect.llm import create_adapter
@@ -285,10 +299,14 @@ def evaluate(config_path, provider, model, entity_slug, chapter):
    adapter = create_adapter(provider, model=model)
    run_config = RunConfig(model_name=model, temperature=0.3, max_tokens=2000)

-    # Progress callback
+    # Progress callback — surface error detail so agents don't have to
+    # drop into Python to see whether an ERROR was 429, 503, or auth.
    def on_progress(done, total, result):
        status = result.status.upper()
-        click.echo(f"  [{done}/{total}] {result.key}: {status}")
+        if status == "ERROR" and result.error:
+            click.echo(f"  [{done}/{total}] {result.key}: ERROR — {result.error}")
+        else:
+            click.echo(f"  [{done}/{total}] {result.key}: {status}")

    click.echo(f"Evaluating {len(entity_list)} entities via {provider}...")