feat(infospace,llm): stabilize free-tier eval workflow
Some checks failed
Test Suite / code-quality (push) Has been cancelled
Test Suite / security-scan (push) Has been cancelled
Test Suite / unit-tests (3.11) (push) Has been cancelled
Test Suite / unit-tests (3.12) (push) Has been cancelled
Test Suite / integration-tests (push) Has been cancelled
Test Suite / e2e-tests (push) Has been cancelled
Test Suite / performance-tests (push) Has been cancelled
Test Suite / test-summary (push) Has been cancelled

Five improvements that eliminate most of the agent-in-the-loop friction
observed while closing out the 988-entity WoN evaluation (C.1):

1. Gemini adapter now retries on 429 + 5xx with exponential backoff
   (same pattern already used by OpenRouter/OpenAI). Removes the need
   for shell-level retry wrappers when hitting free-tier rate limits.

2. evaluate CLI prints the underlying error ("ERROR — HTTP 503 …")
   instead of a bare "ERROR", so agents don't have to drop into Python
   to diagnose transient failures.

3. --entity/--chapter now respect existing evaluation files by default
   (previously only the full-collection pass did). New --force flag
   opts into re-evaluation. Stops silently burning free-tier quota on
   re-runs of the same slug.

4. --entity accepts hyphenated slugs (matching entity filenames) and
   normalizes them to the underscore form used on disk. On a miss the
   CLI suggests near matches instead of a bare "not found".

5. eval-summary --update-metrics is no longer destructive:
   read_metrics_file/write_metrics_file preserve structured values
   (type_distribution) and don't flatten ints to floats. Fixes silent
   data loss observed on every run.

Bonus: the evaluator field in written evaluation frontmatter now
falls back from run_config.model_name to the adapter's resolved model
(or the model echoed back in the API response), so rows no longer
show `evaluator: null` when --model is omitted.

Tests: new tests/unit/llm/test_gemini.py covers retry behavior;
tests/unit/infospace/test_history.py gains a round-trip test that
pins the type_distribution / int-preservation invariants.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-04-22 00:51:00 +02:00
parent 965508ec06
commit c0615c2d50
6 changed files with 210 additions and 27 deletions

View File

@@ -237,7 +237,9 @@ def _entities_by_type(cfg, root: "Path", entity_list: list) -> None:
@click.option("--model", default=None, help="LLM model name.")
@click.option("--entity", "entity_slug", default=None, help="Evaluate a single entity by slug.")
@click.option("--chapter", default=None, help="Evaluate entities from a specific chapter.")
def evaluate(config_path, provider, model, entity_slug, chapter):
@click.option("--force", is_flag=True, default=False,
help="Re-evaluate entities whose evaluation file already exists.")
def evaluate(config_path, provider, model, entity_slug, chapter, force):
"""Evaluate entities using LLM-based quality assessment."""
cfg, cfg_path = _load_config_or_exit(config_path)
root = cfg_path.parent
@@ -252,32 +254,44 @@ def evaluate(config_path, provider, model, entity_slug, chapter):
click.echo("No entities to evaluate.")
return
# Filter
# Filter. Accept hyphenated input for --entity by normalizing to the
# underscore slug format produced by parse_entity_directory.
if entity_slug:
entity_list = [e for e in entity_list if e.slug == entity_slug]
if not entity_list:
click.echo(f"Error: Entity '{entity_slug}' not found.", err=True)
normalized = entity_slug.replace("-", "_")
matches = [e for e in entity_list if e.slug == normalized]
if not matches:
# Build a short "did you mean…" list from entities sharing a stem.
stem = normalized.split("_", 1)[0]
near = sorted(e.slug for e in entity_list if e.slug.startswith(stem))[:5]
msg = f"Error: Entity '{entity_slug}' not found."
if near:
msg += f" Did you mean: {', '.join(near)} ?"
click.echo(msg, err=True)
raise SystemExit(1)
entity_list = matches
elif chapter:
entity_list = [e for e in entity_list if chapter in e.source_chapter]
if not entity_list:
click.echo(f"No entities found for chapter '{chapter}'.")
return
# Skip entities that already have evaluation files (incremental resume)
# Skip entities that already have evaluation files (incremental resume).
# Applies uniformly to full-pass, --entity, and --chapter runs unless
# --force is set.
from markitect.infospace.evaluate import run_entity_evaluation
output_dir = root / cfg.evaluations_dir
if not entity_slug and not chapter and output_dir.is_dir():
previous_digests = {
p.stem: "" # non-empty sentinel → triggers skip in BatchEvaluator
for p in output_dir.glob("*.md")
}
entity_list = [e for e in entity_list if e.slug not in previous_digests]
if not force and output_dir.is_dir():
existing = {p.stem for p in output_dir.glob("*.md")}
before = len(entity_list)
entity_list = [e for e in entity_list if e.slug not in existing]
skipped = before - len(entity_list)
if not entity_list:
click.echo("All entities already evaluated. Nothing to do.")
click.echo("All selected entities already evaluated. "
"Re-run with --force to overwrite.")
return
if previous_digests:
click.echo(f"Skipping {len(previous_digests)} already-evaluated entities.")
if skipped:
click.echo(f"Skipping {skipped} already-evaluated entities. "
"Use --force to re-evaluate.")
# Create adapter
from markitect.llm import create_adapter
@@ -285,10 +299,14 @@ def evaluate(config_path, provider, model, entity_slug, chapter):
adapter = create_adapter(provider, model=model)
run_config = RunConfig(model_name=model, temperature=0.3, max_tokens=2000)
# Progress callback
# Progress callback — surface error detail so agents don't have to
# drop into Python to see whether an ERROR was 429, 503, or auth.
def on_progress(done, total, result):
status = result.status.upper()
click.echo(f" [{done}/{total}] {result.key}: {status}")
if status == "ERROR" and result.error:
click.echo(f" [{done}/{total}] {result.key}: ERROR — {result.error}")
else:
click.echo(f" [{done}/{total}] {result.key}: {status}")
click.echo(f"Evaluating {len(entity_list)} entities via {provider}...")