feat(infospace,llm): stabilize free-tier eval workflow
Some checks failed
Test Suite / code-quality (push) Has been cancelled
Test Suite / security-scan (push) Has been cancelled
Test Suite / unit-tests (3.11) (push) Has been cancelled
Test Suite / unit-tests (3.12) (push) Has been cancelled
Test Suite / integration-tests (push) Has been cancelled
Test Suite / e2e-tests (push) Has been cancelled
Test Suite / performance-tests (push) Has been cancelled
Test Suite / test-summary (push) Has been cancelled
Some checks failed
Test Suite / code-quality (push) Has been cancelled
Test Suite / security-scan (push) Has been cancelled
Test Suite / unit-tests (3.11) (push) Has been cancelled
Test Suite / unit-tests (3.12) (push) Has been cancelled
Test Suite / integration-tests (push) Has been cancelled
Test Suite / e2e-tests (push) Has been cancelled
Test Suite / performance-tests (push) Has been cancelled
Test Suite / test-summary (push) Has been cancelled
Five improvements that eliminate most of the agent-in-the-loop friction
observed while closing out the 988-entity WoN evaluation (C.1):
1. Gemini adapter now retries on HTTP 429 and 5xx responses with
exponential backoff (the same pattern already used by OpenRouter/OpenAI).
This removes the need for shell-level retry wrappers when hitting
free-tier rate limits.
2. evaluate CLI prints the underlying error ("ERROR — HTTP 503 …")
instead of a bare "ERROR", so agents don't have to drop into Python
to diagnose transient failures.
3. --entity/--chapter now respect existing evaluation files by default
(previously only the full-collection pass did). New --force flag
opts into re-evaluation. Stops silently burning free-tier quota on
re-runs of the same slug.
4. --entity accepts hyphenated slugs (matching entity filenames) and
normalizes them to the underscore form used on disk. On a miss the
CLI suggests near matches instead of a bare "not found".
5. eval-summary --update-metrics is no longer destructive:
read_metrics_file/write_metrics_file now preserve structured values
(type_distribution) and no longer coerce ints to floats. Fixes the
silent data loss observed on every run.
Bonus: the evaluator field in written evaluation frontmatter now
falls back from run_config.model_name to the adapter's resolved model
(or the model echoed back in the API response), so rows no longer
show `evaluator: null` when --model is omitted.
Tests: new tests/unit/llm/test_gemini.py covers retry behavior;
tests/unit/infospace/test_history.py gains a round-trip test that
pins the type_distribution / int-preservation invariants.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -237,7 +237,9 @@ def _entities_by_type(cfg, root: "Path", entity_list: list) -> None:
|
||||
@click.option("--model", default=None, help="LLM model name.")
|
||||
@click.option("--entity", "entity_slug", default=None, help="Evaluate a single entity by slug.")
|
||||
@click.option("--chapter", default=None, help="Evaluate entities from a specific chapter.")
|
||||
def evaluate(config_path, provider, model, entity_slug, chapter):
|
||||
@click.option("--force", is_flag=True, default=False,
|
||||
help="Re-evaluate entities whose evaluation file already exists.")
|
||||
def evaluate(config_path, provider, model, entity_slug, chapter, force):
|
||||
"""Evaluate entities using LLM-based quality assessment."""
|
||||
cfg, cfg_path = _load_config_or_exit(config_path)
|
||||
root = cfg_path.parent
|
||||
@@ -252,32 +254,44 @@ def evaluate(config_path, provider, model, entity_slug, chapter):
|
||||
click.echo("No entities to evaluate.")
|
||||
return
|
||||
|
||||
# Filter
|
||||
# Filter. Accept hyphenated input for --entity by normalizing to the
|
||||
# underscore slug format produced by parse_entity_directory.
|
||||
if entity_slug:
|
||||
entity_list = [e for e in entity_list if e.slug == entity_slug]
|
||||
if not entity_list:
|
||||
click.echo(f"Error: Entity '{entity_slug}' not found.", err=True)
|
||||
normalized = entity_slug.replace("-", "_")
|
||||
matches = [e for e in entity_list if e.slug == normalized]
|
||||
if not matches:
|
||||
# Build a short "did you mean…" list from entities sharing a stem.
|
||||
stem = normalized.split("_", 1)[0]
|
||||
near = sorted(e.slug for e in entity_list if e.slug.startswith(stem))[:5]
|
||||
msg = f"Error: Entity '{entity_slug}' not found."
|
||||
if near:
|
||||
msg += f" Did you mean: {', '.join(near)} ?"
|
||||
click.echo(msg, err=True)
|
||||
raise SystemExit(1)
|
||||
entity_list = matches
|
||||
elif chapter:
|
||||
entity_list = [e for e in entity_list if chapter in e.source_chapter]
|
||||
if not entity_list:
|
||||
click.echo(f"No entities found for chapter '{chapter}'.")
|
||||
return
|
||||
|
||||
# Skip entities that already have evaluation files (incremental resume)
|
||||
# Skip entities that already have evaluation files (incremental resume).
|
||||
# Applies uniformly to full-pass, --entity, and --chapter runs unless
|
||||
# --force is set.
|
||||
from markitect.infospace.evaluate import run_entity_evaluation
|
||||
output_dir = root / cfg.evaluations_dir
|
||||
if not entity_slug and not chapter and output_dir.is_dir():
|
||||
previous_digests = {
|
||||
p.stem: "" # non-empty sentinel → triggers skip in BatchEvaluator
|
||||
for p in output_dir.glob("*.md")
|
||||
}
|
||||
entity_list = [e for e in entity_list if e.slug not in previous_digests]
|
||||
if not force and output_dir.is_dir():
|
||||
existing = {p.stem for p in output_dir.glob("*.md")}
|
||||
before = len(entity_list)
|
||||
entity_list = [e for e in entity_list if e.slug not in existing]
|
||||
skipped = before - len(entity_list)
|
||||
if not entity_list:
|
||||
click.echo("All entities already evaluated. Nothing to do.")
|
||||
click.echo("All selected entities already evaluated. "
|
||||
"Re-run with --force to overwrite.")
|
||||
return
|
||||
if previous_digests:
|
||||
click.echo(f"Skipping {len(previous_digests)} already-evaluated entities.")
|
||||
if skipped:
|
||||
click.echo(f"Skipping {skipped} already-evaluated entities. "
|
||||
"Use --force to re-evaluate.")
|
||||
|
||||
# Create adapter
|
||||
from markitect.llm import create_adapter
|
||||
@@ -285,10 +299,14 @@ def evaluate(config_path, provider, model, entity_slug, chapter):
|
||||
adapter = create_adapter(provider, model=model)
|
||||
run_config = RunConfig(model_name=model, temperature=0.3, max_tokens=2000)
|
||||
|
||||
# Progress callback
|
||||
# Progress callback — surface error detail so agents don't have to
|
||||
# drop into Python to see whether an ERROR was 429, 503, or auth.
|
||||
def on_progress(done, total, result):
|
||||
status = result.status.upper()
|
||||
click.echo(f" [{done}/{total}] {result.key}: {status}")
|
||||
if status == "ERROR" and result.error:
|
||||
click.echo(f" [{done}/{total}] {result.key}: ERROR — {result.error}")
|
||||
else:
|
||||
click.echo(f" [{done}/{total}] {result.key}: {status}")
|
||||
|
||||
click.echo(f"Evaluating {len(entity_list)} entities via {provider}...")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user