feat(infospace,llm): stabilize free-tier eval workflow
Some checks failed
Test Suite / code-quality (push) Has been cancelled
Test Suite / security-scan (push) Has been cancelled
Test Suite / unit-tests (3.11) (push) Has been cancelled
Test Suite / unit-tests (3.12) (push) Has been cancelled
Test Suite / integration-tests (push) Has been cancelled
Test Suite / e2e-tests (push) Has been cancelled
Test Suite / performance-tests (push) Has been cancelled
Test Suite / test-summary (push) Has been cancelled

Five improvements that eliminate most of the agent-in-the-loop friction
observed while closing out the 988-entity WoN evaluation (C.1):

1. Gemini adapter now retries on 429 + 5xx with exponential backoff
   (same pattern already used by OpenRouter/OpenAI). Removes the need
   for shell-level retry wrappers when hitting free-tier rate limits.

2. evaluate CLI prints the underlying error ("ERROR — HTTP 503 …")
   instead of a bare "ERROR", so agents don't have to drop into Python
   to diagnose transient failures.

3. --entity/--chapter now respect existing evaluation files by default
   (previously only the full-collection pass did). New --force flag
   opts into re-evaluation. Stops silently burning free-tier quota on
   re-runs of the same slug.

4. --entity accepts hyphenated slugs (matching entity filenames) and
   normalizes them to the underscore form used on disk. On a miss the
   CLI suggests near matches instead of a bare "not found".

5. eval-summary --update-metrics is no longer destructive:
   read_metrics_file/write_metrics_file preserve structured values
   (type_distribution) and don't flatten ints to floats. Fixes a
   silent data loss observed on every run.

Bonus: the evaluator field in written evaluation frontmatter now
falls back from run_config.model_name to the adapter's resolved model
(or the model echoed back in the API response), so rows no longer
show `evaluator: null` when --model is omitted.

Tests: new tests/unit/llm/test_gemini.py covers retry behavior;
tests/unit/infospace/test_history.py gains a round-trip test that
pins the type_distribution / int-preservation invariants.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
2026-04-22 00:51:00 +02:00
parent 965508ec06
commit c0615c2d50
6 changed files with 210 additions and 27 deletions

View File

@@ -237,7 +237,9 @@ def _entities_by_type(cfg, root: "Path", entity_list: list) -> None:
@click.option("--model", default=None, help="LLM model name.")
@click.option("--entity", "entity_slug", default=None, help="Evaluate a single entity by slug.")
@click.option("--chapter", default=None, help="Evaluate entities from a specific chapter.")
def evaluate(config_path, provider, model, entity_slug, chapter):
@click.option("--force", is_flag=True, default=False,
help="Re-evaluate entities whose evaluation file already exists.")
def evaluate(config_path, provider, model, entity_slug, chapter, force):
"""Evaluate entities using LLM-based quality assessment."""
cfg, cfg_path = _load_config_or_exit(config_path)
root = cfg_path.parent
@@ -252,32 +254,44 @@ def evaluate(config_path, provider, model, entity_slug, chapter):
click.echo("No entities to evaluate.")
return
# Filter
# Filter. Accept hyphenated input for --entity by normalizing to the
# underscore slug format produced by parse_entity_directory.
if entity_slug:
entity_list = [e for e in entity_list if e.slug == entity_slug]
if not entity_list:
click.echo(f"Error: Entity '{entity_slug}' not found.", err=True)
normalized = entity_slug.replace("-", "_")
matches = [e for e in entity_list if e.slug == normalized]
if not matches:
# Build a short "did you mean…" list from entities sharing a stem.
stem = normalized.split("_", 1)[0]
near = sorted(e.slug for e in entity_list if e.slug.startswith(stem))[:5]
msg = f"Error: Entity '{entity_slug}' not found."
if near:
msg += f" Did you mean: {', '.join(near)} ?"
click.echo(msg, err=True)
raise SystemExit(1)
entity_list = matches
elif chapter:
entity_list = [e for e in entity_list if chapter in e.source_chapter]
if not entity_list:
click.echo(f"No entities found for chapter '{chapter}'.")
return
# Skip entities that already have evaluation files (incremental resume)
# Skip entities that already have evaluation files (incremental resume).
# Applies uniformly to full-pass, --entity, and --chapter runs unless
# --force is set.
from markitect.infospace.evaluate import run_entity_evaluation
output_dir = root / cfg.evaluations_dir
if not entity_slug and not chapter and output_dir.is_dir():
previous_digests = {
p.stem: "" # non-empty sentinel → triggers skip in BatchEvaluator
for p in output_dir.glob("*.md")
}
entity_list = [e for e in entity_list if e.slug not in previous_digests]
if not force and output_dir.is_dir():
existing = {p.stem for p in output_dir.glob("*.md")}
before = len(entity_list)
entity_list = [e for e in entity_list if e.slug not in existing]
skipped = before - len(entity_list)
if not entity_list:
click.echo("All entities already evaluated. Nothing to do.")
click.echo("All selected entities already evaluated. "
"Re-run with --force to overwrite.")
return
if previous_digests:
click.echo(f"Skipping {len(previous_digests)} already-evaluated entities.")
if skipped:
click.echo(f"Skipping {skipped} already-evaluated entities. "
"Use --force to re-evaluate.")
# Create adapter
from markitect.llm import create_adapter
@@ -285,10 +299,14 @@ def evaluate(config_path, provider, model, entity_slug, chapter):
adapter = create_adapter(provider, model=model)
run_config = RunConfig(model_name=model, temperature=0.3, max_tokens=2000)
# Progress callback
# Progress callback — surface error detail so agents don't have to
# drop into Python to see whether an ERROR was 429, 503, or auth.
def on_progress(done, total, result):
status = result.status.upper()
click.echo(f" [{done}/{total}] {result.key}: {status}")
if status == "ERROR" and result.error:
click.echo(f" [{done}/{total}] {result.key}: ERROR — {result.error}")
else:
click.echo(f" [{done}/{total}] {result.key}: {status}")
click.echo(f"Evaluating {len(entity_list)} entities via {provider}...")

View File

@@ -195,12 +195,23 @@ def run_entity_evaluation(
"""
topic = config.topic.name
evaluations_path = output_dir or Path(config.evaluations_dir)
evaluator_name = (run_config.model_name if run_config else "unknown")
# Fall back from run_config.model_name (may be None if the CLI user did
# not pass --model) to the adapter's resolved model, and only then to
# "unknown". Keeps the evaluator field in the written frontmatter
# informative for later audits.
default_evaluator = (
(run_config.model_name if run_config else None)
or getattr(adapter, "_model", None)
or "unknown"
)
def _write_and_notify(done: int, total: int, result) -> None:
# Write file immediately on success (incremental — run is resumable)
if result.status == "success" and result.response is not None:
scores = parse_evaluation_response(result.response.content, dimensions)
# Prefer the model name the adapter actually echoed back — it
# reflects post-resolution fallbacks (e.g. flash → flash-lite).
evaluator_name = result.response.model or default_evaluator
evaluation = EntityEvaluation(
entity_slug=result.key,
evaluator=evaluator_name,

View File

@@ -81,17 +81,26 @@ def snapshot_from_checks(
# ── Metrics file I/O ────────────────────────────────────────────────
def write_metrics_file(metrics: Dict[str, float], path: Path) -> None:
def write_metrics_file(metrics: Dict[str, Any], path: Path) -> None:
"""Write the latest metrics to a simple YAML file.
This file is used by ``markitect infospace viability`` for quick
threshold checking.
threshold checking. Non-numeric values (e.g. ``type_distribution``)
are passed through unchanged; floats are rounded to 6 dp; ints are
preserved as ints so external consumers don't see ``29`` silently
become ``29.0`` on every round-trip.
"""
def _normalize(v: Any) -> Any:
if isinstance(v, bool):
return v
if isinstance(v, float):
return round(v, 6)
return v
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(
yaml.safe_dump(
{k: round(v, 6) if isinstance(v, float) else v
for k, v in sorted(metrics.items())},
{k: _normalize(v) for k, v in sorted(metrics.items())},
default_flow_style=False,
sort_keys=True,
),
@@ -99,14 +108,20 @@ def write_metrics_file(metrics: Dict[str, float], path: Path) -> None:
)
def read_metrics_file(path: Path) -> Dict[str, float]:
"""Read the latest metrics from a YAML file."""
def read_metrics_file(path: Path) -> Dict[str, Any]:
"""Read the latest metrics from a YAML file.
Returns all keys as written on disk, preserving types verbatim so a
round-trip via :func:`write_metrics_file` does not silently drop
structured values (e.g. ``type_distribution``) or flatten ints to
floats.
"""
if not path.is_file():
return {}
raw = yaml.safe_load(path.read_text(encoding="utf-8"))
if not isinstance(raw, dict):
return {}
return {k: float(v) for k, v in raw.items() if isinstance(v, (int, float))}
return raw
# ── History operations ───────────────────────────────────────────────

View File

@@ -9,7 +9,11 @@ from markitect.llm.adapter import LLMAdapter
from markitect.llm.models import RunConfig, LLMResponse
from markitect.llm.config import resolve_api_key, find_project_root
from markitect.llm._http import post_json
from markitect.llm.exceptions import LLMConfigurationError
from markitect.llm.exceptions import (
LLMConfigurationError,
LLMAPIError,
LLMRateLimitError,
)
_DEFAULT_MODEL = "gemini-2.5-flash"
_API_BASE = "https://generativelanguage.googleapis.com/v1beta"
@@ -26,10 +30,12 @@ class GeminiAdapter(LLMAdapter):
model: Optional[str] = None,
api_key: Optional[str] = None,
system_prompt: Optional[str] = None,
max_retries: int = 3,
**_kwargs: Any,
):
self._model = model or _DEFAULT_MODEL
self._system_prompt = system_prompt
self._max_retries = max_retries
root = find_project_root()
key_file_paths = [root / "apikey-geminifree.txt"] if root else []
@@ -77,7 +83,7 @@ class GeminiAdapter(LLMAdapter):
url = f"{_API_BASE}/models/{model}:generateContent?key={self._api_key}"
start = time.time()
data = post_json(url, payload, timeout=config.timeout_seconds)
data = self._post_with_retries(url, payload, timeout=config.timeout_seconds)
latency = time.time() - start
# Parse Gemini response
@@ -113,3 +119,27 @@ class GeminiAdapter(LLMAdapter):
if not (0.0 <= config.temperature <= 2.0):
return False
return True
# ── Internals ───────────────────────────────────────────────────
def _post_with_retries(
self,
url: str,
payload: Dict[str, Any],
timeout: int,
) -> Dict[str, Any]:
last_exc: Optional[Exception] = None
for attempt in range(self._max_retries + 1):
try:
return post_json(url, payload, timeout=timeout)
except LLMRateLimitError as exc:
last_exc = exc
if attempt < self._max_retries:
time.sleep(2 ** attempt)
except LLMAPIError as exc:
if exc.status_code in (502, 503, 504) and attempt < self._max_retries:
last_exc = exc
time.sleep(2 ** attempt)
else:
raise
raise last_exc # type: ignore[misc]

View File

@@ -124,6 +124,33 @@ class TestMetricsFileIO:
path.write_text("just a string", encoding="utf-8")
assert read_metrics_file(path) == {}
def test_round_trip_preserves_structured_values(self, tmp_path):
"""Non-numeric values like type_distribution must survive a round-trip.
Regression: eval-summary --update-metrics used to drop any key
whose value wasn't a bare number, silently erasing type_distribution
from the file on every run.
"""
path = tmp_path / "metrics.yaml"
metrics = {
"per_entity_mean": 3.9567,
"vsm_type_matrix_cells": 29,
"type_distribution": {
"Element": 315,
"Institution": 122,
"Principle": 102,
},
}
write_metrics_file(metrics, path)
loaded = read_metrics_file(path)
assert loaded["type_distribution"] == {
"Element": 315, "Institution": 122, "Principle": 102,
}
# And the int stayed an int on disk, not 29.0.
raw = path.read_text(encoding="utf-8")
assert "vsm_type_matrix_cells: 29\n" in raw
assert "vsm_type_matrix_cells: 29.0" not in raw
# ── record_check_results ────────────────────────────────────────────

View File

@@ -0,0 +1,82 @@
"""Tests for markitect.llm.gemini — retry behavior + happy path."""
from unittest import mock
import pytest
from markitect.llm.gemini import GeminiAdapter
from markitect.llm.exceptions import LLMAPIError, LLMRateLimitError
from markitect.prompts.execution.models import RunConfig, LLMResponse
def _api_response(text="hello", model="gemini-2.5-flash"):
return {
"candidates": [
{
"content": {"parts": [{"text": text}], "role": "model"},
"finishReason": "STOP",
}
],
"modelVersion": model,
"usageMetadata": {
"promptTokenCount": 3,
"candidatesTokenCount": 2,
"totalTokenCount": 5,
},
}
class TestGeminiAdapter:
def _adapter(self, **kwargs):
defaults = {"api_key": "AIza-test"}
defaults.update(kwargs)
return GeminiAdapter(**defaults)
@mock.patch("markitect.llm.gemini.post_json")
def test_success(self, mock_post):
mock_post.return_value = _api_response("generated")
adapter = self._adapter()
resp = adapter.execute_prompt("hi", RunConfig())
assert isinstance(resp, LLMResponse)
assert resp.content == "generated"
assert resp.metadata["provider"] == "gemini"
@mock.patch("markitect.llm.gemini.post_json")
@mock.patch("markitect.llm.gemini.time.sleep")
def test_retry_on_429(self, mock_sleep, mock_post):
mock_post.side_effect = [
LLMRateLimitError("rate limited", status_code=429),
_api_response("recovered"),
]
adapter = self._adapter(max_retries=2)
resp = adapter.execute_prompt("hi", RunConfig())
assert resp.content == "recovered"
assert mock_sleep.call_count == 1
@mock.patch("markitect.llm.gemini.post_json")
@mock.patch("markitect.llm.gemini.time.sleep")
def test_retry_on_503(self, mock_sleep, mock_post):
mock_post.side_effect = [
LLMAPIError("unavailable", status_code=503),
_api_response("back"),
]
adapter = self._adapter(max_retries=2)
resp = adapter.execute_prompt("hi", RunConfig())
assert resp.content == "back"
@mock.patch("markitect.llm.gemini.post_json")
def test_no_retry_on_400(self, mock_post):
mock_post.side_effect = LLMAPIError("bad request", status_code=400)
adapter = self._adapter(max_retries=2)
with pytest.raises(LLMAPIError) as exc_info:
adapter.execute_prompt("hi", RunConfig())
assert exc_info.value.status_code == 400
@mock.patch("markitect.llm.gemini.post_json")
@mock.patch("markitect.llm.gemini.time.sleep")
def test_exhausted_retries_raises(self, mock_sleep, mock_post):
mock_post.side_effect = LLMRateLimitError("rate limited", status_code=429)
adapter = self._adapter(max_retries=1)
with pytest.raises(LLMRateLimitError):
adapter.execute_prompt("hi", RunConfig())
assert mock_sleep.call_count == 1 # 1 retry before giving up