feat(infospace,llm): stabilize free-tier eval workflow
Some checks failed
Test Suite / code-quality (push) Has been cancelled
Test Suite / security-scan (push) Has been cancelled
Test Suite / unit-tests (3.11) (push) Has been cancelled
Test Suite / unit-tests (3.12) (push) Has been cancelled
Test Suite / integration-tests (push) Has been cancelled
Test Suite / e2e-tests (push) Has been cancelled
Test Suite / performance-tests (push) Has been cancelled
Test Suite / test-summary (push) Has been cancelled
Some checks failed
Test Suite / code-quality (push) Has been cancelled
Test Suite / security-scan (push) Has been cancelled
Test Suite / unit-tests (3.11) (push) Has been cancelled
Test Suite / unit-tests (3.12) (push) Has been cancelled
Test Suite / integration-tests (push) Has been cancelled
Test Suite / e2e-tests (push) Has been cancelled
Test Suite / performance-tests (push) Has been cancelled
Test Suite / test-summary (push) Has been cancelled
Five improvements that eliminate most of the agent-in-the-loop friction
observed while closing out the 988-entity WoN evaluation (C.1):
1. Gemini adapter now retries on 429 + 5xx with exponential backoff
(same pattern already used by OpenRouter/OpenAI). Removes the need
for shell-level retry wrappers when hitting free-tier rate limits.
2. evaluate CLI prints the underlying error ("ERROR — HTTP 503 …")
instead of a bare "ERROR", so agents don't have to drop into Python
to diagnose transient failures.
3. --entity/--chapter now respect existing evaluation files by default
(previously only the full-collection pass did). New --force flag
opts into re-evaluation. Stops silently burning free-tier quota on
re-runs of the same slug.
4. --entity accepts hyphenated slugs (matching entity filenames) and
normalizes them to the underscore form used on disk. On a miss the
CLI suggests near matches instead of a bare "not found".
5. eval-summary --update-metrics is no longer destructive:
read_metrics_file/write_metrics_file preserve structured values
(type_distribution) and don't flatten ints to floats. Fixes a
silent data loss observed on every run.
Bonus: the evaluator field in written evaluation frontmatter now
falls back from run_config.model_name to the adapter's resolved model
(or the model echoed back in the API response), so rows no longer
show `evaluator: null` when --model is omitted.
Tests: new tests/unit/llm/test_gemini.py covers retry behavior;
tests/unit/infospace/test_history.py gains a round-trip test that
pins the type_distribution / int-preservation invariants.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -237,7 +237,9 @@ def _entities_by_type(cfg, root: "Path", entity_list: list) -> None:
|
||||
@click.option("--model", default=None, help="LLM model name.")
|
||||
@click.option("--entity", "entity_slug", default=None, help="Evaluate a single entity by slug.")
|
||||
@click.option("--chapter", default=None, help="Evaluate entities from a specific chapter.")
|
||||
def evaluate(config_path, provider, model, entity_slug, chapter):
|
||||
@click.option("--force", is_flag=True, default=False,
|
||||
help="Re-evaluate entities whose evaluation file already exists.")
|
||||
def evaluate(config_path, provider, model, entity_slug, chapter, force):
|
||||
"""Evaluate entities using LLM-based quality assessment."""
|
||||
cfg, cfg_path = _load_config_or_exit(config_path)
|
||||
root = cfg_path.parent
|
||||
@@ -252,32 +254,44 @@ def evaluate(config_path, provider, model, entity_slug, chapter):
|
||||
click.echo("No entities to evaluate.")
|
||||
return
|
||||
|
||||
# Filter
|
||||
# Filter. Accept hyphenated input for --entity by normalizing to the
|
||||
# underscore slug format produced by parse_entity_directory.
|
||||
if entity_slug:
|
||||
entity_list = [e for e in entity_list if e.slug == entity_slug]
|
||||
if not entity_list:
|
||||
click.echo(f"Error: Entity '{entity_slug}' not found.", err=True)
|
||||
normalized = entity_slug.replace("-", "_")
|
||||
matches = [e for e in entity_list if e.slug == normalized]
|
||||
if not matches:
|
||||
# Build a short "did you mean…" list from entities sharing a stem.
|
||||
stem = normalized.split("_", 1)[0]
|
||||
near = sorted(e.slug for e in entity_list if e.slug.startswith(stem))[:5]
|
||||
msg = f"Error: Entity '{entity_slug}' not found."
|
||||
if near:
|
||||
msg += f" Did you mean: {', '.join(near)} ?"
|
||||
click.echo(msg, err=True)
|
||||
raise SystemExit(1)
|
||||
entity_list = matches
|
||||
elif chapter:
|
||||
entity_list = [e for e in entity_list if chapter in e.source_chapter]
|
||||
if not entity_list:
|
||||
click.echo(f"No entities found for chapter '{chapter}'.")
|
||||
return
|
||||
|
||||
# Skip entities that already have evaluation files (incremental resume)
|
||||
# Skip entities that already have evaluation files (incremental resume).
|
||||
# Applies uniformly to full-pass, --entity, and --chapter runs unless
|
||||
# --force is set.
|
||||
from markitect.infospace.evaluate import run_entity_evaluation
|
||||
output_dir = root / cfg.evaluations_dir
|
||||
if not entity_slug and not chapter and output_dir.is_dir():
|
||||
previous_digests = {
|
||||
p.stem: "" # non-empty sentinel → triggers skip in BatchEvaluator
|
||||
for p in output_dir.glob("*.md")
|
||||
}
|
||||
entity_list = [e for e in entity_list if e.slug not in previous_digests]
|
||||
if not force and output_dir.is_dir():
|
||||
existing = {p.stem for p in output_dir.glob("*.md")}
|
||||
before = len(entity_list)
|
||||
entity_list = [e for e in entity_list if e.slug not in existing]
|
||||
skipped = before - len(entity_list)
|
||||
if not entity_list:
|
||||
click.echo("All entities already evaluated. Nothing to do.")
|
||||
click.echo("All selected entities already evaluated. "
|
||||
"Re-run with --force to overwrite.")
|
||||
return
|
||||
if previous_digests:
|
||||
click.echo(f"Skipping {len(previous_digests)} already-evaluated entities.")
|
||||
if skipped:
|
||||
click.echo(f"Skipping {skipped} already-evaluated entities. "
|
||||
"Use --force to re-evaluate.")
|
||||
|
||||
# Create adapter
|
||||
from markitect.llm import create_adapter
|
||||
@@ -285,10 +299,14 @@ def evaluate(config_path, provider, model, entity_slug, chapter):
|
||||
adapter = create_adapter(provider, model=model)
|
||||
run_config = RunConfig(model_name=model, temperature=0.3, max_tokens=2000)
|
||||
|
||||
# Progress callback
|
||||
# Progress callback — surface error detail so agents don't have to
|
||||
# drop into Python to see whether an ERROR was 429, 503, or auth.
|
||||
def on_progress(done, total, result):
|
||||
status = result.status.upper()
|
||||
click.echo(f" [{done}/{total}] {result.key}: {status}")
|
||||
if status == "ERROR" and result.error:
|
||||
click.echo(f" [{done}/{total}] {result.key}: ERROR — {result.error}")
|
||||
else:
|
||||
click.echo(f" [{done}/{total}] {result.key}: {status}")
|
||||
|
||||
click.echo(f"Evaluating {len(entity_list)} entities via {provider}...")
|
||||
|
||||
|
||||
@@ -195,12 +195,23 @@ def run_entity_evaluation(
|
||||
"""
|
||||
topic = config.topic.name
|
||||
evaluations_path = output_dir or Path(config.evaluations_dir)
|
||||
evaluator_name = (run_config.model_name if run_config else "unknown")
|
||||
# Fall back from run_config.model_name (may be None if the CLI user did
|
||||
# not pass --model) to the adapter's resolved model, and only then to
|
||||
# "unknown". Keeps the evaluator field in the written frontmatter
|
||||
# informative for later audits.
|
||||
default_evaluator = (
|
||||
(run_config.model_name if run_config else None)
|
||||
or getattr(adapter, "_model", None)
|
||||
or "unknown"
|
||||
)
|
||||
|
||||
def _write_and_notify(done: int, total: int, result) -> None:
|
||||
# Write file immediately on success (incremental — run is resumable)
|
||||
if result.status == "success" and result.response is not None:
|
||||
scores = parse_evaluation_response(result.response.content, dimensions)
|
||||
# Prefer the model name the adapter actually echoed back — it
|
||||
# reflects post-resolution fallbacks (e.g. flash → flash-lite).
|
||||
evaluator_name = result.response.model or default_evaluator
|
||||
evaluation = EntityEvaluation(
|
||||
entity_slug=result.key,
|
||||
evaluator=evaluator_name,
|
||||
|
||||
@@ -81,17 +81,26 @@ def snapshot_from_checks(
|
||||
# ── Metrics file I/O ────────────────────────────────────────────────
|
||||
|
||||
|
||||
def write_metrics_file(metrics: Dict[str, float], path: Path) -> None:
|
||||
def write_metrics_file(metrics: Dict[str, Any], path: Path) -> None:
|
||||
"""Write the latest metrics to a simple YAML file.
|
||||
|
||||
This file is used by ``markitect infospace viability`` for quick
|
||||
threshold checking.
|
||||
threshold checking. Non-numeric values (e.g. ``type_distribution``)
|
||||
are passed through unchanged; floats are rounded to 6 dp; ints are
|
||||
preserved as ints so external consumers don't see ``29`` silently
|
||||
become ``29.0`` on every round-trip.
|
||||
"""
|
||||
def _normalize(v: Any) -> Any:
|
||||
if isinstance(v, bool):
|
||||
return v
|
||||
if isinstance(v, float):
|
||||
return round(v, 6)
|
||||
return v
|
||||
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
path.write_text(
|
||||
yaml.safe_dump(
|
||||
{k: round(v, 6) if isinstance(v, float) else v
|
||||
for k, v in sorted(metrics.items())},
|
||||
{k: _normalize(v) for k, v in sorted(metrics.items())},
|
||||
default_flow_style=False,
|
||||
sort_keys=True,
|
||||
),
|
||||
@@ -99,14 +108,20 @@ def write_metrics_file(metrics: Dict[str, float], path: Path) -> None:
|
||||
)
|
||||
|
||||
|
||||
def read_metrics_file(path: Path) -> Dict[str, float]:
|
||||
"""Read the latest metrics from a YAML file."""
|
||||
def read_metrics_file(path: Path) -> Dict[str, Any]:
|
||||
"""Read the latest metrics from a YAML file.
|
||||
|
||||
Returns all keys as written on disk, preserving types verbatim so a
|
||||
round-trip via :func:`write_metrics_file` does not silently drop
|
||||
structured values (e.g. ``type_distribution``) or flatten ints to
|
||||
floats.
|
||||
"""
|
||||
if not path.is_file():
|
||||
return {}
|
||||
raw = yaml.safe_load(path.read_text(encoding="utf-8"))
|
||||
if not isinstance(raw, dict):
|
||||
return {}
|
||||
return {k: float(v) for k, v in raw.items() if isinstance(v, (int, float))}
|
||||
return raw
|
||||
|
||||
|
||||
# ── History operations ───────────────────────────────────────────────
|
||||
|
||||
@@ -9,7 +9,11 @@ from markitect.llm.adapter import LLMAdapter
|
||||
from markitect.llm.models import RunConfig, LLMResponse
|
||||
from markitect.llm.config import resolve_api_key, find_project_root
|
||||
from markitect.llm._http import post_json
|
||||
from markitect.llm.exceptions import LLMConfigurationError
|
||||
from markitect.llm.exceptions import (
|
||||
LLMConfigurationError,
|
||||
LLMAPIError,
|
||||
LLMRateLimitError,
|
||||
)
|
||||
|
||||
_DEFAULT_MODEL = "gemini-2.5-flash"
|
||||
_API_BASE = "https://generativelanguage.googleapis.com/v1beta"
|
||||
@@ -26,10 +30,12 @@ class GeminiAdapter(LLMAdapter):
|
||||
model: Optional[str] = None,
|
||||
api_key: Optional[str] = None,
|
||||
system_prompt: Optional[str] = None,
|
||||
max_retries: int = 3,
|
||||
**_kwargs: Any,
|
||||
):
|
||||
self._model = model or _DEFAULT_MODEL
|
||||
self._system_prompt = system_prompt
|
||||
self._max_retries = max_retries
|
||||
|
||||
root = find_project_root()
|
||||
key_file_paths = [root / "apikey-geminifree.txt"] if root else []
|
||||
@@ -77,7 +83,7 @@ class GeminiAdapter(LLMAdapter):
|
||||
url = f"{_API_BASE}/models/{model}:generateContent?key={self._api_key}"
|
||||
|
||||
start = time.time()
|
||||
data = post_json(url, payload, timeout=config.timeout_seconds)
|
||||
data = self._post_with_retries(url, payload, timeout=config.timeout_seconds)
|
||||
latency = time.time() - start
|
||||
|
||||
# Parse Gemini response
|
||||
@@ -113,3 +119,27 @@ class GeminiAdapter(LLMAdapter):
|
||||
if not (0.0 <= config.temperature <= 2.0):
|
||||
return False
|
||||
return True
|
||||
|
||||
# ── Internals ───────────────────────────────────────────────────
|
||||
|
||||
def _post_with_retries(
|
||||
self,
|
||||
url: str,
|
||||
payload: Dict[str, Any],
|
||||
timeout: int,
|
||||
) -> Dict[str, Any]:
|
||||
last_exc: Optional[Exception] = None
|
||||
for attempt in range(self._max_retries + 1):
|
||||
try:
|
||||
return post_json(url, payload, timeout=timeout)
|
||||
except LLMRateLimitError as exc:
|
||||
last_exc = exc
|
||||
if attempt < self._max_retries:
|
||||
time.sleep(2 ** attempt)
|
||||
except LLMAPIError as exc:
|
||||
if exc.status_code in (502, 503, 504) and attempt < self._max_retries:
|
||||
last_exc = exc
|
||||
time.sleep(2 ** attempt)
|
||||
else:
|
||||
raise
|
||||
raise last_exc # type: ignore[misc]
|
||||
|
||||
@@ -124,6 +124,33 @@ class TestMetricsFileIO:
|
||||
path.write_text("just a string", encoding="utf-8")
|
||||
assert read_metrics_file(path) == {}
|
||||
|
||||
def test_round_trip_preserves_structured_values(self, tmp_path):
|
||||
"""Non-numeric values like type_distribution must survive a round-trip.
|
||||
|
||||
Regression: eval-summary --update-metrics used to drop any key
|
||||
whose value wasn't a bare number, silently erasing type_distribution
|
||||
from the file on every run.
|
||||
"""
|
||||
path = tmp_path / "metrics.yaml"
|
||||
metrics = {
|
||||
"per_entity_mean": 3.9567,
|
||||
"vsm_type_matrix_cells": 29,
|
||||
"type_distribution": {
|
||||
"Element": 315,
|
||||
"Institution": 122,
|
||||
"Principle": 102,
|
||||
},
|
||||
}
|
||||
write_metrics_file(metrics, path)
|
||||
loaded = read_metrics_file(path)
|
||||
assert loaded["type_distribution"] == {
|
||||
"Element": 315, "Institution": 122, "Principle": 102,
|
||||
}
|
||||
# And the int stayed an int on disk, not 29.0.
|
||||
raw = path.read_text(encoding="utf-8")
|
||||
assert "vsm_type_matrix_cells: 29\n" in raw
|
||||
assert "vsm_type_matrix_cells: 29.0" not in raw
|
||||
|
||||
|
||||
# ── record_check_results ────────────────────────────────────────────
|
||||
|
||||
|
||||
82
tests/unit/llm/test_gemini.py
Normal file
82
tests/unit/llm/test_gemini.py
Normal file
@@ -0,0 +1,82 @@
|
||||
"""Tests for markitect.llm.gemini — retry behavior + happy path."""
|
||||
|
||||
from unittest import mock
|
||||
|
||||
import pytest
|
||||
|
||||
from markitect.llm.gemini import GeminiAdapter
|
||||
from markitect.llm.exceptions import LLMAPIError, LLMRateLimitError
|
||||
from markitect.prompts.execution.models import RunConfig, LLMResponse
|
||||
|
||||
|
||||
def _api_response(text="hello", model="gemini-2.5-flash"):
|
||||
return {
|
||||
"candidates": [
|
||||
{
|
||||
"content": {"parts": [{"text": text}], "role": "model"},
|
||||
"finishReason": "STOP",
|
||||
}
|
||||
],
|
||||
"modelVersion": model,
|
||||
"usageMetadata": {
|
||||
"promptTokenCount": 3,
|
||||
"candidatesTokenCount": 2,
|
||||
"totalTokenCount": 5,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
class TestGeminiAdapter:
|
||||
def _adapter(self, **kwargs):
|
||||
defaults = {"api_key": "AIza-test"}
|
||||
defaults.update(kwargs)
|
||||
return GeminiAdapter(**defaults)
|
||||
|
||||
@mock.patch("markitect.llm.gemini.post_json")
|
||||
def test_success(self, mock_post):
|
||||
mock_post.return_value = _api_response("generated")
|
||||
adapter = self._adapter()
|
||||
resp = adapter.execute_prompt("hi", RunConfig())
|
||||
assert isinstance(resp, LLMResponse)
|
||||
assert resp.content == "generated"
|
||||
assert resp.metadata["provider"] == "gemini"
|
||||
|
||||
@mock.patch("markitect.llm.gemini.post_json")
|
||||
@mock.patch("markitect.llm.gemini.time.sleep")
|
||||
def test_retry_on_429(self, mock_sleep, mock_post):
|
||||
mock_post.side_effect = [
|
||||
LLMRateLimitError("rate limited", status_code=429),
|
||||
_api_response("recovered"),
|
||||
]
|
||||
adapter = self._adapter(max_retries=2)
|
||||
resp = adapter.execute_prompt("hi", RunConfig())
|
||||
assert resp.content == "recovered"
|
||||
assert mock_sleep.call_count == 1
|
||||
|
||||
@mock.patch("markitect.llm.gemini.post_json")
|
||||
@mock.patch("markitect.llm.gemini.time.sleep")
|
||||
def test_retry_on_503(self, mock_sleep, mock_post):
|
||||
mock_post.side_effect = [
|
||||
LLMAPIError("unavailable", status_code=503),
|
||||
_api_response("back"),
|
||||
]
|
||||
adapter = self._adapter(max_retries=2)
|
||||
resp = adapter.execute_prompt("hi", RunConfig())
|
||||
assert resp.content == "back"
|
||||
|
||||
@mock.patch("markitect.llm.gemini.post_json")
|
||||
def test_no_retry_on_400(self, mock_post):
|
||||
mock_post.side_effect = LLMAPIError("bad request", status_code=400)
|
||||
adapter = self._adapter(max_retries=2)
|
||||
with pytest.raises(LLMAPIError) as exc_info:
|
||||
adapter.execute_prompt("hi", RunConfig())
|
||||
assert exc_info.value.status_code == 400
|
||||
|
||||
@mock.patch("markitect.llm.gemini.post_json")
|
||||
@mock.patch("markitect.llm.gemini.time.sleep")
|
||||
def test_exhausted_retries_raises(self, mock_sleep, mock_post):
|
||||
mock_post.side_effect = LLMRateLimitError("rate limited", status_code=429)
|
||||
adapter = self._adapter(max_retries=1)
|
||||
with pytest.raises(LLMRateLimitError):
|
||||
adapter.execute_prompt("hi", RunConfig())
|
||||
assert mock_sleep.call_count == 1 # 1 retry before giving up
|
||||
Reference in New Issue
Block a user