feat(infospace,llm): stabilize free-tier eval workflow
Some checks failed
Test Suite / code-quality (push) Has been cancelled
Test Suite / security-scan (push) Has been cancelled
Test Suite / unit-tests (3.11) (push) Has been cancelled
Test Suite / unit-tests (3.12) (push) Has been cancelled
Test Suite / integration-tests (push) Has been cancelled
Test Suite / e2e-tests (push) Has been cancelled
Test Suite / performance-tests (push) Has been cancelled
Test Suite / test-summary (push) Has been cancelled
Some checks failed
Test Suite / code-quality (push) Has been cancelled
Test Suite / security-scan (push) Has been cancelled
Test Suite / unit-tests (3.11) (push) Has been cancelled
Test Suite / unit-tests (3.12) (push) Has been cancelled
Test Suite / integration-tests (push) Has been cancelled
Test Suite / e2e-tests (push) Has been cancelled
Test Suite / performance-tests (push) Has been cancelled
Test Suite / test-summary (push) Has been cancelled
Five improvements that eliminate most of the agent-in-the-loop friction
observed while closing out the 988-entity WoN evaluation (C.1):
1. Gemini adapter now retries on 429 + 5xx with exponential backoff
(same pattern already used by OpenRouter/OpenAI). Removes the need
for shell-level retry wrappers when hitting free-tier rate limits.
2. evaluate CLI prints the underlying error ("ERROR — HTTP 503 …")
instead of a bare "ERROR", so agents don't have to drop into Python
to diagnose transient failures.
3. --entity/--chapter now respect existing evaluation files by default
(previously only the full-collection pass did). New --force flag
opts into re-evaluation. Stops silently burning free-tier quota on
re-runs of the same slug.
4. --entity accepts hyphenated slugs (matching entity filenames) and
normalizes them to the underscore form used on disk. On a miss the
CLI suggests near matches instead of a bare "not found".
5. eval-summary --update-metrics is no longer destructive:
read_metrics_file/write_metrics_file preserve structured values
(type_distribution) and don't flatten ints to floats. Fixes a
silent data loss observed on every run.
Bonus: the evaluator field in written evaluation frontmatter now
falls back from run_config.model_name to the adapter's resolved model
(or the model echoed back in the API response), so rows no longer
show `evaluator: null` when --model is omitted.
Tests: new tests/unit/llm/test_gemini.py covers retry behavior;
tests/unit/infospace/test_history.py gains a round-trip test that
pins the type_distribution / int-preservation invariants.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -124,6 +124,33 @@ class TestMetricsFileIO:
|
||||
path.write_text("just a string", encoding="utf-8")
|
||||
assert read_metrics_file(path) == {}
|
||||
|
||||
def test_round_trip_preserves_structured_values(self, tmp_path):
    """Structured (non-numeric) metric values must survive write + read.

    Regression guard: ``eval-summary --update-metrics`` used to drop any
    key whose value wasn't a bare number, silently erasing
    ``type_distribution`` from the metrics file on every run.
    """
    type_counts = {"Element": 315, "Institution": 122, "Principle": 102}
    metrics_path = tmp_path / "metrics.yaml"

    write_metrics_file(
        {
            "per_entity_mean": 3.9567,
            "vsm_type_matrix_cells": 29,
            "type_distribution": dict(type_counts),
        },
        metrics_path,
    )
    round_tripped = read_metrics_file(metrics_path)

    # The nested mapping comes back intact.
    assert round_tripped["type_distribution"] == type_counts

    # On disk the integer is still written as an int, never as 29.0.
    on_disk = metrics_path.read_text(encoding="utf-8")
    assert "vsm_type_matrix_cells: 29\n" in on_disk
    assert "vsm_type_matrix_cells: 29.0" not in on_disk
|
||||
|
||||
|
||||
# ── record_check_results ────────────────────────────────────────────
|
||||
|
||||
|
||||
82
tests/unit/llm/test_gemini.py
Normal file
82
tests/unit/llm/test_gemini.py
Normal file
@@ -0,0 +1,82 @@
|
||||
"""Tests for markitect.llm.gemini — retry behavior + happy path."""
|
||||
|
||||
from unittest import mock
|
||||
|
||||
import pytest
|
||||
|
||||
from markitect.llm.gemini import GeminiAdapter
|
||||
from markitect.llm.exceptions import LLMAPIError, LLMRateLimitError
|
||||
from markitect.prompts.execution.models import RunConfig, LLMResponse
|
||||
|
||||
|
||||
def _api_response(text="hello", model="gemini-2.5-flash"):
|
||||
return {
|
||||
"candidates": [
|
||||
{
|
||||
"content": {"parts": [{"text": text}], "role": "model"},
|
||||
"finishReason": "STOP",
|
||||
}
|
||||
],
|
||||
"modelVersion": model,
|
||||
"usageMetadata": {
|
||||
"promptTokenCount": 3,
|
||||
"candidatesTokenCount": 2,
|
||||
"totalTokenCount": 5,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
class TestGeminiAdapter:
    """Happy-path and retry-behavior tests for ``GeminiAdapter``.

    ``post_json`` is always mocked, so no network traffic occurs, and
    ``time.sleep`` is mocked wherever a backoff could be triggered so the
    tests never actually wait.
    """

    def _adapter(self, **kwargs):
        """Build a ``GeminiAdapter`` with a dummy API key plus any overrides."""
        defaults = {"api_key": "AIza-test"}
        defaults.update(kwargs)
        return GeminiAdapter(**defaults)

    @mock.patch("markitect.llm.gemini.post_json")
    def test_success(self, mock_post):
        """A successful call yields an ``LLMResponse`` tagged as gemini."""
        mock_post.return_value = _api_response("generated")
        adapter = self._adapter()
        resp = adapter.execute_prompt("hi", RunConfig())
        assert isinstance(resp, LLMResponse)
        assert resp.content == "generated"
        assert resp.metadata["provider"] == "gemini"

    @mock.patch("markitect.llm.gemini.post_json")
    @mock.patch("markitect.llm.gemini.time.sleep")
    def test_retry_on_429(self, mock_sleep, mock_post):
        """A 429 followed by a success is retried after one backoff sleep."""
        mock_post.side_effect = [
            LLMRateLimitError("rate limited", status_code=429),
            _api_response("recovered"),
        ]
        adapter = self._adapter(max_retries=2)
        resp = adapter.execute_prompt("hi", RunConfig())
        assert resp.content == "recovered"
        assert mock_sleep.call_count == 1

    @mock.patch("markitect.llm.gemini.post_json")
    @mock.patch("markitect.llm.gemini.time.sleep")
    def test_retry_on_503(self, mock_sleep, mock_post):
        """A 503 followed by a success is retried after one backoff sleep."""
        mock_post.side_effect = [
            LLMAPIError("unavailable", status_code=503),
            _api_response("back"),
        ]
        adapter = self._adapter(max_retries=2)
        resp = adapter.execute_prompt("hi", RunConfig())
        assert resp.content == "back"
        # Mirror the 429 test: exactly one backoff between the two attempts.
        assert mock_sleep.call_count == 1

    @mock.patch("markitect.llm.gemini.post_json")
    @mock.patch("markitect.llm.gemini.time.sleep")
    def test_no_retry_on_400(self, mock_sleep, mock_post):
        """A 400 is raised immediately: one request, no backoff."""
        mock_post.side_effect = LLMAPIError("bad request", status_code=400)
        adapter = self._adapter(max_retries=2)
        with pytest.raises(LLMAPIError) as exc_info:
            adapter.execute_prompt("hi", RunConfig())
        assert exc_info.value.status_code == 400
        # The error alone would also propagate after exhausted retries, so
        # pin the "no retry" part explicitly.
        assert mock_post.call_count == 1
        mock_sleep.assert_not_called()

    @mock.patch("markitect.llm.gemini.post_json")
    @mock.patch("markitect.llm.gemini.time.sleep")
    def test_exhausted_retries_raises(self, mock_sleep, mock_post):
        """Persistent 429s raise once the retry budget is used up."""
        mock_post.side_effect = LLMRateLimitError("rate limited", status_code=429)
        adapter = self._adapter(max_retries=1)
        with pytest.raises(LLMRateLimitError):
            adapter.execute_prompt("hi", RunConfig())
        assert mock_sleep.call_count == 1  # 1 retry before giving up
|
||||
Reference in New Issue
Block a user