feat(infospace,llm): stabilize free-tier eval workflow

Five improvements that eliminate most of the agent-in-the-loop friction
observed while closing out the 988-entity WoN evaluation (C.1):

1. Gemini adapter now retries on 429 + 5xx with exponential backoff (same
   pattern already used by OpenRouter/OpenAI). Removes the need for
   shell-level retry wrappers when hitting free-tier rate limits.

2. evaluate CLI prints the underlying error ("ERROR — HTTP 503 …") instead
   of a bare "ERROR", so agents don't have to drop into Python to diagnose
   transient failures.

3. --entity/--chapter now respect existing evaluation files by default
   (previously only the full-collection pass did). New --force flag opts
   into re-evaluation. Stops silently burning free-tier quota on re-runs of
   the same slug.

4. --entity accepts hyphenated slugs (matching entity filenames) and
   normalizes them to the underscore form used on disk. On a miss the CLI
   suggests near matches instead of a bare "not found".

5. eval-summary --update-metrics is no longer destructive:
   read_metrics_file/write_metrics_file preserve structured values
   (type_distribution) and don't flatten ints to floats. Fixes a silent data
   loss observed on every run.

Bonus: the evaluator field in written evaluation frontmatter now falls back
from run_config.model_name to the adapter's resolved model (or the model
echoed back in the API response), so rows no longer show `evaluator: null`
when --model is omitted.

Tests: new tests/unit/llm/test_gemini.py covers retry behavior;
tests/unit/infospace/test_history.py gains a round-trip test that pins the
type_distribution / int-preservation invariants.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-22 00:51:00 +02:00
parent 965508ec06
commit c0615c2d50
6 changed files with 210 additions and 27 deletions
--- a/tests/unit/infospace/test_history.py
+++ b/tests/unit/infospace/test_history.py
@@ -124,6 +124,33 @@ class TestMetricsFileIO:
        path.write_text("just a string", encoding="utf-8")
        assert read_metrics_file(path) == {}

+    def test_round_trip_preserves_structured_values(self, tmp_path):
+        """Write-then-read keeps nested mappings and integer typing intact.
+
+        Regression guard: eval-summary --update-metrics previously discarded
+        any key whose value was not a bare number, so type_distribution
+        silently vanished from the file on every run.
+        """
+        expected_distribution = {
+            "Element": 315,
+            "Institution": 122,
+            "Principle": 102,
+        }
+        metrics_path = tmp_path / "metrics.yaml"
+        payload = {
+            "per_entity_mean": 3.9567,
+            "vsm_type_matrix_cells": 29,
+            "type_distribution": dict(expected_distribution),
+        }
+        write_metrics_file(payload, metrics_path)
+        reloaded = read_metrics_file(metrics_path)
+        assert reloaded["type_distribution"] == expected_distribution
+
+        # The integer must hit the disk as 29, never widened to 29.0.
+        on_disk = metrics_path.read_text(encoding="utf-8")
+        assert "vsm_type_matrix_cells: 29\n" in on_disk
+        assert "vsm_type_matrix_cells: 29.0" not in on_disk
+

 # ── record_check_results ────────────────────────────────────────────

--- a/tests/unit/llm/test_gemini.py
+++ b/tests/unit/llm/test_gemini.py
@@ -0,0 +1,82 @@
+"""Tests for markitect.llm.gemini — retry behavior + happy path."""
+
+from unittest import mock
+
+import pytest
+
+from markitect.llm.gemini import GeminiAdapter
+from markitect.llm.exceptions import LLMAPIError, LLMRateLimitError
+from markitect.prompts.execution.models import RunConfig, LLMResponse
+
+
+def _api_response(text="hello", model="gemini-2.5-flash"):
+    """Return a canned Gemini response body with a single STOP candidate."""
+    content = {"parts": [{"text": text}], "role": "model"}
+    # Fixed token counts: prompt 3 + candidates 2 = total 5.
+    usage = {
+        "promptTokenCount": 3,
+        "candidatesTokenCount": 2,
+        "totalTokenCount": 5,
+    }
+
+    return {
+        "candidates": [{"content": content, "finishReason": "STOP"}],
+        "modelVersion": model,
+        "usageMetadata": usage,
+    }
+
+
+class TestGeminiAdapter:
+    """Happy-path and retry behavior of GeminiAdapter.execute_prompt."""
+
+    def _adapter(self, **kwargs):
+        """Build an adapter with a dummy API key; kwargs extend or override it."""
+        return GeminiAdapter(**{"api_key": "AIza-test", **kwargs})
+
+    @mock.patch("markitect.llm.gemini.post_json")
+    def test_success(self, mock_post):
+        mock_post.return_value = _api_response("generated")
+        resp = self._adapter().execute_prompt("hi", RunConfig())
+        assert isinstance(resp, LLMResponse)
+        assert resp.content == "generated"
+        assert resp.metadata["provider"] == "gemini"
+
+    @mock.patch("markitect.llm.gemini.post_json")
+    @mock.patch("markitect.llm.gemini.time.sleep")
+    def test_retry_on_429(self, mock_sleep, mock_post):
+        rate_limited = LLMRateLimitError("rate limited", status_code=429)
+        mock_post.side_effect = [rate_limited, _api_response("recovered")]
+        resp = self._adapter(max_retries=2).execute_prompt("hi", RunConfig())
+        assert resp.content == "recovered"
+        assert mock_sleep.call_count == 1
+
+    @mock.patch("markitect.llm.gemini.post_json")
+    @mock.patch("markitect.llm.gemini.time.sleep")
+    def test_retry_on_503(self, mock_sleep, mock_post):
+        unavailable = LLMAPIError("unavailable", status_code=503)
+        mock_post.side_effect = [unavailable, _api_response("back")]
+        resp = self._adapter(max_retries=2).execute_prompt("hi", RunConfig())
+        assert resp.content == "back"
+        assert mock_sleep.call_count == 1  # same backoff as the 429 path
+
+    @mock.patch("markitect.llm.gemini.post_json")
+    @mock.patch("markitect.llm.gemini.time.sleep")
+    def test_no_retry_on_400(self, mock_sleep, mock_post):
+        """A 400 is raised after a single attempt, without any backoff."""
+        mock_post.side_effect = LLMAPIError("bad request", status_code=400)
+        with pytest.raises(LLMAPIError) as exc_info:
+            self._adapter(max_retries=2).execute_prompt("hi", RunConfig())
+        assert exc_info.value.status_code == 400
+        # Every attempt re-raises the same 400, so only the call counts can
+        # tell "raised immediately" apart from "retried, then raised".
+        assert mock_post.call_count == 1
+        assert mock_sleep.call_count == 0
+
+    @mock.patch("markitect.llm.gemini.post_json")
+    @mock.patch("markitect.llm.gemini.time.sleep")
+    def test_exhausted_retries_raises(self, mock_sleep, mock_post):
+        """A persistent 429 propagates once the retry budget is spent."""
+        mock_post.side_effect = LLMRateLimitError("rate limited", status_code=429)
+        with pytest.raises(LLMRateLimitError):
+            self._adapter(max_retries=1).execute_prompt("hi", RunConfig())
+        assert mock_sleep.call_count == 1  # 1 retry before giving up