feat(infospace,llm): stabilize free-tier eval workflow

Five improvements that eliminate most of the agent-in-the-loop friction observed while closing out the 988-entity WoN evaluation (C.1): 1. Gemini adapter now retries on 429 + 5xx with exponential backoff (same pattern already used by OpenRouter/OpenAI). Removes the need for shell-level retry wrappers when hitting free-tier rate limits. 2. evaluate CLI prints the underlying error ("ERROR — HTTP 503 …") instead of a bare "ERROR", so agents don't have to drop into Python to diagnose transient failures. 3. --entity/--chapter now respect existing evaluation files by default (previously only the full-collection pass did). New --force flag opts into re-evaluation. Stops silently burning free-tier quota on re-runs of the same slug. 4. --entity accepts hyphenated slugs (matching entity filenames) and normalizes them to the underscore form used on disk. On a miss the CLI suggests near matches instead of a bare "not found". 5. eval-summary --update-metrics is no longer destructive: read_metrics_file/write_metrics_file preserve structured values (type_distribution) and don't flatten ints to floats. Fixes a silent data loss observed on every run. Bonus: the evaluator field in written evaluation frontmatter now falls back from run_config.model_name to the adapter's resolved model (or the model echoed back in the API response), so rows no longer show `evaluator: null` when --model is omitted. Tests: new tests/unit/llm/test_gemini.py covers retry behavior; tests/unit/infospace/test_history.py gains a round-trip test that pins the type_distribution / int-preservation invariants. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-22 00:51:00 +02:00
parent 965508ec06
commit c0615c2d50
6 changed files with 210 additions and 27 deletions
--- a/markitect/infospace/cli.py
+++ b/markitect/infospace/cli.py
@@ -237,7 +237,9 @@ def _entities_by_type(cfg, root: "Path", entity_list: list) -> None:
@click.option("--model", default=None, help="LLM model name.")
@click.option("--entity", "entity_slug", default=None, help="Evaluate a single entity by slug.")
@click.option("--chapter", default=None, help="Evaluate entities from a specific chapter.")
-def evaluate(config_path, provider, model, entity_slug, chapter):
+@click.option("--force", is_flag=True, default=False,
+              help="Re-evaluate entities whose evaluation file already exists.")
+def evaluate(config_path, provider, model, entity_slug, chapter, force):
    """Evaluate entities using LLM-based quality assessment."""
    cfg, cfg_path = _load_config_or_exit(config_path)
    root = cfg_path.parent
@@ -252,32 +254,44 @@ def evaluate(config_path, provider, model, entity_slug, chapter):
        click.echo("No entities to evaluate.")
        return

-    # Filter
+    # Filter. Accept hyphenated input for --entity by normalizing to the
+    # underscore slug format produced by parse_entity_directory.
    if entity_slug:
-        entity_list = [e for e in entity_list if e.slug == entity_slug]
-        if not entity_list:
-            click.echo(f"Error: Entity '{entity_slug}' not found.", err=True)
+        normalized = entity_slug.replace("-", "_")
+        matches = [e for e in entity_list if e.slug == normalized]
+        if not matches:
+            # Build a short "did you mean…" list from entities sharing a stem.
+            stem = normalized.split("_", 1)[0]
+            near = sorted(e.slug for e in entity_list if e.slug.startswith(stem))[:5]
+            msg = f"Error: Entity '{entity_slug}' not found."
+            if near:
+                msg += f" Did you mean: {', '.join(near)} ?"
+            click.echo(msg, err=True)
            raise SystemExit(1)
+        entity_list = matches
    elif chapter:
        entity_list = [e for e in entity_list if chapter in e.source_chapter]
        if not entity_list:
            click.echo(f"No entities found for chapter '{chapter}'.")
            return

-    # Skip entities that already have evaluation files (incremental resume)
+    # Skip entities that already have evaluation files (incremental resume).
+    # Applies uniformly to full-pass, --entity, and --chapter runs unless
+    # --force is set.
    from markitect.infospace.evaluate import run_entity_evaluation
    output_dir = root / cfg.evaluations_dir
-    if not entity_slug and not chapter and output_dir.is_dir():
-        previous_digests = {
-            p.stem: ""  # non-empty sentinel → triggers skip in BatchEvaluator
-            for p in output_dir.glob("*.md")
-        }
-        entity_list = [e for e in entity_list if e.slug not in previous_digests]
+    if not force and output_dir.is_dir():
+        existing = {p.stem for p in output_dir.glob("*.md")}
+        before = len(entity_list)
+        entity_list = [e for e in entity_list if e.slug not in existing]
+        skipped = before - len(entity_list)
        if not entity_list:
-            click.echo("All entities already evaluated. Nothing to do.")
+            click.echo("All selected entities already evaluated. "
+                       "Re-run with --force to overwrite.")
            return
-        if previous_digests:
-            click.echo(f"Skipping {len(previous_digests)} already-evaluated entities.")
+        if skipped:
+            click.echo(f"Skipping {skipped} already-evaluated entities. "
+                       "Use --force to re-evaluate.")

    # Create adapter
    from markitect.llm import create_adapter
@@ -285,10 +299,14 @@ def evaluate(config_path, provider, model, entity_slug, chapter):
    adapter = create_adapter(provider, model=model)
    run_config = RunConfig(model_name=model, temperature=0.3, max_tokens=2000)

-    # Progress callback
+    # Progress callback — surface error detail so agents don't have to
+    # drop into Python to see whether an ERROR was 429, 503, or auth.
    def on_progress(done, total, result):
        status = result.status.upper()
-        click.echo(f"  [{done}/{total}] {result.key}: {status}")
+        if status == "ERROR" and result.error:
+            click.echo(f"  [{done}/{total}] {result.key}: ERROR — {result.error}")
+        else:
+            click.echo(f"  [{done}/{total}] {result.key}: {status}")

    click.echo(f"Evaluating {len(entity_list)} entities via {provider}...")

--- a/markitect/infospace/evaluate.py
+++ b/markitect/infospace/evaluate.py
@@ -195,12 +195,23 @@ def run_entity_evaluation(
    """
    topic = config.topic.name
    evaluations_path = output_dir or Path(config.evaluations_dir)
-    evaluator_name = (run_config.model_name if run_config else "unknown")
+    # Fall back from run_config.model_name (may be None if the CLI user did
+    # not pass --model) to the adapter's resolved model, and only then to
+    # "unknown". Keeps the evaluator field in the written frontmatter
+    # informative for later audits.
+    default_evaluator = (
+        (run_config.model_name if run_config else None)
+        or getattr(adapter, "_model", None)
+        or "unknown"
+    )

    def _write_and_notify(done: int, total: int, result) -> None:
        # Write file immediately on success (incremental — run is resumable)
        if result.status == "success" and result.response is not None:
            scores = parse_evaluation_response(result.response.content, dimensions)
+            # Prefer the model name the adapter actually echoed back — it
+            # reflects post-resolution fallbacks (e.g. flash → flash-lite).
+            evaluator_name = result.response.model or default_evaluator
            evaluation = EntityEvaluation(
                entity_slug=result.key,
                evaluator=evaluator_name,
--- a/markitect/infospace/history.py
+++ b/markitect/infospace/history.py
@@ -81,17 +81,26 @@ def snapshot_from_checks(
 # ── Metrics file I/O ────────────────────────────────────────────────


-def write_metrics_file(metrics: Dict[str, float], path: Path) -> None:
+def write_metrics_file(metrics: Dict[str, Any], path: Path) -> None:
    """Write the latest metrics to a simple YAML file.

    This file is used by ``markitect infospace viability`` for quick
-    threshold checking.
+    threshold checking. Non-numeric values (e.g. ``type_distribution``)
+    are passed through unchanged; floats are rounded to 6 dp; ints are
+    preserved as ints so external consumers don't see ``29`` silently
+    become ``29.0`` on every round-trip.
    """
+    def _normalize(v: Any) -> Any:
+        if isinstance(v, bool):
+            return v
+        if isinstance(v, float):
+            return round(v, 6)
+        return v
+
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(
        yaml.safe_dump(
-            {k: round(v, 6) if isinstance(v, float) else v
-             for k, v in sorted(metrics.items())},
+            {k: _normalize(v) for k, v in sorted(metrics.items())},
            default_flow_style=False,
            sort_keys=True,
        ),
@@ -99,14 +108,20 @@ def write_metrics_file(metrics: Dict[str, float], path: Path) -> None:
    )


-def read_metrics_file(path: Path) -> Dict[str, float]:
-    """Read the latest metrics from a YAML file."""
+def read_metrics_file(path: Path) -> Dict[str, Any]:
+    """Read the latest metrics from a YAML file.
+
+    Returns all keys as written on disk, preserving types verbatim so a
+    round-trip via :func:`write_metrics_file` does not silently drop
+    structured values (e.g. ``type_distribution``) or flatten ints to
+    floats.
+    """
    if not path.is_file():
        return {}
    raw = yaml.safe_load(path.read_text(encoding="utf-8"))
    if not isinstance(raw, dict):
        return {}
-    return {k: float(v) for k, v in raw.items() if isinstance(v, (int, float))}
+    return raw


 # ── History operations ───────────────────────────────────────────────
--- a/markitect/llm/gemini.py
+++ b/markitect/llm/gemini.py
@@ -9,7 +9,11 @@ from markitect.llm.adapter import LLMAdapter
 from markitect.llm.models import RunConfig, LLMResponse
 from markitect.llm.config import resolve_api_key, find_project_root
 from markitect.llm._http import post_json
-from markitect.llm.exceptions import LLMConfigurationError
+from markitect.llm.exceptions import (
+    LLMConfigurationError,
+    LLMAPIError,
+    LLMRateLimitError,
+)

 _DEFAULT_MODEL = "gemini-2.5-flash"
 _API_BASE = "https://generativelanguage.googleapis.com/v1beta"
@@ -26,10 +30,12 @@ class GeminiAdapter(LLMAdapter):
        model: Optional[str] = None,
        api_key: Optional[str] = None,
        system_prompt: Optional[str] = None,
+        max_retries: int = 3,
        **_kwargs: Any,
    ):
        self._model = model or _DEFAULT_MODEL
        self._system_prompt = system_prompt
+        self._max_retries = max_retries

        root = find_project_root()
        key_file_paths = [root / "apikey-geminifree.txt"] if root else []
@@ -77,7 +83,7 @@ class GeminiAdapter(LLMAdapter):
        url = f"{_API_BASE}/models/{model}:generateContent?key={self._api_key}"

        start = time.time()
-        data = post_json(url, payload, timeout=config.timeout_seconds)
+        data = self._post_with_retries(url, payload, timeout=config.timeout_seconds)
        latency = time.time() - start

        # Parse Gemini response
@@ -113,3 +119,27 @@ class GeminiAdapter(LLMAdapter):
        if not (0.0 <= config.temperature <= 2.0):
            return False
        return True
+
+    # ── Internals ───────────────────────────────────────────────────
+
+    def _post_with_retries(
+        self,
+        url: str,
+        payload: Dict[str, Any],
+        timeout: int,
+    ) -> Dict[str, Any]:
+        last_exc: Optional[Exception] = None
+        for attempt in range(self._max_retries + 1):
+            try:
+                return post_json(url, payload, timeout=timeout)
+            except LLMRateLimitError as exc:
+                last_exc = exc
+                if attempt < self._max_retries:
+                    time.sleep(2 ** attempt)
+            except LLMAPIError as exc:
+                if exc.status_code in (502, 503, 504) and attempt < self._max_retries:
+                    last_exc = exc
+                    time.sleep(2 ** attempt)
+                else:
+                    raise
+        raise last_exc  # type: ignore[misc]
--- a/tests/unit/infospace/test_history.py
+++ b/tests/unit/infospace/test_history.py
@@ -124,6 +124,33 @@ class TestMetricsFileIO:
        path.write_text("just a string", encoding="utf-8")
        assert read_metrics_file(path) == {}

+    def test_round_trip_preserves_structured_values(self, tmp_path):
+        """Non-numeric values like type_distribution must survive a round-trip.
+
+        Regression: eval-summary --update-metrics used to drop any key
+        whose value wasn't a bare number, silently erasing type_distribution
+        from the file on every run.
+        """
+        path = tmp_path / "metrics.yaml"
+        metrics = {
+            "per_entity_mean": 3.9567,
+            "vsm_type_matrix_cells": 29,
+            "type_distribution": {
+                "Element": 315,
+                "Institution": 122,
+                "Principle": 102,
+            },
+        }
+        write_metrics_file(metrics, path)
+        loaded = read_metrics_file(path)
+        assert loaded["type_distribution"] == {
+            "Element": 315, "Institution": 122, "Principle": 102,
+        }
+        # And the int stayed an int on disk, not 29.0.
+        raw = path.read_text(encoding="utf-8")
+        assert "vsm_type_matrix_cells: 29\n" in raw
+        assert "vsm_type_matrix_cells: 29.0" not in raw
+

 # ── record_check_results ────────────────────────────────────────────

--- a/tests/unit/llm/test_gemini.py
+++ b/tests/unit/llm/test_gemini.py
@@ -0,0 +1,82 @@
+"""Tests for markitect.llm.gemini — retry behavior + happy path."""
+
+from unittest import mock
+
+import pytest
+
+from markitect.llm.gemini import GeminiAdapter
+from markitect.llm.exceptions import LLMAPIError, LLMRateLimitError
+from markitect.prompts.execution.models import RunConfig, LLMResponse
+
+
+def _api_response(text="hello", model="gemini-2.5-flash"):
+    return {
+        "candidates": [
+            {
+                "content": {"parts": [{"text": text}], "role": "model"},
+                "finishReason": "STOP",
+            }
+        ],
+        "modelVersion": model,
+        "usageMetadata": {
+            "promptTokenCount": 3,
+            "candidatesTokenCount": 2,
+            "totalTokenCount": 5,
+        },
+    }
+
+
+class TestGeminiAdapter:
+    def _adapter(self, **kwargs):
+        defaults = {"api_key": "AIza-test"}
+        defaults.update(kwargs)
+        return GeminiAdapter(**defaults)
+
+    @mock.patch("markitect.llm.gemini.post_json")
+    def test_success(self, mock_post):
+        mock_post.return_value = _api_response("generated")
+        adapter = self._adapter()
+        resp = adapter.execute_prompt("hi", RunConfig())
+        assert isinstance(resp, LLMResponse)
+        assert resp.content == "generated"
+        assert resp.metadata["provider"] == "gemini"
+
+    @mock.patch("markitect.llm.gemini.post_json")
+    @mock.patch("markitect.llm.gemini.time.sleep")
+    def test_retry_on_429(self, mock_sleep, mock_post):
+        mock_post.side_effect = [
+            LLMRateLimitError("rate limited", status_code=429),
+            _api_response("recovered"),
+        ]
+        adapter = self._adapter(max_retries=2)
+        resp = adapter.execute_prompt("hi", RunConfig())
+        assert resp.content == "recovered"
+        assert mock_sleep.call_count == 1
+
+    @mock.patch("markitect.llm.gemini.post_json")
+    @mock.patch("markitect.llm.gemini.time.sleep")
+    def test_retry_on_503(self, mock_sleep, mock_post):
+        mock_post.side_effect = [
+            LLMAPIError("unavailable", status_code=503),
+            _api_response("back"),
+        ]
+        adapter = self._adapter(max_retries=2)
+        resp = adapter.execute_prompt("hi", RunConfig())
+        assert resp.content == "back"
+
+    @mock.patch("markitect.llm.gemini.post_json")
+    def test_no_retry_on_400(self, mock_post):
+        mock_post.side_effect = LLMAPIError("bad request", status_code=400)
+        adapter = self._adapter(max_retries=2)
+        with pytest.raises(LLMAPIError) as exc_info:
+            adapter.execute_prompt("hi", RunConfig())
+        assert exc_info.value.status_code == 400
+
+    @mock.patch("markitect.llm.gemini.post_json")
+    @mock.patch("markitect.llm.gemini.time.sleep")
+    def test_exhausted_retries_raises(self, mock_sleep, mock_post):
+        mock_post.side_effect = LLMRateLimitError("rate limited", status_code=429)
+        adapter = self._adapter(max_retries=1)
+        with pytest.raises(LLMRateLimitError):
+            adapter.execute_prompt("hi", RunConfig())
+        assert mock_sleep.call_count == 1  # 1 retry before giving up