diff --git a/markitect/infospace/cli.py b/markitect/infospace/cli.py index 98ba94da..0a8073f5 100644 --- a/markitect/infospace/cli.py +++ b/markitect/infospace/cli.py @@ -237,7 +237,9 @@ def _entities_by_type(cfg, root: "Path", entity_list: list) -> None: @click.option("--model", default=None, help="LLM model name.") @click.option("--entity", "entity_slug", default=None, help="Evaluate a single entity by slug.") @click.option("--chapter", default=None, help="Evaluate entities from a specific chapter.") -def evaluate(config_path, provider, model, entity_slug, chapter): +@click.option("--force", is_flag=True, default=False, + help="Re-evaluate entities whose evaluation file already exists.") +def evaluate(config_path, provider, model, entity_slug, chapter, force): """Evaluate entities using LLM-based quality assessment.""" cfg, cfg_path = _load_config_or_exit(config_path) root = cfg_path.parent @@ -252,32 +254,44 @@ def evaluate(config_path, provider, model, entity_slug, chapter): click.echo("No entities to evaluate.") return - # Filter + # Filter. Accept hyphenated input for --entity by normalizing to the + # underscore slug format produced by parse_entity_directory. if entity_slug: - entity_list = [e for e in entity_list if e.slug == entity_slug] - if not entity_list: - click.echo(f"Error: Entity '{entity_slug}' not found.", err=True) + normalized = entity_slug.replace("-", "_") + matches = [e for e in entity_list if e.slug == normalized] + if not matches: + # Build a short "did you mean…" list from entities sharing a stem. + stem = normalized.split("_", 1)[0] + near = sorted(e.slug for e in entity_list if e.slug.startswith(stem))[:5] + msg = f"Error: Entity '{entity_slug}' not found." + if near: + msg += f" Did you mean: {', '.join(near)} ?" + click.echo(msg, err=True) raise SystemExit(1) + entity_list = matches elif chapter: entity_list = [e for e in entity_list if chapter in e.source_chapter] if not entity_list: click.echo(f"No entities found for chapter '{chapter}'.") return - # Skip entities that already have evaluation files (incremental resume) + # Skip entities that already have evaluation files (incremental resume). + # Applies uniformly to full-pass, --entity, and --chapter runs unless + # --force is set. from markitect.infospace.evaluate import run_entity_evaluation output_dir = root / cfg.evaluations_dir - if not entity_slug and not chapter and output_dir.is_dir(): - previous_digests = { - p.stem: "" # non-empty sentinel → triggers skip in BatchEvaluator - for p in output_dir.glob("*.md") - } - entity_list = [e for e in entity_list if e.slug not in previous_digests] + if not force and output_dir.is_dir(): + existing = {p.stem for p in output_dir.glob("*.md")} + before = len(entity_list) + entity_list = [e for e in entity_list if e.slug not in existing] + skipped = before - len(entity_list) if not entity_list: - click.echo("All entities already evaluated. Nothing to do.") + click.echo("All selected entities already evaluated. " + "Re-run with --force to overwrite.") return - if previous_digests: - click.echo(f"Skipping {len(previous_digests)} already-evaluated entities.") + if skipped: + click.echo(f"Skipping {skipped} already-evaluated entities. " + "Use --force to re-evaluate.") # Create adapter from markitect.llm import create_adapter @@ -285,10 +299,14 @@ def evaluate(config_path, provider, model, entity_slug, chapter): adapter = create_adapter(provider, model=model) run_config = RunConfig(model_name=model, temperature=0.3, max_tokens=2000) - # Progress callback + # Progress callback — surface error detail so agents don't have to + # drop into Python to see whether an ERROR was 429, 503, or auth. def on_progress(done, total, result): status = result.status.upper() - click.echo(f" [{done}/{total}] {result.key}: {status}") + if status == "ERROR" and result.error: + click.echo(f" [{done}/{total}] {result.key}: ERROR — {result.error}") + else: + click.echo(f" [{done}/{total}] {result.key}: {status}") click.echo(f"Evaluating {len(entity_list)} entities via {provider}...") diff --git a/markitect/infospace/evaluate.py b/markitect/infospace/evaluate.py index b755f49d..a1886e4d 100644 --- a/markitect/infospace/evaluate.py +++ b/markitect/infospace/evaluate.py @@ -195,12 +195,23 @@ def run_entity_evaluation( """ topic = config.topic.name evaluations_path = output_dir or Path(config.evaluations_dir) - evaluator_name = (run_config.model_name if run_config else "unknown") + # Fall back from run_config.model_name (may be None if the CLI user did + # not pass --model) to the adapter's resolved model, and only then to + # "unknown". Keeps the evaluator field in the written frontmatter + # informative for later audits. + default_evaluator = ( + (run_config.model_name if run_config else None) + or getattr(adapter, "_model", None) + or "unknown" + ) def _write_and_notify(done: int, total: int, result) -> None: # Write file immediately on success (incremental — run is resumable) if result.status == "success" and result.response is not None: scores = parse_evaluation_response(result.response.content, dimensions) + # Prefer the model name the adapter actually echoed back — it + # reflects post-resolution fallbacks (e.g. flash → flash-lite). + evaluator_name = result.response.model or default_evaluator evaluation = EntityEvaluation( entity_slug=result.key, evaluator=evaluator_name, diff --git a/markitect/infospace/history.py b/markitect/infospace/history.py index dca508d4..3f6f6790 100644 --- a/markitect/infospace/history.py +++ b/markitect/infospace/history.py @@ -81,17 +81,26 @@ def snapshot_from_checks( # ── Metrics file I/O ──────────────────────────────────────────────── -def write_metrics_file(metrics: Dict[str, float], path: Path) -> None: +def write_metrics_file(metrics: Dict[str, Any], path: Path) -> None: """Write the latest metrics to a simple YAML file. This file is used by ``markitect infospace viability`` for quick - threshold checking. + threshold checking. Non-numeric values (e.g. ``type_distribution``) + are passed through unchanged; floats are rounded to 6 dp; ints are + preserved as ints so external consumers don't see ``29`` silently + become ``29.0`` on every round-trip. """ + def _normalize(v: Any) -> Any: + if isinstance(v, bool): + return v + if isinstance(v, float): + return round(v, 6) + return v + path.parent.mkdir(parents=True, exist_ok=True) path.write_text( yaml.safe_dump( - {k: round(v, 6) if isinstance(v, float) else v - for k, v in sorted(metrics.items())}, + {k: _normalize(v) for k, v in sorted(metrics.items())}, default_flow_style=False, sort_keys=True, ), @@ -99,14 +108,20 @@ def write_metrics_file(metrics: Dict[str, float], path: Path) -> None: ) -def read_metrics_file(path: Path) -> Dict[str, float]: - """Read the latest metrics from a YAML file.""" +def read_metrics_file(path: Path) -> Dict[str, Any]: + """Read the latest metrics from a YAML file. + + Returns all keys as written on disk, preserving types verbatim so a + round-trip via :func:`write_metrics_file` does not silently drop + structured values (e.g. ``type_distribution``) or flatten ints to + floats. + """ if not path.is_file(): return {} raw = yaml.safe_load(path.read_text(encoding="utf-8")) if not isinstance(raw, dict): return {} - return {k: float(v) for k, v in raw.items() if isinstance(v, (int, float))} + return raw # ── History operations ─────────────────────────────────────────────── diff --git a/markitect/llm/gemini.py b/markitect/llm/gemini.py index f738828e..2a0268b9 100644 --- a/markitect/llm/gemini.py +++ b/markitect/llm/gemini.py @@ -9,7 +9,11 @@ from markitect.llm.adapter import LLMAdapter from markitect.llm.models import RunConfig, LLMResponse from markitect.llm.config import resolve_api_key, find_project_root from markitect.llm._http import post_json -from markitect.llm.exceptions import LLMConfigurationError +from markitect.llm.exceptions import ( + LLMConfigurationError, + LLMAPIError, + LLMRateLimitError, +) _DEFAULT_MODEL = "gemini-2.5-flash" _API_BASE = "https://generativelanguage.googleapis.com/v1beta" @@ -26,10 +30,12 @@ class GeminiAdapter(LLMAdapter): model: Optional[str] = None, api_key: Optional[str] = None, system_prompt: Optional[str] = None, + max_retries: int = 3, **_kwargs: Any, ): self._model = model or _DEFAULT_MODEL self._system_prompt = system_prompt + self._max_retries = max_retries root = find_project_root() key_file_paths = [root / "apikey-geminifree.txt"] if root else [] @@ -77,7 +83,7 @@ class GeminiAdapter(LLMAdapter): url = f"{_API_BASE}/models/{model}:generateContent?key={self._api_key}" start = time.time() - data = post_json(url, payload, timeout=config.timeout_seconds) + data = self._post_with_retries(url, payload, timeout=config.timeout_seconds) latency = time.time() - start # Parse Gemini response @@ -113,3 +119,27 @@ class GeminiAdapter(LLMAdapter): if not (0.0 <= config.temperature <= 2.0): return False return True + + # ── Internals ─────────────────────────────────────────────────── + + def _post_with_retries( + self, + url: str, + payload: Dict[str, Any], + timeout: int, + ) -> Dict[str, Any]: + last_exc: Optional[Exception] = None + for attempt in range(self._max_retries + 1): + try: + return post_json(url, payload, timeout=timeout) + except LLMRateLimitError as exc: + last_exc = exc + if attempt < self._max_retries: + time.sleep(2 ** attempt) + except LLMAPIError as exc: + if exc.status_code in (502, 503, 504) and attempt < self._max_retries: + last_exc = exc + time.sleep(2 ** attempt) + else: + raise + raise last_exc # type: ignore[misc] diff --git a/tests/unit/infospace/test_history.py b/tests/unit/infospace/test_history.py index 3fe466c3..8b46d7c1 100644 --- a/tests/unit/infospace/test_history.py +++ b/tests/unit/infospace/test_history.py @@ -124,6 +124,33 @@ class TestMetricsFileIO: path.write_text("just a string", encoding="utf-8") assert read_metrics_file(path) == {} + def test_round_trip_preserves_structured_values(self, tmp_path): + """Non-numeric values like type_distribution must survive a round-trip. + + Regression: eval-summary --update-metrics used to drop any key + whose value wasn't a bare number, silently erasing type_distribution + from the file on every run. + """ + path = tmp_path / "metrics.yaml" + metrics = { + "per_entity_mean": 3.9567, + "vsm_type_matrix_cells": 29, + "type_distribution": { + "Element": 315, + "Institution": 122, + "Principle": 102, + }, + } + write_metrics_file(metrics, path) + loaded = read_metrics_file(path) + assert loaded["type_distribution"] == { + "Element": 315, "Institution": 122, "Principle": 102, + } + # And the int stayed an int on disk, not 29.0. + raw = path.read_text(encoding="utf-8") + assert "vsm_type_matrix_cells: 29\n" in raw + assert "vsm_type_matrix_cells: 29.0" not in raw + # ── record_check_results ──────────────────────────────────────────── diff --git a/tests/unit/llm/test_gemini.py b/tests/unit/llm/test_gemini.py new file mode 100644 index 00000000..b319b0ba --- /dev/null +++ b/tests/unit/llm/test_gemini.py @@ -0,0 +1,82 @@ +"""Tests for markitect.llm.gemini — retry behavior + happy path.""" + +from unittest import mock + +import pytest + +from markitect.llm.gemini import GeminiAdapter +from markitect.llm.exceptions import LLMAPIError, LLMRateLimitError +from markitect.prompts.execution.models import RunConfig, LLMResponse + + +def _api_response(text="hello", model="gemini-2.5-flash"): + return { + "candidates": [ + { + "content": {"parts": [{"text": text}], "role": "model"}, + "finishReason": "STOP", + } + ], + "modelVersion": model, + "usageMetadata": { + "promptTokenCount": 3, + "candidatesTokenCount": 2, + "totalTokenCount": 5, + }, + } + + +class TestGeminiAdapter: + def _adapter(self, **kwargs): + defaults = {"api_key": "AIza-test"} + defaults.update(kwargs) + return GeminiAdapter(**defaults) + + @mock.patch("markitect.llm.gemini.post_json") + def test_success(self, mock_post): + mock_post.return_value = _api_response("generated") + adapter = self._adapter() + resp = adapter.execute_prompt("hi", RunConfig()) + assert isinstance(resp, LLMResponse) + assert resp.content == "generated" + assert resp.metadata["provider"] == "gemini" + + @mock.patch("markitect.llm.gemini.post_json") + @mock.patch("markitect.llm.gemini.time.sleep") + def test_retry_on_429(self, mock_sleep, mock_post): + mock_post.side_effect = [ + LLMRateLimitError("rate limited", status_code=429), + _api_response("recovered"), + ] + adapter = self._adapter(max_retries=2) + resp = adapter.execute_prompt("hi", RunConfig()) + assert resp.content == "recovered" + assert mock_sleep.call_count == 1 + + @mock.patch("markitect.llm.gemini.post_json") + @mock.patch("markitect.llm.gemini.time.sleep") + def test_retry_on_503(self, mock_sleep, mock_post): + mock_post.side_effect = [ + LLMAPIError("unavailable", status_code=503), + _api_response("back"), + ] + adapter = self._adapter(max_retries=2) + resp = adapter.execute_prompt("hi", RunConfig()) + assert resp.content == "back" + + @mock.patch("markitect.llm.gemini.post_json") + def test_no_retry_on_400(self, mock_post): + mock_post.side_effect = LLMAPIError("bad request", status_code=400) + adapter = self._adapter(max_retries=2) + with pytest.raises(LLMAPIError) as exc_info: + adapter.execute_prompt("hi", RunConfig()) + assert exc_info.value.status_code == 400 + + @mock.patch("markitect.llm.gemini.post_json") + @mock.patch("markitect.llm.gemini.time.sleep") + def test_exhausted_retries_raises(self, mock_sleep, mock_post): + mock_post.side_effect = LLMRateLimitError("rate limited", status_code=429) + adapter = self._adapter(max_retries=1) + with pytest.raises(LLMRateLimitError): + adapter.execute_prompt("hi", RunConfig()) + assert mock_sleep.call_count == 1 # 1 retry before giving up