# ── evaluate ─────────────────────────────────────────────────────────


@infospace_commands.command()
@click.option("--config", "config_path", default=None, help="Path to infospace.yaml.")
@click.option("--provider", default="openrouter", help="LLM provider (openrouter, openai, etc.).")
@click.option("--model", default=None, help="LLM model name.")
@click.option("--entity", "entity_slug", default=None, help="Evaluate a single entity by slug.")
@click.option("--chapter", default=None, help="Evaluate entities from a specific chapter.")
def evaluate(
    config_path: Optional[str],
    provider: str,
    model: Optional[str],
    entity_slug: Optional[str],
    chapter: Optional[str],
) -> None:
    """Evaluate entities using LLM-based quality assessment."""
    # NOTE: the docstring above is rendered by click as the command's help
    # text, so it is deliberately kept to the single user-facing sentence.
    #
    # Flow: load config -> parse the entities directory -> apply the optional
    # --entity / --chapter filter -> run the batch evaluation -> print totals.
    # Exits with status 1 when the entities directory is missing or the
    # requested --entity slug does not exist.
    cfg, cfg_path = _load_config_or_exit(config_path)
    root = cfg_path.parent

    entities_dir = root / cfg.entities_dir
    if not entities_dir.is_dir():
        click.echo("Error: No entities directory found.", err=True)
        raise SystemExit(1)

    entity_list = parse_entity_directory(entities_dir)
    if not entity_list:
        click.echo("No entities to evaluate.")
        return

    # Filter. --entity takes precedence: when both options are supplied the
    # --chapter value is ignored.
    if entity_slug:
        entity_list = [e for e in entity_list if e.slug == entity_slug]
        if not entity_list:
            click.echo(f"Error: Entity '{entity_slug}' not found.", err=True)
            raise SystemExit(1)
    elif chapter:
        # Substring match. ``source_chapter`` is treated as optional by the
        # prompt builder, so guard against None here instead of letting
        # ``in`` raise TypeError for entities without a chapter.
        entity_list = [
            e for e in entity_list if chapter in (e.source_chapter or "")
        ]
        if not entity_list:
            click.echo(f"No entities found for chapter '{chapter}'.")
            return

    # Create adapter. These imports are function-local in the original as
    # well (presumably to keep CLI start-up light -- confirm before hoisting).
    from markitect.llm import create_adapter
    from markitect.prompts.execution.models import RunConfig

    adapter = create_adapter(provider, model=model)
    run_config = RunConfig(model_name=model or "default", temperature=0.3, max_tokens=2000)

    # Progress callback: one line per finished entity.
    def on_progress(done, total, result):
        status = result.status.upper()
        click.echo(f"  [{done}/{total}] {result.key}: {status}")

    click.echo(f"Evaluating {len(entity_list)} entities via {provider}...")

    from markitect.infospace.evaluate import run_entity_evaluation

    output_dir = root / cfg.evaluations_dir
    summary = run_entity_evaluation(
        config=cfg,
        entities=entity_list,
        adapter=adapter,
        run_config=run_config,
        output_dir=output_dir,
        progress_callback=on_progress,
    )

    click.echo(f"\nDone: {summary.succeeded} succeeded, {summary.failed} failed, {summary.skipped} skipped")
    if summary.total_tokens > 0:
        click.echo(f"Tokens used: {summary.total_tokens}")
"""
Per-entity evaluation pipeline.

Builds prompts from entity metadata and delegates LLM evaluation to
the :class:`BatchEvaluator`. Writes structured results to the
evaluations directory.
"""

from __future__ import annotations

import hashlib
from datetime import datetime
from pathlib import Path
from typing import Callable, Dict, List, Optional

from markitect.infospace.config import InfospaceConfig
from markitect.infospace.evaluation import EntityEvaluation, ScoreEntry
from markitect.infospace.evaluation_io import write_entity_evaluation
from markitect.infospace.models import EntityMeta
from markitect.prompts.execution.batch import BatchEvaluator, BatchItem, BatchSummary
from markitect.prompts.execution.llm_adapter import LLMAdapter
from markitect.prompts.execution.models import RunConfig


# Dimension names used when a caller passes no (or an empty) ``dimensions``
# list to the prompt builder.
_DEFAULT_DIMENSIONS = [
    "definition_precision",
    "source_grounding",
    "domain_relevance",
    "discipline_alignment",
    "conceptual_clarity",
]

# Prompt for evaluating one entity, rendered with ``str.format``.
# Placeholders: topic, title, slug, domain, source_chapter, definition,
# context, dimensions_list.  The "Output format" section asks for
# DIMENSION / SCORE / RATIONALE lines, which is the layout the response
# parser in this module scans for.  A trailing backslash inside the literal
# joins the following line so no newline is emitted at that point.
_PROMPT_TEMPLATE = """\
You are evaluating an entity from an infospace about "{topic}".

## Entity: {title}

**Slug:** {slug}
**Domain:** {domain}
**Source chapter:** {source_chapter}

### Definition
{definition}

### Context
{context}

## Instructions

Rate this entity on each dimension below using a scale of 1-5 \
(1 = poor, 5 = excellent). For each dimension, provide:
1. A numeric score (1-5)
2. A brief rationale (1-2 sentences)

### Dimensions to evaluate:
{dimensions_list}

## Output format

Return your evaluation as a structured list:

DIMENSION:
SCORE: <1-5>
RATIONALE:

Repeat for each dimension.
"""
# Inclusive bounds of the 1-5 rating scale requested by the prompt.
_MIN_SCORE = 1.0
_MAX_SCORE = 5.0

# One ``LABEL: value`` line of an LLM response.  Leading list / heading /
# emphasis markers and ``**`` wrapped around the label are tolerated because
# models frequently decorate the requested plain-text layout with Markdown.
_FIELD_PATTERN = (
    r"^[\s#>*\-]*(DIMENSION|SCORE|RATIONALE)\s*\*{0,2}\s*:\*{0,2}\s*(.*)$"
)


def build_evaluation_prompt(
    entity: "EntityMeta",
    topic: str,
    dimensions: Optional[List[str]] = None,
) -> str:
    """Build an evaluation prompt for a single entity.

    Args:
        entity: Entity whose title, slug, domain, source chapter, definition
            and context are interpolated into the prompt.
        topic: Name of the infospace topic.
        dimensions: Dimension names to list in the prompt.  ``None`` or an
            empty list selects ``_DEFAULT_DIMENSIONS``.

    Returns:
        The rendered prompt text.
    """
    dims = dimensions or _DEFAULT_DIMENSIONS
    dims_list = "\n".join(f"- {d}" for d in dims)
    # Falsy optional fields are replaced by explicit placeholders so the
    # model is told the information is missing rather than shown a blank.
    return _PROMPT_TEMPLATE.format(
        topic=topic,
        title=entity.title,
        slug=entity.slug,
        domain=entity.domain or "(unspecified)",
        source_chapter=entity.source_chapter or "(unspecified)",
        definition=entity.definition or "(no definition)",
        context=entity.context or "(no context)",
        dimensions_list=dims_list,
    )


def content_digest(entity: "EntityMeta") -> str:
    """Compute a content digest for incremental evaluation.

    The digest covers every entity field that is interpolated into the
    evaluation prompt (slug, title, definition, context, domain and source
    chapter), so editing any of them invalidates a previously stored digest.

    Returns:
        The first 16 hex characters of the SHA-256 of the joined fields.
    """
    fields = (
        entity.slug,
        entity.title,
        entity.definition,
        entity.context,
        entity.domain,
        entity.source_chapter,
    )
    # Join on the ASCII unit separator rather than a printable character so
    # that text moving between adjacent fields (definition "a:b" + context
    # "c" versus definition "a" + context "b:c") cannot collide.
    payload = "\x1f".join("" if value is None else str(value) for value in fields)
    return hashlib.sha256(payload.encode("utf-8")).hexdigest()[:16]


def _dimension_key(name: str) -> str:
    """Return a case-, space-, hyphen- and underscore-insensitive key."""
    return "_".join(name.replace("-", " ").replace("_", " ").lower().split())


def _parse_score(raw: str) -> Optional[float]:
    """Parse a SCORE value, returning ``None`` when it is unusable.

    Accepts plain numbers as well as ``4/5``, ``4 out of 5``, ``<4>`` and
    ``**4**``.  Values outside the 1-5 scale (including ``nan`` / ``inf``)
    are rejected, because entries are stored against a fixed maximum of 5.
    """
    import re

    # Keep only the numerator of "N/M" and "N out of M" forms.
    token = re.split(r"/|\bout of\b", raw, maxsplit=1, flags=re.IGNORECASE)[0]
    token = token.strip().strip("*`<>[]() \t")
    try:
        value = float(token)
    except ValueError:
        return None
    # NaN fails both comparisons, so it is rejected here as well.
    if not _MIN_SCORE <= value <= _MAX_SCORE:
        return None
    return value


def _append_block(blocks: list, name, score, rationale_parts: List[str]) -> None:
    """Append ``(name, score, rationale)`` if both name and score are set."""
    if name is not None and score is not None:
        blocks.append((name, score, " ".join(rationale_parts).strip()))


def extract_score_blocks(
    response_text: str,
    dimensions: Optional[List[str]] = None,
) -> List[tuple]:
    """Extract ``(dimension, score, rationale)`` tuples from response text.

    This is the pure-text half of :func:`parse_evaluation_response`.

    Args:
        response_text: Raw LLM output containing DIMENSION / SCORE /
            RATIONALE lines (label matching is case-insensitive).
        dimensions: Known dimension names; ``None`` or empty selects
            ``_DEFAULT_DIMENSIONS``.  A parsed name that differs from a
            known one only by case, spaces, hyphens or underscores is mapped
            to the known spelling; any other name is kept as written.

    Returns:
        One tuple per dimension that has a valid score, in response order.
        Dimensions with a missing, malformed or out-of-range score are
        skipped.
    """
    import re

    dims = dimensions or _DEFAULT_DIMENSIONS
    known = {_dimension_key(d): d for d in dims}

    blocks: List[tuple] = []
    name = None
    score = None
    rationale_parts: List[str] = []
    in_rationale = False

    for raw_line in response_text.splitlines():
        line = raw_line.strip()
        field = re.match(_FIELD_PATTERN, line, re.IGNORECASE)

        if field is None:
            # Free text continues the rationale, either after an explicit
            # RATIONALE line (regardless of whether SCORE came first) or
            # once both dimension and score are known.
            collecting = in_rationale or (name is not None and score is not None)
            if line and collecting:
                rationale_parts.append(line)
            continue

        label = field.group(1).upper()
        value = field.group(2).strip()

        if label == "DIMENSION":
            # A new dimension closes the previous one.
            _append_block(blocks, name, score, rationale_parts)
            written = value.strip("*` \t")
            name = known.get(_dimension_key(written), written)
            score = None
            rationale_parts = []
            in_rationale = False
        elif label == "SCORE":
            score = _parse_score(value)
        else:  # RATIONALE: restart the rationale text from this line.
            rationale_parts = [value]
            in_rationale = True

    _append_block(blocks, name, score, rationale_parts)
    return blocks


def parse_evaluation_response(
    response_text: str,
    dimensions: Optional[List[str]] = None,
) -> List["ScoreEntry"]:
    """Parse structured dimension scores from LLM response text.

    Expects blocks of::

        DIMENSION:
        SCORE: <1-5>
        RATIONALE:

    Markdown decoration around the labels and ``N/5`` style scores are
    tolerated; see :func:`extract_score_blocks` for the exact rules.

    Returns:
        One :class:`ScoreEntry` (``max_value`` 5.0) per dimension with a
        valid score; an empty list when nothing could be parsed.
    """
    return [
        ScoreEntry(
            name=name,
            value=value,
            max_value=_MAX_SCORE,
            rationale=rationale,
        )
        for name, value, rationale in extract_score_blocks(response_text, dimensions)
    ]


def run_entity_evaluation(
    config: "InfospaceConfig",
    entities: List["EntityMeta"],
    adapter: "LLMAdapter",
    run_config: Optional["RunConfig"] = None,
    output_dir: Optional[Path] = None,
    previous_digests: Optional[Dict[str, str]] = None,
    progress_callback: Optional[Callable] = None,
    dimensions: Optional[List[str]] = None,
) -> "BatchSummary":
    """Run per-entity evaluation using the batch evaluator.

    Args:
        config: The infospace configuration.
        entities: Entities to evaluate.
        adapter: LLM adapter for evaluation.
        run_config: LLM execution configuration.
        output_dir: Where to write evaluation results.  Defaults to
            ``config.evaluations_dir`` relative to CWD.
        previous_digests: ``{slug: digest}`` for incremental skip.
        progress_callback: Called after each item.
        dimensions: Custom evaluation dimensions.

    Returns:
        A :class:`BatchSummary` with per-entity results.  An evaluation file
        ``<slug>.md`` is written for every successful result from which at
        least one score could be parsed.
    """
    import logging
    from datetime import timezone

    log = logging.getLogger(__name__)

    topic = config.topic.name
    items = [
        BatchItem(
            key=entity.slug,
            prompt=build_evaluation_prompt(entity, topic, dimensions),
            content_digest=content_digest(entity),
            metadata={"source_path": entity.source_path},
        )
        for entity in entities
    ]

    evaluator = BatchEvaluator(
        adapter=adapter,
        config=run_config,
        progress_callback=progress_callback,
        previous_digests=previous_digests,
    )
    summary = evaluator.evaluate(items)

    # Write successful results
    evaluations_path = output_dir or Path(config.evaluations_dir)
    evaluator_name = run_config.model_name if run_config else "unknown"

    for result in summary.results:
        if result.status != "success" or result.response is None:
            continue

        scores = parse_evaluation_response(result.response.content, dimensions)
        if not scores:
            # Do not persist an evaluation without any scores: it carries no
            # information and could replace an earlier, valid file.
            log.warning(
                "No scores could be parsed from the response for entity %r; "
                "evaluation file not written",
                result.key,
            )
            continue

        evaluation = EntityEvaluation(
            entity_slug=result.key,
            evaluator=evaluator_name,
            scores=scores,
            # Naive UTC timestamp, identical in value to the deprecated
            # ``datetime.utcnow()``.
            evaluated_at=datetime.now(timezone.utc).replace(tzinfo=None),
        )
        eval_path = evaluations_path / f"{result.key}.md"
        write_entity_evaluation(evaluation, eval_path)

    return summary
"""Tests for markitect.infospace.evaluate."""

from datetime import datetime
from pathlib import Path

import pytest

from markitect.infospace.config import InfospaceConfig, TopicConfig
from markitect.infospace.evaluate import (
    build_evaluation_prompt,
    content_digest,
    parse_evaluation_response,
    run_entity_evaluation,
)
from markitect.infospace.evaluation import ScoreEntry
from markitect.infospace.models import EntityMeta
from markitect.prompts.execution.llm_adapter import MockLLMAdapter
from markitect.prompts.execution.models import RunConfig


# ── Helpers ──────────────────────────────────────────────────────────


def _entity(**overrides) -> EntityMeta:
    """Return a fully populated sample entity; keyword arguments override fields."""
    fields = {
        "slug": "division-of-labour",
        "title": "Division Of Labour",
        "h1_raw": "Division Of Labour",
        "definition": "Splitting work into specialised tasks.",
        "source_chapter": "Book I Chapter 1",
        "context": "Smith introduces the concept early.",
        "domain": "Production",
        "source_path": "entities/division-of-labour.md",
    }
    return EntityMeta(**{**fields, **overrides})


def _config() -> InfospaceConfig:
    """Return a minimal configuration carrying only a topic name."""
    topic = TopicConfig(name="The Wealth of Nations")
    return InfospaceConfig(topic=topic)


# Canned LLM reply covering three of the default dimensions.
_MOCK_RESPONSE = """\
DIMENSION: definition_precision
SCORE: 4.5
RATIONALE: Clear and specific definition of the concept.

DIMENSION: source_grounding
SCORE: 4.0
RATIONALE: Well grounded in Smith's text.

DIMENSION: domain_relevance
SCORE: 5.0
RATIONALE: Directly relevant to production economics.
"""


# ── build_evaluation_prompt ──────────────────────────────────────────


class TestBuildPrompt:
    """The prompt must surface entity fields, topic and dimensions."""

    def test_contains_entity_fields(self):
        prompt = build_evaluation_prompt(_entity(), "Test Topic")
        expected_fragments = (
            "division-of-labour",
            "Division Of Labour",
            "Production",
            "Splitting work",
        )
        for fragment in expected_fragments:
            assert fragment in prompt

    def test_contains_topic(self):
        assert "WoN" in build_evaluation_prompt(_entity(), "WoN")

    def test_contains_dimensions(self):
        prompt = build_evaluation_prompt(_entity(), "T")
        for dimension in ("definition_precision", "source_grounding"):
            assert dimension in prompt

    def test_custom_dimensions(self):
        custom = ["novelty", "coherence"]
        prompt = build_evaluation_prompt(_entity(), "T", dimensions=custom)
        for dimension in custom:
            assert dimension in prompt
        assert "definition_precision" not in prompt

    def test_handles_missing_fields(self):
        blank = _entity(definition="", context="", domain="")
        prompt = build_evaluation_prompt(blank, "T")
        for placeholder in ("(no definition)", "(no context)", "(unspecified)"):
            assert placeholder in prompt


# ── content_digest ───────────────────────────────────────────────────


class TestContentDigest:
    """The digest is stable for equal content and sensitive to edits."""

    def test_deterministic(self):
        entity = _entity()
        first = content_digest(entity)
        second = content_digest(entity)
        assert first == second

    def test_changes_with_content(self):
        digest_a = content_digest(_entity(definition="A"))
        digest_b = content_digest(_entity(definition="B"))
        assert digest_a != digest_b


# ── parse_evaluation_response ────────────────────────────────────────


class TestParseResponse:
    """Parsing of DIMENSION / SCORE / RATIONALE blocks."""

    def test_parses_three_dimensions(self):
        parsed = parse_evaluation_response(_MOCK_RESPONSE)
        assert len(parsed) == 3

    def test_correct_names(self):
        parsed_names = [
            entry.name for entry in parse_evaluation_response(_MOCK_RESPONSE)
        ]
        for expected in (
            "definition_precision",
            "source_grounding",
            "domain_relevance",
        ):
            assert expected in parsed_names

    def test_correct_scores(self):
        values = {
            entry.name: entry.value
            for entry in parse_evaluation_response(_MOCK_RESPONSE)
        }
        expected_values = {
            "definition_precision": 4.5,
            "source_grounding": 4.0,
            "domain_relevance": 5.0,
        }
        for name, expected in expected_values.items():
            assert values[name] == expected

    def test_correct_rationales(self):
        rationales = {
            entry.name: entry.rationale
            for entry in parse_evaluation_response(_MOCK_RESPONSE)
        }
        assert "Clear" in rationales["definition_precision"]

    def test_empty_response(self):
        assert parse_evaluation_response("") == []

    def test_malformed_score_skipped(self):
        lines = ["DIMENSION: x", "SCORE: not-a-number", "RATIONALE: oops"]
        parsed = parse_evaluation_response("\n".join(lines))
        assert len(parsed) == 0


# ── run_entity_evaluation ────────────────────────────────────────────


class TestRunEntityEvaluation:
    """End-to-end runs against the mock adapter."""

    def test_evaluates_entities(self, tmp_path):
        mock = MockLLMAdapter(_MOCK_RESPONSE)
        pair = [_entity(), _entity(slug="pin-factory", title="Pin Factory")]

        summary = run_entity_evaluation(
            config=_config(),
            entities=pair,
            adapter=mock,
            output_dir=tmp_path / "evals",
        )

        assert summary.total == 2
        assert summary.succeeded == 2
        assert mock.call_count == 2

    def test_writes_evaluation_files(self, tmp_path):
        target_dir = tmp_path / "evals"

        run_entity_evaluation(
            config=_config(),
            entities=[_entity()],
            adapter=MockLLMAdapter(_MOCK_RESPONSE),
            output_dir=target_dir,
        )

        written = target_dir / "division-of-labour.md"
        assert written.exists()
        assert "definition_precision" in written.read_text()

    def test_incremental_skip(self, tmp_path):
        mock = MockLLMAdapter(_MOCK_RESPONSE)
        unchanged = _entity()
        known_digests = {unchanged.slug: content_digest(unchanged)}

        summary = run_entity_evaluation(
            config=_config(),
            entities=[unchanged],
            adapter=mock,
            output_dir=tmp_path,
            previous_digests=known_digests,
        )

        assert summary.skipped == 1
        assert mock.call_count == 0

    def test_progress_callback_called(self, tmp_path):
        seen = []

        def record(done, total, result):
            seen.append((done, total, result.key))

        run_entity_evaluation(
            config=_config(),
            entities=[_entity()],
            adapter=MockLLMAdapter(_MOCK_RESPONSE),
            output_dir=tmp_path,
            progress_callback=record,
        )

        assert len(seen) == 1
        assert seen[0] == (1, 1, "division-of-labour")

    def test_passes_run_config(self, tmp_path):
        mock = MockLLMAdapter(_MOCK_RESPONSE)
        settings = RunConfig(temperature=0.1, max_tokens=500)

        run_entity_evaluation(
            config=_config(),
            entities=[_entity()],
            adapter=mock,
            run_config=settings,
            output_dir=tmp_path,
        )

        assert mock.last_config.temperature == 0.1