feat(infospace): add per-entity evaluation pipeline and CLI command (S2.3)

Evaluation pipeline builds prompts from entity metadata, delegates to BatchEvaluator, parses structured LLM responses into ScoreEntry objects, and writes evaluation files.

CLI: 'markitect infospace evaluate' with --config, --provider and --model options plus --entity and --chapter filters.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-19 01:48:34 +01:00
parent 3726503adb
commit 3461d2f354
3 changed files with 504 additions and 0 deletions
--- a/markitect/infospace/cli.py
+++ b/markitect/infospace/cli.py
@@ -153,6 +153,71 @@ def entities(config_path: Optional[str], sort_key: str):
    click.echo(f"\nTotal: {len(entity_list)} entities")


+# ── evaluate ─────────────────────────────────────────────────────────
+
+
+@infospace_commands.command()
+@click.option("--config", "config_path", default=None, help="Path to infospace.yaml.")
+@click.option("--provider", default="openrouter", help="LLM provider (openrouter, openai, etc.).")
+@click.option("--model", default=None, help="LLM model name.")
+@click.option("--entity", "entity_slug", default=None, help="Evaluate a single entity by slug.")
+@click.option("--chapter", default=None, help="Evaluate entities from a specific chapter.")
+def evaluate(config_path, provider, model, entity_slug, chapter):
+    """Evaluate entities using LLM-based quality assessment."""
+    cfg, cfg_path = _load_config_or_exit(config_path)
+    root = cfg_path.parent
+
+    entities_dir = root / cfg.entities_dir
+    if not entities_dir.is_dir():
+        click.echo("Error: No entities directory found.", err=True)
+        raise SystemExit(1)
+
+    entity_list = parse_entity_directory(entities_dir)
+    if not entity_list:
+        click.echo("No entities to evaluate.")
+        return
+
+    # Filter
+    if entity_slug:
+        entity_list = [e for e in entity_list if e.slug == entity_slug]
+        if not entity_list:
+            click.echo(f"Error: Entity '{entity_slug}' not found.", err=True)
+            raise SystemExit(1)
+    elif chapter:
+        entity_list = [e for e in entity_list if chapter in e.source_chapter]
+        if not entity_list:
+            click.echo(f"No entities found for chapter '{chapter}'.")
+            return
+
+    # Create adapter
+    from markitect.llm import create_adapter
+    from markitect.prompts.execution.models import RunConfig
+    adapter = create_adapter(provider, model=model)
+    run_config = RunConfig(model_name=model or "default", temperature=0.3, max_tokens=2000)
+
+    # Progress callback
+    def on_progress(done, total, result):
+        status = result.status.upper()
+        click.echo(f"  [{done}/{total}] {result.key}: {status}")
+
+    click.echo(f"Evaluating {len(entity_list)} entities via {provider}...")
+
+    from markitect.infospace.evaluate import run_entity_evaluation
+    output_dir = root / cfg.evaluations_dir
+    summary = run_entity_evaluation(
+        config=cfg,
+        entities=entity_list,
+        adapter=adapter,
+        run_config=run_config,
+        output_dir=output_dir,
+        progress_callback=on_progress,
+    )
+
+    click.echo(f"\nDone: {summary.succeeded} succeeded, {summary.failed} failed, {summary.skipped} skipped")
+    if summary.total_tokens > 0:
+        click.echo(f"Tokens used: {summary.total_tokens}")
+
+
 # ── viability ────────────────────────────────────────────────────────


--- a/markitect/infospace/evaluate.py
+++ b/markitect/infospace/evaluate.py
@@ -0,0 +1,215 @@
+"""
+Per-entity evaluation pipeline.
+
+Builds prompts from entity metadata and delegates LLM evaluation to
+the :class:`BatchEvaluator`.  Writes structured results to the
+evaluations directory.
+"""
+
+from __future__ import annotations
+
+import hashlib
+from datetime import datetime
+from pathlib import Path
+from typing import Callable, Dict, List, Optional
+
+from markitect.infospace.config import InfospaceConfig
+from markitect.infospace.evaluation import EntityEvaluation, ScoreEntry
+from markitect.infospace.evaluation_io import write_entity_evaluation
+from markitect.infospace.models import EntityMeta
+from markitect.prompts.execution.batch import BatchEvaluator, BatchItem, BatchSummary
+from markitect.prompts.execution.llm_adapter import LLMAdapter
+from markitect.prompts.execution.models import RunConfig
+
+
+# Dimensions scored when the caller does not supply its own list.
+_DEFAULT_DIMENSIONS = [
+    "definition_precision",
+    "source_grounding",
+    "domain_relevance",
+    "discipline_alignment",
+    "conceptual_clarity",
+]
+
+# Prompt sent for each entity.  The {placeholders} are filled with
+# str.format by build_evaluation_prompt; the "Output format" section
+# describes the DIMENSION / SCORE / RATIONALE layout that
+# parse_evaluation_response reads back.
+_PROMPT_TEMPLATE = """\
+You are evaluating an entity from an infospace about "{topic}".
+
+## Entity: {title}
+
+**Slug:** {slug}
+**Domain:** {domain}
+**Source chapter:** {source_chapter}
+
+### Definition
+{definition}
+
+### Context
+{context}
+
+## Instructions
+
+Rate this entity on each dimension below using a scale of 1-5 \
+(1 = poor, 5 = excellent). For each dimension, provide:
+1. A numeric score (1-5)
+2. A brief rationale (1-2 sentences)
+
+### Dimensions to evaluate:
+{dimensions_list}
+
+## Output format
+
+Return your evaluation as a structured list:
+
+DIMENSION: <name>
+SCORE: <1-5>
+RATIONALE: <explanation>
+
+Repeat for each dimension.
+"""
+
+
+def build_evaluation_prompt(
+    entity: EntityMeta,
+    topic: str,
+    dimensions: Optional[List[str]] = None,
+) -> str:
+    """Build an evaluation prompt for a single entity."""
+    dims = dimensions or _DEFAULT_DIMENSIONS
+    dims_list = "\n".join(f"- {d}" for d in dims)
+    return _PROMPT_TEMPLATE.format(
+        topic=topic,
+        title=entity.title,
+        slug=entity.slug,
+        domain=entity.domain or "(unspecified)",
+        source_chapter=entity.source_chapter or "(unspecified)",
+        definition=entity.definition or "(no definition)",
+        context=entity.context or "(no context)",
+        dimensions_list=dims_list,
+    )
+
+
+def content_digest(entity: EntityMeta) -> str:
+    """Compute a content digest for incremental evaluation."""
+    content = f"{entity.slug}:{entity.definition}:{entity.context}:{entity.domain}"
+    return hashlib.sha256(content.encode()).hexdigest()[:16]
+
+
+def parse_evaluation_response(
+    response_text: str,
+    dimensions: Optional[List[str]] = None,
+) -> List[ScoreEntry]:
+    """Parse structured dimension scores from LLM response text.
+
+    Expects blocks of::
+
+        DIMENSION: <name>
+        SCORE: <1-5>
+        RATIONALE: <text>
+    """
+    dims = dimensions or _DEFAULT_DIMENSIONS
+    scores: List[ScoreEntry] = []
+    current_dim = None
+    current_score = None
+    current_rationale = ""
+
+    for line in response_text.splitlines():
+        stripped = line.strip()
+        if stripped.upper().startswith("DIMENSION:"):
+            # Flush previous
+            if current_dim is not None and current_score is not None:
+                scores.append(ScoreEntry(
+                    name=current_dim,
+                    value=current_score,
+                    max_value=5.0,
+                    rationale=current_rationale.strip(),
+                ))
+            current_dim = stripped.split(":", 1)[1].strip()
+            current_score = None
+            current_rationale = ""
+        elif stripped.upper().startswith("SCORE:"):
+            try:
+                current_score = float(stripped.split(":", 1)[1].strip())
+            except ValueError:
+                current_score = None
+        elif stripped.upper().startswith("RATIONALE:"):
+            current_rationale = stripped.split(":", 1)[1].strip()
+        elif current_dim is not None and current_score is not None:
+            # Continuation of rationale
+            if stripped:
+                current_rationale += " " + stripped
+
+    # Flush last
+    if current_dim is not None and current_score is not None:
+        scores.append(ScoreEntry(
+            name=current_dim,
+            value=current_score,
+            max_value=5.0,
+            rationale=current_rationale.strip(),
+        ))
+
+    return scores
+
+
+def run_entity_evaluation(
+    config: InfospaceConfig,
+    entities: List[EntityMeta],
+    adapter: LLMAdapter,
+    run_config: Optional[RunConfig] = None,
+    output_dir: Optional[Path] = None,
+    previous_digests: Optional[Dict[str, str]] = None,
+    progress_callback: Optional[Callable] = None,
+    dimensions: Optional[List[str]] = None,
+) -> BatchSummary:
+    """Run per-entity evaluation using the batch evaluator.
+
+    Args:
+        config: The infospace configuration.
+        entities: Entities to evaluate.
+        adapter: LLM adapter for evaluation.
+        run_config: LLM execution configuration.
+        output_dir: Where to write evaluation results.  Defaults to
+            ``config.evaluations_dir`` relative to CWD.
+        previous_digests: ``{slug: digest}`` for incremental skip.
+        progress_callback: Called after each item.
+        dimensions: Custom evaluation dimensions.
+
+    Returns:
+        A :class:`BatchSummary` with per-entity results.
+    """
+    topic = config.topic.name
+    items = [
+        BatchItem(
+            key=entity.slug,
+            prompt=build_evaluation_prompt(entity, topic, dimensions),
+            content_digest=content_digest(entity),
+            metadata={"source_path": entity.source_path},
+        )
+        for entity in entities
+    ]
+
+    evaluator = BatchEvaluator(
+        adapter=adapter,
+        config=run_config,
+        progress_callback=progress_callback,
+        previous_digests=previous_digests,
+    )
+    summary = evaluator.evaluate(items)
+
+    # Write successful results
+    evaluations_path = output_dir or Path(config.evaluations_dir)
+    evaluator_name = (run_config.model_name if run_config else "unknown")
+
+    for result in summary.results:
+        if result.status != "success" or result.response is None:
+            continue
+
+        scores = parse_evaluation_response(result.response.content, dimensions)
+        evaluation = EntityEvaluation(
+            entity_slug=result.key,
+            evaluator=evaluator_name,
+            scores=scores,
+            evaluated_at=datetime.utcnow(),
+        )
+        eval_path = evaluations_path / f"{result.key}.md"
+        write_entity_evaluation(evaluation, eval_path)
+
+    return summary
--- a/tests/unit/infospace/test_evaluate.py
+++ b/tests/unit/infospace/test_evaluate.py
@@ -0,0 +1,224 @@
+"""Tests for markitect.infospace.evaluate."""
+
+from datetime import datetime
+from pathlib import Path
+
+import pytest
+
+from markitect.infospace.config import InfospaceConfig, TopicConfig
+from markitect.infospace.evaluate import (
+    build_evaluation_prompt,
+    content_digest,
+    parse_evaluation_response,
+    run_entity_evaluation,
+)
+from markitect.infospace.evaluation import ScoreEntry
+from markitect.infospace.models import EntityMeta
+from markitect.prompts.execution.llm_adapter import MockLLMAdapter
+from markitect.prompts.execution.models import RunConfig
+
+
+# ── Helpers ──────────────────────────────────────────────────────────
+
+
+def _entity(**overrides) -> EntityMeta:
+    defaults = dict(
+        slug="division-of-labour",
+        title="Division Of Labour",
+        h1_raw="Division Of Labour",
+        definition="Splitting work into specialised tasks.",
+        source_chapter="Book I Chapter 1",
+        context="Smith introduces the concept early.",
+        domain="Production",
+        source_path="entities/division-of-labour.md",
+    )
+    defaults.update(overrides)
+    return EntityMeta(**defaults)
+
+
+def _config() -> InfospaceConfig:
+    return InfospaceConfig(topic=TopicConfig(name="The Wealth of Nations"))
+
+
+# Canned LLM reply in the DIMENSION / SCORE / RATIONALE layout, covering
+# three dimensions; fed to the parser directly and to MockLLMAdapter.
+_MOCK_RESPONSE = """\
+DIMENSION: definition_precision
+SCORE: 4.5
+RATIONALE: Clear and specific definition of the concept.
+
+DIMENSION: source_grounding
+SCORE: 4.0
+RATIONALE: Well grounded in Smith's text.
+
+DIMENSION: domain_relevance
+SCORE: 5.0
+RATIONALE: Directly relevant to production economics.
+"""
+
+
+# ── build_evaluation_prompt ──────────────────────────────────────────
+
+
+class TestBuildPrompt:
+    def test_contains_entity_fields(self):
+        entity = _entity()
+        prompt = build_evaluation_prompt(entity, "Test Topic")
+        assert "division-of-labour" in prompt
+        assert "Division Of Labour" in prompt
+        assert "Production" in prompt
+        assert "Splitting work" in prompt
+
+    def test_contains_topic(self):
+        prompt = build_evaluation_prompt(_entity(), "WoN")
+        assert "WoN" in prompt
+
+    def test_contains_dimensions(self):
+        prompt = build_evaluation_prompt(_entity(), "T")
+        assert "definition_precision" in prompt
+        assert "source_grounding" in prompt
+
+    def test_custom_dimensions(self):
+        prompt = build_evaluation_prompt(
+            _entity(), "T", dimensions=["novelty", "coherence"]
+        )
+        assert "novelty" in prompt
+        assert "coherence" in prompt
+        assert "definition_precision" not in prompt
+
+    def test_handles_missing_fields(self):
+        entity = _entity(definition="", context="", domain="")
+        prompt = build_evaluation_prompt(entity, "T")
+        assert "(no definition)" in prompt
+        assert "(no context)" in prompt
+        assert "(unspecified)" in prompt
+
+
+# ── content_digest ───────────────────────────────────────────────────
+
+
+class TestContentDigest:
+    def test_deterministic(self):
+        e = _entity()
+        assert content_digest(e) == content_digest(e)
+
+    def test_changes_with_content(self):
+        e1 = _entity(definition="A")
+        e2 = _entity(definition="B")
+        assert content_digest(e1) != content_digest(e2)
+
+
+# ── parse_evaluation_response ────────────────────────────────────────
+
+
+class TestParseResponse:
+    def test_parses_three_dimensions(self):
+        scores = parse_evaluation_response(_MOCK_RESPONSE)
+        assert len(scores) == 3
+
+    def test_correct_names(self):
+        scores = parse_evaluation_response(_MOCK_RESPONSE)
+        names = [s.name for s in scores]
+        assert "definition_precision" in names
+        assert "source_grounding" in names
+        assert "domain_relevance" in names
+
+    def test_correct_scores(self):
+        scores = parse_evaluation_response(_MOCK_RESPONSE)
+        by_name = {s.name: s for s in scores}
+        assert by_name["definition_precision"].value == 4.5
+        assert by_name["source_grounding"].value == 4.0
+        assert by_name["domain_relevance"].value == 5.0
+
+    def test_correct_rationales(self):
+        scores = parse_evaluation_response(_MOCK_RESPONSE)
+        by_name = {s.name: s for s in scores}
+        assert "Clear" in by_name["definition_precision"].rationale
+
+    def test_empty_response(self):
+        scores = parse_evaluation_response("")
+        assert scores == []
+
+    def test_malformed_score_skipped(self):
+        text = "DIMENSION: x\nSCORE: not-a-number\nRATIONALE: oops"
+        scores = parse_evaluation_response(text)
+        assert len(scores) == 0
+
+
+# ── run_entity_evaluation ────────────────────────────────────────────
+
+
+class TestRunEntityEvaluation:
+    def test_evaluates_entities(self, tmp_path):
+        adapter = MockLLMAdapter(_MOCK_RESPONSE)
+        cfg = _config()
+        entities = [_entity(), _entity(slug="pin-factory", title="Pin Factory")]
+
+        summary = run_entity_evaluation(
+            config=cfg,
+            entities=entities,
+            adapter=adapter,
+            output_dir=tmp_path / "evals",
+        )
+        assert summary.total == 2
+        assert summary.succeeded == 2
+        assert adapter.call_count == 2
+
+    def test_writes_evaluation_files(self, tmp_path):
+        adapter = MockLLMAdapter(_MOCK_RESPONSE)
+        cfg = _config()
+        entities = [_entity()]
+
+        run_entity_evaluation(
+            config=cfg,
+            entities=entities,
+            adapter=adapter,
+            output_dir=tmp_path / "evals",
+        )
+        eval_file = tmp_path / "evals" / "division-of-labour.md"
+        assert eval_file.exists()
+        text = eval_file.read_text()
+        assert "definition_precision" in text
+
+    def test_incremental_skip(self, tmp_path):
+        adapter = MockLLMAdapter(_MOCK_RESPONSE)
+        cfg = _config()
+        entity = _entity()
+        digest = content_digest(entity)
+
+        summary = run_entity_evaluation(
+            config=cfg,
+            entities=[entity],
+            adapter=adapter,
+            output_dir=tmp_path,
+            previous_digests={entity.slug: digest},
+        )
+        assert summary.skipped == 1
+        assert adapter.call_count == 0
+
+    def test_progress_callback_called(self, tmp_path):
+        adapter = MockLLMAdapter(_MOCK_RESPONSE)
+        cfg = _config()
+        calls = []
+
+        run_entity_evaluation(
+            config=cfg,
+            entities=[_entity()],
+            adapter=adapter,
+            output_dir=tmp_path,
+            progress_callback=lambda d, t, r: calls.append((d, t, r.key)),
+        )
+        assert len(calls) == 1
+        assert calls[0] == (1, 1, "division-of-labour")
+
+    def test_passes_run_config(self, tmp_path):
+        adapter = MockLLMAdapter(_MOCK_RESPONSE)
+        cfg = _config()
+        rc = RunConfig(temperature=0.1, max_tokens=500)
+
+        run_entity_evaluation(
+            config=cfg,
+            entities=[_entity()],
+            adapter=adapter,
+            run_config=rc,
+            output_dir=tmp_path,
+        )
+        assert adapter.last_config.temperature == 0.1