feat(infospace): add per-entity evaluation pipeline and CLI command (S2.3)

The evaluation pipeline builds prompts from entity metadata, delegates to BatchEvaluator, parses structured LLM responses into ScoreEntry objects, and writes evaluation files. CLI: 'markitect infospace evaluate' with --provider and --model options and --entity / --chapter filters.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-19 01:48:34 +01:00
parent 3726503adb
commit 3461d2f354
3 changed files with 504 additions and 0 deletions
--- a/markitect/infospace/cli.py
+++ b/markitect/infospace/cli.py
@@ -153,6 +153,71 @@ def entities(config_path: Optional[str], sort_key: str):
    click.echo(f"\nTotal: {len(entity_list)} entities")


+# ── evaluate ─────────────────────────────────────────────────────────
+
+
+@infospace_commands.command()
+@click.option("--config", "config_path", default=None, help="Path to infospace.yaml.")
+@click.option("--provider", default="openrouter", help="LLM provider (openrouter, openai, etc.).")
+@click.option("--model", default=None, help="LLM model name.")
+@click.option("--entity", "entity_slug", default=None, help="Evaluate a single entity by slug.")
+@click.option("--chapter", default=None, help="Evaluate entities from a specific chapter.")
+def evaluate(config_path, provider, model, entity_slug, chapter):
+    """Evaluate entities using LLM-based quality assessment."""
+    # NOTE: the docstring above is what click prints for --help, so it is
+    # kept short; implementation notes live in comments instead.
+    #
+    # Flow: load config -> parse entities -> apply the optional filter ->
+    # build the LLM adapter -> run the batch evaluation -> print a summary.
+    cfg, cfg_path = _load_config_or_exit(config_path)
+    root = cfg_path.parent
+
+    # Directories in the config are resolved relative to the config file.
+    entities_dir = root / cfg.entities_dir
+    if not entities_dir.is_dir():
+        click.echo("Error: No entities directory found.", err=True)
+        raise SystemExit(1)
+
+    entity_list = parse_entity_directory(entities_dir)
+    if not entity_list:
+        click.echo("No entities to evaluate.")
+        return
+
+    # Filter.  --entity takes precedence: --chapter is only consulted when
+    # no slug was given.
+    if entity_slug:
+        entity_list = [e for e in entity_list if e.slug == entity_slug]
+        if not entity_list:
+            # An explicit slug that matches nothing is a user error.
+            click.echo(f"Error: Entity '{entity_slug}' not found.", err=True)
+            raise SystemExit(1)
+    elif chapter:
+        # Substring match.  An entity without chapter metadata may carry
+        # None here; treat it as "no chapter" instead of letting the `in`
+        # test raise TypeError.
+        entity_list = [
+            e for e in entity_list if chapter in (e.source_chapter or "")
+        ]
+        if not entity_list:
+            # An empty chapter match is informational, not an error.
+            click.echo(f"No entities found for chapter '{chapter}'.")
+            return
+
+    # Create adapter.  Imported here rather than at module level so the
+    # LLM stack is only loaded when this command actually runs.
+    from markitect.llm import create_adapter
+    from markitect.prompts.execution.models import RunConfig
+    adapter = create_adapter(provider, model=model)
+    run_config = RunConfig(model_name=model or "default", temperature=0.3, max_tokens=2000)
+
+    # Progress callback: one line per finished entity.
+    def on_progress(done, total, result):
+        status = result.status.upper()
+        click.echo(f"  [{done}/{total}] {result.key}: {status}")
+
+    click.echo(f"Evaluating {len(entity_list)} entities via {provider}...")
+
+    from markitect.infospace.evaluate import run_entity_evaluation
+    output_dir = root / cfg.evaluations_dir
+    summary = run_entity_evaluation(
+        config=cfg,
+        entities=entity_list,
+        adapter=adapter,
+        run_config=run_config,
+        output_dir=output_dir,
+        progress_callback=on_progress,
+    )
+
+    click.echo(f"\nDone: {summary.succeeded} succeeded, {summary.failed} failed, {summary.skipped} skipped")
+    # Token usage is only reported when the summary recorded any.
+    if summary.total_tokens > 0:
+        click.echo(f"Tokens used: {summary.total_tokens}")
+
+
 # ── viability ────────────────────────────────────────────────────────


--- a/markitect/infospace/evaluate.py
+++ b/markitect/infospace/evaluate.py
@@ -0,0 +1,215 @@
+"""
+Per-entity evaluation pipeline.
+
+Builds prompts from entity metadata and delegates LLM evaluation to
+the :class:`BatchEvaluator`.  Writes structured results to the
+evaluations directory.
+"""
+
+from __future__ import annotations
+
+import hashlib
+from datetime import datetime
+from pathlib import Path
+from typing import Callable, Dict, List, Optional
+
+from markitect.infospace.config import InfospaceConfig
+from markitect.infospace.evaluation import EntityEvaluation, ScoreEntry
+from markitect.infospace.evaluation_io import write_entity_evaluation
+from markitect.infospace.models import EntityMeta
+from markitect.prompts.execution.batch import BatchEvaluator, BatchItem, BatchSummary
+from markitect.prompts.execution.llm_adapter import LLMAdapter
+from markitect.prompts.execution.models import RunConfig
+
+
+# Dimensions rated when the caller does not pass its own list.
+_DEFAULT_DIMENSIONS = [
+    "definition_precision",
+    "source_grounding",
+    "domain_relevance",
+    "discipline_alignment",
+    "conceptual_clarity",
+]
+
+# Prompt skeleton filled in by build_evaluation_prompt() via str.format().
+# Placeholders: topic, title, slug, domain, source_chapter, definition,
+# context, dimensions_list.
+# The "Output format" section (DIMENSION / SCORE / RATIONALE lines) is the
+# format parse_evaluation_response() reads back, so the two must stay in sync.
+_PROMPT_TEMPLATE = """\
+You are evaluating an entity from an infospace about "{topic}".
+
+## Entity: {title}
+
+**Slug:** {slug}
+**Domain:** {domain}
+**Source chapter:** {source_chapter}
+
+### Definition
+{definition}
+
+### Context
+{context}
+
+## Instructions
+
+Rate this entity on each dimension below using a scale of 1-5 \
+(1 = poor, 5 = excellent). For each dimension, provide:
+1. A numeric score (1-5)
+2. A brief rationale (1-2 sentences)
+
+### Dimensions to evaluate:
+{dimensions_list}
+
+## Output format
+
+Return your evaluation as a structured list:
+
+DIMENSION: <name>
+SCORE: <1-5>
+RATIONALE: <explanation>
+
+Repeat for each dimension.
+"""
+
+
+def build_evaluation_prompt(
+    entity: EntityMeta,
+    topic: str,
+    dimensions: Optional[List[str]] = None,
+) -> str:
+    """Render the evaluation prompt for one entity.
+
+    Args:
+        entity: Entity whose title, slug, domain, source chapter,
+            definition and context are inserted into the prompt.
+        topic: Name of the infospace topic shown in the prompt header.
+        dimensions: Dimension names to rate.  ``None`` or an empty list
+            selects the module's default dimensions.
+
+    Returns:
+        The prompt template with every placeholder filled in.  Falsy
+        optional fields are replaced by a parenthesised placeholder.
+    """
+    active_dims = dimensions if dimensions else _DEFAULT_DIMENSIONS
+    bullet_lines = [f"- {name}" for name in active_dims]
+    dimension_block = "\n".join(bullet_lines)
+
+    fields = {
+        "topic": topic,
+        "title": entity.title,
+        "slug": entity.slug,
+        "domain": entity.domain or "(unspecified)",
+        "source_chapter": entity.source_chapter or "(unspecified)",
+        "definition": entity.definition or "(no definition)",
+        "context": entity.context or "(no context)",
+        "dimensions_list": dimension_block,
+    }
+    return _PROMPT_TEMPLATE.format(**fields)
+
+
+def content_digest(entity: EntityMeta) -> str:
+    """Return a short fingerprint of the entity's evaluated content.
+
+    The fingerprint is the first 16 hex characters of the SHA-256 of
+    ``slug:definition:context:domain``.  It is used for incremental
+    evaluation, so the format must stay stable across runs.
+    """
+    parts = (entity.slug, entity.definition, entity.context, entity.domain)
+    payload = ":".join(f"{part}" for part in parts)
+    full_hex = hashlib.sha256(payload.encode()).hexdigest()
+    return full_hex[:16]
+
+
+def parse_evaluation_response(
+    response_text: str,
+    dimensions: Optional[List[str]] = None,
+) -> List[ScoreEntry]:
+    """Parse structured dimension scores from LLM response text.
+
+    Expects blocks of::
+
+        DIMENSION: <name>
+        SCORE: <1-5>
+        RATIONALE: <text>
+
+    Keywords are matched case-insensitively at the start of a line.  A
+    block is emitted only when it has both a dimension name and a
+    parseable score; blocks without a usable score are dropped.  Lines
+    that follow a block's score and carry no keyword are appended to its
+    rationale.
+
+    Besides plain numbers, the score may be written as ``4/5`` or
+    ``4 out of 5`` and may be wrapped in markdown emphasis (``**4**``).
+
+    Args:
+        response_text: Raw LLM response.  ``None`` or an empty string
+            yields an empty list.
+        dimensions: Accepted for interface compatibility; the parser
+            currently returns every well-formed block and does not
+            filter by this list.
+
+    Returns:
+        One :class:`ScoreEntry` (``max_value`` 5.0) per complete block,
+        in the order the blocks appear.
+    """
+    import re  # local import: only this parser needs it
+
+    def _parse_score(raw: str) -> Optional[float]:
+        """Return the numeric score in *raw*, or None if there is none."""
+        cleaned = raw.strip().strip("*_`").strip()
+        try:
+            # Plain numbers take the same path as before.
+            return float(cleaned)
+        except ValueError:
+            pass
+        # Fallback for "N/M" and "N out of M"; anything else (e.g. an
+        # echoed "<1-5>" placeholder) is still rejected.
+        fraction = re.match(
+            r"(\d+(?:\.\d+)?)\s*(?:/|out of)\s*\d", cleaned, re.IGNORECASE
+        )
+        return float(fraction.group(1)) if fraction else None
+
+    entries: List[ScoreEntry] = []
+    current_dim: Optional[str] = None
+    current_score: Optional[float] = None
+    current_rationale = ""
+
+    def _flush() -> None:
+        """Emit the block being collected if it is complete."""
+        if current_dim is not None and current_score is not None:
+            entries.append(ScoreEntry(
+                name=current_dim,
+                value=current_score,
+                max_value=5.0,
+                rationale=current_rationale.strip(),
+            ))
+
+    for line in (response_text or "").splitlines():
+        stripped = line.strip()
+        label = stripped.upper()
+        if label.startswith("DIMENSION:"):
+            # A new block starts: emit the previous one, then reset.
+            _flush()
+            current_dim = stripped.split(":", 1)[1].strip()
+            current_score = None
+            current_rationale = ""
+        elif label.startswith("SCORE:"):
+            current_score = _parse_score(stripped.split(":", 1)[1])
+        elif label.startswith("RATIONALE:"):
+            current_rationale = stripped.split(":", 1)[1].strip()
+        elif stripped and current_dim is not None and current_score is not None:
+            # Continuation of rationale
+            current_rationale += " " + stripped
+
+    # Flush last
+    _flush()
+    return entries
+
+
+def run_entity_evaluation(
+    config: InfospaceConfig,
+    entities: List[EntityMeta],
+    adapter: LLMAdapter,
+    run_config: Optional[RunConfig] = None,
+    output_dir: Optional[Path] = None,
+    previous_digests: Optional[Dict[str, str]] = None,
+    progress_callback: Optional[Callable] = None,
+    dimensions: Optional[List[str]] = None,
+) -> BatchSummary:
+    """Evaluate each entity with the batch evaluator and persist the results.
+
+    One batch item is built per entity (keyed by slug), the batch is run
+    through :class:`BatchEvaluator`, and every successful response is
+    parsed into scores and written to ``<slug>.md``.
+
+    Args:
+        config: The infospace configuration; supplies the topic name
+            and the fallback evaluations directory.
+        entities: Entities to evaluate.
+        adapter: LLM adapter handed to the batch evaluator.
+        run_config: LLM execution configuration.  Its ``model_name`` is
+            recorded as the evaluator; ``"unknown"`` is used when absent.
+        output_dir: Where to write evaluation results.  Defaults to
+            ``config.evaluations_dir`` relative to CWD.
+        previous_digests: ``{slug: digest}`` passed to the batch
+            evaluator for incremental skip.
+        progress_callback: Passed to the batch evaluator, which is
+            expected to call it after each item.
+        dimensions: Custom evaluation dimensions.
+
+    Returns:
+        The :class:`BatchSummary` produced by the batch evaluator.
+    """
+    topic = config.topic.name
+
+    items = []
+    for entity in entities:
+        items.append(
+            BatchItem(
+                key=entity.slug,
+                prompt=build_evaluation_prompt(entity, topic, dimensions),
+                content_digest=content_digest(entity),
+                metadata={"source_path": entity.source_path},
+            )
+        )
+
+    summary = BatchEvaluator(
+        adapter=adapter,
+        config=run_config,
+        progress_callback=progress_callback,
+        previous_digests=previous_digests,
+    ).evaluate(items)
+
+    # Resolve the destination and evaluator label once, before writing.
+    evaluations_path = output_dir or Path(config.evaluations_dir)
+    evaluator_name = run_config.model_name if run_config else "unknown"
+
+    # Only successful results that carry a response are persisted.
+    for result in summary.results:
+        if result.status == "success" and result.response is not None:
+            parsed_scores = parse_evaluation_response(
+                result.response.content, dimensions
+            )
+            record = EntityEvaluation(
+                entity_slug=result.key,
+                evaluator=evaluator_name,
+                scores=parsed_scores,
+                evaluated_at=datetime.utcnow(),
+            )
+            target = evaluations_path / f"{result.key}.md"
+            write_entity_evaluation(record, target)
+
+    return summary