feat(infospace): add per-entity evaluation pipeline and CLI command (S2.3)

Evaluation pipeline builds prompts from entity metadata, delegates
to BatchEvaluator, parses structured LLM responses into ScoreEntry
objects, and writes evaluation files. CLI: 'markitect infospace evaluate'
with --provider/--model options and --entity/--chapter filters.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-19 01:48:34 +01:00
parent 3726503adb
commit 3461d2f354
3 changed files with 504 additions and 0 deletions

View File

@@ -153,6 +153,71 @@ def entities(config_path: Optional[str], sort_key: str):
click.echo(f"\nTotal: {len(entity_list)} entities")
# ── evaluate ─────────────────────────────────────────────────────────
@infospace_commands.command()
@click.option("--config", "config_path", default=None, help="Path to infospace.yaml.")
@click.option("--provider", default="openrouter", help="LLM provider (openrouter, openai, etc.).")
@click.option("--model", default=None, help="LLM model name.")
@click.option("--entity", "entity_slug", default=None, help="Evaluate a single entity by slug.")
@click.option("--chapter", default=None, help="Evaluate entities from a specific chapter.")
def evaluate(config_path, provider, model, entity_slug, chapter):
    """Evaluate entities using LLM-based quality assessment."""
    # NOTE: the docstring above doubles as the command's --help text, so
    # implementation notes are kept in comments.
    #
    # Flow: load config -> parse entities -> optionally narrow the list ->
    # run the evaluation pipeline -> echo a summary.
    cfg, cfg_path = _load_config_or_exit(config_path)
    # Directories named in the config are resolved against the config file's
    # own folder.
    root = cfg_path.parent
    entities_dir = root / cfg.entities_dir
    if not entities_dir.is_dir():
        click.echo("Error: No entities directory found.", err=True)
        raise SystemExit(1)

    entity_list = parse_entity_directory(entities_dir)
    if not entity_list:
        click.echo("No entities to evaluate.")
        return

    # Filter. --entity takes precedence: when both --entity and --chapter are
    # given, only the slug filter is applied.
    if entity_slug:
        entity_list = [e for e in entity_list if e.slug == entity_slug]
        if not entity_list:
            # An explicitly requested slug that does not exist is an error.
            click.echo(f"Error: Entity '{entity_slug}' not found.", err=True)
            raise SystemExit(1)
    elif chapter:
        # Substring match. The prompt builder treats source_chapter as
        # possibly empty, so guard against a missing (None) value here too
        # instead of letting the ``in`` test raise TypeError.
        entity_list = [
            e for e in entity_list if chapter in (e.source_chapter or "")
        ]
        if not entity_list:
            # An empty chapter match is reported but is not an error.
            click.echo(f"No entities found for chapter '{chapter}'.")
            return

    # Create adapter (imported lazily, only when an evaluation actually runs).
    from markitect.llm import create_adapter
    from markitect.prompts.execution.models import RunConfig

    adapter = create_adapter(provider, model=model)
    run_config = RunConfig(model_name=model or "default", temperature=0.3, max_tokens=2000)

    # Progress callback: one line per processed entity.
    def on_progress(done, total, result):
        status = result.status.upper()
        click.echo(f"  [{done}/{total}] {result.key}: {status}")

    click.echo(f"Evaluating {len(entity_list)} entities via {provider}...")

    from markitect.infospace.evaluate import run_entity_evaluation

    output_dir = root / cfg.evaluations_dir
    summary = run_entity_evaluation(
        config=cfg,
        entities=entity_list,
        adapter=adapter,
        run_config=run_config,
        output_dir=output_dir,
        progress_callback=on_progress,
    )

    click.echo(f"\nDone: {summary.succeeded} succeeded, {summary.failed} failed, {summary.skipped} skipped")
    if summary.total_tokens > 0:
        click.echo(f"Tokens used: {summary.total_tokens}")
# ── viability ────────────────────────────────────────────────────────

View File

@@ -0,0 +1,215 @@
"""
Per-entity evaluation pipeline.
Builds prompts from entity metadata and delegates LLM evaluation to
the :class:`BatchEvaluator`. Writes structured results to the
evaluations directory.
"""
from __future__ import annotations

import hashlib
import math
from datetime import datetime
from datetime import timezone
from pathlib import Path
from typing import Callable, Dict, List, Optional

from markitect.infospace.config import InfospaceConfig
from markitect.infospace.evaluation import EntityEvaluation, ScoreEntry
from markitect.infospace.evaluation_io import write_entity_evaluation
from markitect.infospace.models import EntityMeta
from markitect.prompts.execution.batch import BatchEvaluator, BatchItem, BatchSummary
from markitect.prompts.execution.llm_adapter import LLMAdapter
from markitect.prompts.execution.models import RunConfig
# Dimension names rated when the caller does not pass its own ``dimensions``
# list (see build_evaluation_prompt, which falls back to this list).
_DEFAULT_DIMENSIONS = [
"definition_precision",
"source_grounding",
"domain_relevance",
"discipline_alignment",
"conceptual_clarity",
]
# Prompt skeleton filled in with ``str.format`` by build_evaluation_prompt.
# Placeholders: topic, title, slug, domain, source_chapter, definition,
# context, dimensions_list. The "Output format" section asks the model for
# DIMENSION / SCORE / RATIONALE lines, which is the layout that
# parse_evaluation_response reads back.
_PROMPT_TEMPLATE = """\
You are evaluating an entity from an infospace about "{topic}".
## Entity: {title}
**Slug:** {slug}
**Domain:** {domain}
**Source chapter:** {source_chapter}
### Definition
{definition}
### Context
{context}
## Instructions
Rate this entity on each dimension below using a scale of 1-5 \
(1 = poor, 5 = excellent). For each dimension, provide:
1. A numeric score (1-5)
2. A brief rationale (1-2 sentences)
### Dimensions to evaluate:
{dimensions_list}
## Output format
Return your evaluation as a structured list:
DIMENSION: <name>
SCORE: <1-5>
RATIONALE: <explanation>
Repeat for each dimension.
"""
def build_evaluation_prompt(
    entity: EntityMeta,
    topic: str,
    dimensions: Optional[List[str]] = None,
) -> str:
    """Render the evaluation prompt for one entity.

    Args:
        entity: Entity whose title, slug, domain, source chapter,
            definition and context are interpolated into the prompt.
        topic: Name of the infospace topic shown in the opening line.
        dimensions: Dimension names to list; when omitted or empty the
            module's default dimensions are used.

    Returns:
        The prompt template with every placeholder filled in. Empty
        entity fields are replaced by a parenthesised placeholder such
        as ``(no definition)``.
    """
    active_dimensions = dimensions if dimensions else _DEFAULT_DIMENSIONS
    bullet_lines = [f"- {name}" for name in active_dimensions]

    fields = {
        "topic": topic,
        "title": entity.title,
        "slug": entity.slug,
        "domain": entity.domain or "(unspecified)",
        "source_chapter": entity.source_chapter or "(unspecified)",
        "definition": entity.definition or "(no definition)",
        "context": entity.context or "(no context)",
        "dimensions_list": "\n".join(bullet_lines),
    }
    return _PROMPT_TEMPLATE.format(**fields)
def content_digest(entity: EntityMeta) -> str:
    """Compute a content digest for incremental evaluation.

    The digest covers every entity field that ``build_evaluation_prompt``
    interpolates into the prompt (slug, title, domain, source chapter,
    definition, context), so any edit that would change the prompt also
    changes the digest.

    Args:
        entity: The entity to fingerprint.

    Returns:
        The first 16 hexadecimal characters of a SHA-256 hash.

    Note:
        Changing the hashed fields or their framing invalidates digests
        stored from an earlier scheme; affected entities are simply
        re-evaluated once.
    """
    hasher = hashlib.sha256()
    for value in (
        entity.slug,
        entity.title,
        entity.domain,
        entity.source_chapter,
        entity.definition,
        entity.context,
    ):
        # A missing value hashes like an empty string, mirroring how the
        # prompt builder treats both as "absent".
        encoded = ("" if value is None else str(value)).encode("utf-8")
        # Length-prefix each field so text cannot slide across a field
        # boundary and produce the same byte stream ("a:b" + "c" versus
        # "a" + "b:c").
        hasher.update(f"{len(encoded)}:".encode("ascii"))
        hasher.update(encoded)
    return hasher.hexdigest()[:16]
def _parse_score(raw: str) -> Optional[float]:
    """Return the numeric score contained in *raw*, or ``None`` if unusable."""
    # Models often answer "4/5"; only the numerator is the score.
    candidate = raw.split("/", 1)[0].strip()
    try:
        value = float(candidate)
    except ValueError:
        return None
    # float() also parses "nan" and "inf", which are not meaningful scores.
    return value if math.isfinite(value) else None


def _append_score(
    scores: List[ScoreEntry],
    name: Optional[str],
    score: Optional[float],
    rationale: str,
) -> None:
    """Append a ``ScoreEntry`` when the block has both a name and a valid score."""
    if name is None or score is None:
        return
    scores.append(ScoreEntry(
        name=name,
        value=score,
        max_value=5.0,
        rationale=rationale.strip(),
    ))


def parse_evaluation_response(
    response_text: str,
    dimensions: Optional[List[str]] = None,
) -> List[ScoreEntry]:
    """Parse structured dimension scores from LLM response text.

    Expects blocks of::

        DIMENSION: <name>
        SCORE: <1-5>
        RATIONALE: <text>

    Keywords are matched case-insensitively at the start of a line.
    Non-empty lines that follow a block with a valid score and carry no
    keyword are appended to that block's rationale.

    Args:
        response_text: Raw text returned by the model.
        dimensions: Accepted for symmetry with ``build_evaluation_prompt``;
            parsed names are not filtered or validated against it.

    Returns:
        One ``ScoreEntry`` per block that has both a dimension name and a
        usable score, in order of appearance. Blocks whose score is
        missing, non-numeric or non-finite are dropped. A score written
        as ``N/5`` is read as ``N``.
    """
    scores: List[ScoreEntry] = []
    current_dim: Optional[str] = None
    current_score: Optional[float] = None
    current_rationale = ""

    for line in response_text.splitlines():
        stripped = line.strip()
        upper = stripped.upper()
        if upper.startswith("DIMENSION:"):
            # A new header closes the previous block.
            _append_score(scores, current_dim, current_score, current_rationale)
            current_dim = stripped.split(":", 1)[1].strip()
            current_score = None
            current_rationale = ""
        elif upper.startswith("SCORE:"):
            current_score = _parse_score(stripped.split(":", 1)[1])
        elif upper.startswith("RATIONALE:"):
            current_rationale = stripped.split(":", 1)[1].strip()
        elif current_dim is not None and current_score is not None and stripped:
            # Continuation of rationale
            current_rationale += " " + stripped

    # Flush the final block, which has no following header to close it.
    _append_score(scores, current_dim, current_score, current_rationale)
    return scores
def run_entity_evaluation(
    config: InfospaceConfig,
    entities: List[EntityMeta],
    adapter: LLMAdapter,
    run_config: Optional[RunConfig] = None,
    output_dir: Optional[Path] = None,
    previous_digests: Optional[Dict[str, str]] = None,
    progress_callback: Optional[Callable] = None,
    dimensions: Optional[List[str]] = None,
) -> BatchSummary:
    """Run per-entity evaluation using the batch evaluator.

    Builds one prompt per entity, hands the batch to
    :class:`BatchEvaluator`, then parses every successful response and
    writes it to ``<output_dir>/<slug>.md``.

    Args:
        config: The infospace configuration.
        entities: Entities to evaluate.
        adapter: LLM adapter for evaluation.
        run_config: LLM execution configuration. Its ``model_name`` is
            recorded as the evaluator; ``"unknown"`` is used when omitted.
        output_dir: Where to write evaluation results. Defaults to
            ``config.evaluations_dir`` relative to CWD.
        previous_digests: ``{slug: digest}`` for incremental skip;
            forwarded to the batch evaluator.
        progress_callback: Called after each item; forwarded to the
            batch evaluator.
        dimensions: Custom evaluation dimensions.

    Returns:
        A :class:`BatchSummary` with per-entity results.
    """
    topic = config.topic.name
    items = [
        BatchItem(
            key=entity.slug,
            prompt=build_evaluation_prompt(entity, topic, dimensions),
            content_digest=content_digest(entity),
            metadata={"source_path": entity.source_path},
        )
        for entity in entities
    ]

    evaluator = BatchEvaluator(
        adapter=adapter,
        config=run_config,
        progress_callback=progress_callback,
        previous_digests=previous_digests,
    )
    summary = evaluator.evaluate(items)

    # Write successful results
    evaluations_path = output_dir or Path(config.evaluations_dir)
    evaluator_name = run_config.model_name if run_config else "unknown"

    for result in summary.results:
        # Results that did not succeed, or that carry no response, have
        # nothing to parse.
        if result.status != "success" or result.response is None:
            continue

        scores = parse_evaluation_response(result.response.content, dimensions)
        evaluation = EntityEvaluation(
            entity_slug=result.key,
            evaluator=evaluator_name,
            scores=scores,
            # datetime.utcnow() is deprecated since Python 3.12. Take the
            # aware UTC time and drop tzinfo so the stored value stays a
            # naive UTC timestamp, exactly as before.
            evaluated_at=datetime.now(timezone.utc).replace(tzinfo=None),
        )
        eval_path = evaluations_path / f"{result.key}.md"
        write_entity_evaluation(evaluation, eval_path)

    return summary

View File

@@ -0,0 +1,224 @@
"""Tests for markitect.infospace.evaluate."""
from datetime import datetime
from pathlib import Path
import pytest
from markitect.infospace.config import InfospaceConfig, TopicConfig
from markitect.infospace.evaluate import (
build_evaluation_prompt,
content_digest,
parse_evaluation_response,
run_entity_evaluation,
)
from markitect.infospace.evaluation import ScoreEntry
from markitect.infospace.models import EntityMeta
from markitect.prompts.execution.llm_adapter import MockLLMAdapter
from markitect.prompts.execution.models import RunConfig
# ── Helpers ──────────────────────────────────────────────────────────
def _entity(**overrides) -> EntityMeta:
    """Build an ``EntityMeta`` fixture; keyword arguments override the defaults."""
    fields = {
        "slug": "division-of-labour",
        "title": "Division Of Labour",
        "h1_raw": "Division Of Labour",
        "definition": "Splitting work into specialised tasks.",
        "source_chapter": "Book I Chapter 1",
        "context": "Smith introduces the concept early.",
        "domain": "Production",
        "source_path": "entities/division-of-labour.md",
    }
    # Later keys win, so caller-supplied overrides replace the defaults.
    return EntityMeta(**{**fields, **overrides})
def _config() -> InfospaceConfig:
    """Return a minimal config whose topic is "The Wealth of Nations"."""
    topic = TopicConfig(name="The Wealth of Nations")
    return InfospaceConfig(topic=topic)
# Canned LLM reply used by the tests below: three blocks in the
# DIMENSION / SCORE / RATIONALE layout that parse_evaluation_response reads.
_MOCK_RESPONSE = """\
DIMENSION: definition_precision
SCORE: 4.5
RATIONALE: Clear and specific definition of the concept.
DIMENSION: source_grounding
SCORE: 4.0
RATIONALE: Well grounded in Smith's text.
DIMENSION: domain_relevance
SCORE: 5.0
RATIONALE: Directly relevant to production economics.
"""
# ── build_evaluation_prompt ──────────────────────────────────────────
class TestBuildPrompt:
    """Prompt rendering: entity fields, topic, dimensions and placeholders."""

    def test_contains_entity_fields(self):
        """Slug, title, domain and definition all appear in the prompt."""
        rendered = build_evaluation_prompt(_entity(), "Test Topic")
        for expected in (
            "division-of-labour",
            "Division Of Labour",
            "Production",
            "Splitting work",
        ):
            assert expected in rendered

    def test_contains_topic(self):
        """The topic name is interpolated into the prompt."""
        rendered = build_evaluation_prompt(_entity(), "WoN")
        assert "WoN" in rendered

    def test_contains_dimensions(self):
        """Default dimensions are listed when none are supplied."""
        rendered = build_evaluation_prompt(_entity(), "T")
        for dimension in ("definition_precision", "source_grounding"):
            assert dimension in rendered

    def test_custom_dimensions(self):
        """Custom dimensions replace the defaults rather than extend them."""
        custom = ["novelty", "coherence"]
        rendered = build_evaluation_prompt(_entity(), "T", dimensions=custom)
        assert "novelty" in rendered
        assert "coherence" in rendered
        assert "definition_precision" not in rendered

    def test_handles_missing_fields(self):
        """Empty entity fields are rendered as parenthesised placeholders."""
        sparse = _entity(definition="", context="", domain="")
        rendered = build_evaluation_prompt(sparse, "T")
        for placeholder in ("(no definition)", "(no context)", "(unspecified)"):
            assert placeholder in rendered
# ── content_digest ───────────────────────────────────────────────────
class TestContentDigest:
    """Digest stability and sensitivity to entity content."""

    def test_deterministic(self):
        """Hashing the same entity twice yields the same digest."""
        entity = _entity()
        first = content_digest(entity)
        second = content_digest(entity)
        assert first == second

    def test_changes_with_content(self):
        """A different definition produces a different digest."""
        digest_a = content_digest(_entity(definition="A"))
        digest_b = content_digest(_entity(definition="B"))
        assert digest_a != digest_b
# ── parse_evaluation_response ────────────────────────────────────────
class TestParseResponse:
    """Parsing of DIMENSION / SCORE / RATIONALE blocks."""

    def test_parses_three_dimensions(self):
        """The canned reply yields exactly three entries."""
        assert len(parse_evaluation_response(_MOCK_RESPONSE)) == 3

    def test_correct_names(self):
        """Every dimension named in the reply is present in the result."""
        parsed_names = {
            entry.name for entry in parse_evaluation_response(_MOCK_RESPONSE)
        }
        expected = {"definition_precision", "source_grounding", "domain_relevance"}
        assert expected <= parsed_names

    def test_correct_scores(self):
        """Numeric scores are read as floats."""
        values = {
            entry.name: entry.value
            for entry in parse_evaluation_response(_MOCK_RESPONSE)
        }
        assert values["definition_precision"] == 4.5
        assert values["source_grounding"] == 4.0
        assert values["domain_relevance"] == 5.0

    def test_correct_rationales(self):
        """The rationale text is attached to its dimension."""
        rationales = {
            entry.name: entry.rationale
            for entry in parse_evaluation_response(_MOCK_RESPONSE)
        }
        assert "Clear" in rationales["definition_precision"]

    def test_empty_response(self):
        """An empty reply produces no entries."""
        assert parse_evaluation_response("") == []

    def test_malformed_score_skipped(self):
        """A block whose score is not numeric is dropped."""
        broken = "DIMENSION: x\nSCORE: not-a-number\nRATIONALE: oops"
        parsed = parse_evaluation_response(broken)
        assert len(parsed) == 0
# ── run_entity_evaluation ────────────────────────────────────────────
class TestRunEntityEvaluation:
    """End-to-end runs of the pipeline against a mock adapter."""

    def test_evaluates_entities(self, tmp_path):
        """Each entity triggers one adapter call and counts as succeeded."""
        mock_llm = MockLLMAdapter(_MOCK_RESPONSE)
        batch = [_entity(), _entity(slug="pin-factory", title="Pin Factory")]
        summary = run_entity_evaluation(
            config=_config(),
            entities=batch,
            adapter=mock_llm,
            output_dir=tmp_path / "evals",
        )
        assert summary.total == 2
        assert summary.succeeded == 2
        assert mock_llm.call_count == 2

    def test_writes_evaluation_files(self, tmp_path):
        """A successful evaluation is written to ``<slug>.md``."""
        target_dir = tmp_path / "evals"
        run_entity_evaluation(
            config=_config(),
            entities=[_entity()],
            adapter=MockLLMAdapter(_MOCK_RESPONSE),
            output_dir=target_dir,
        )
        written = target_dir / "division-of-labour.md"
        assert written.exists()
        assert "definition_precision" in written.read_text()

    def test_incremental_skip(self, tmp_path):
        """An entity whose digest is unchanged is skipped without an LLM call."""
        mock_llm = MockLLMAdapter(_MOCK_RESPONSE)
        unchanged = _entity()
        known = {unchanged.slug: content_digest(unchanged)}
        summary = run_entity_evaluation(
            config=_config(),
            entities=[unchanged],
            adapter=mock_llm,
            output_dir=tmp_path,
            previous_digests=known,
        )
        assert summary.skipped == 1
        assert mock_llm.call_count == 0

    def test_progress_callback_called(self, tmp_path):
        """The callback receives (done, total, result) once per entity."""
        seen = []

        def record(done, total, result):
            seen.append((done, total, result.key))

        run_entity_evaluation(
            config=_config(),
            entities=[_entity()],
            adapter=MockLLMAdapter(_MOCK_RESPONSE),
            output_dir=tmp_path,
            progress_callback=record,
        )
        assert seen == [(1, 1, "division-of-labour")]

    def test_passes_run_config(self, tmp_path):
        """The supplied ``RunConfig`` reaches the adapter."""
        mock_llm = MockLLMAdapter(_MOCK_RESPONSE)
        settings = RunConfig(temperature=0.1, max_tokens=500)
        run_entity_evaluation(
            config=_config(),
            entities=[_entity()],
            adapter=mock_llm,
            run_config=settings,
            output_dir=tmp_path,
        )
        assert mock_llm.last_config.temperature == 0.1