- Fix evaluate dimensions to match template file: definition_precision, source_grounding, domain_placement, vsm_relevance, explanatory_value (was domain_relevance, discipline_alignment, conceptual_clarity)
- Add VSM background context to evaluation prompt so LLM can score vsm_relevance without macro injection
- Fix model_name bug: was sending literal "default" to API (HTTP 400)
- Refactor run_entity_evaluation to write files incrementally via callback rather than all at once after the batch — long runs are now resumable if interrupted
- Add incremental skip in CLI: entities with existing eval files are skipped automatically on re-run (acts as resume)
- Add eval-summary command: reads all eval files, shows per-dimension means, optionally writes per_entity_mean to metrics.yaml
- Fix record_check_results to merge rather than overwrite metrics.yaml so per_entity_mean survives subsequent check runs
- Add per_entity_mean viability threshold (min: 3.5) to infospace.yaml

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
233 lines
7.6 KiB
Python
233 lines
7.6 KiB
Python
"""
|
||
Per-entity evaluation pipeline.
|
||
|
||
Builds prompts from entity metadata and delegates LLM evaluation to
|
||
the :class:`BatchEvaluator`. Writes structured results to the
|
||
evaluations directory.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import hashlib
|
||
from datetime import datetime
|
||
from pathlib import Path
|
||
from typing import Callable, Dict, List, Optional
|
||
|
||
from markitect.infospace.config import InfospaceConfig
|
||
from markitect.infospace.evaluation import EntityEvaluation, ScoreEntry
|
||
from markitect.infospace.evaluation_io import write_entity_evaluation
|
||
from markitect.infospace.models import EntityMeta
|
||
from markitect.prompts.execution.batch import BatchEvaluator, BatchItem, BatchSummary
|
||
from markitect.prompts.execution.llm_adapter import LLMAdapter
|
||
from markitect.prompts.execution.models import RunConfig
|
||
|
||
|
||
# Dimensions scored when the caller does not supply its own list.
_DEFAULT_DIMENSIONS = [
    "definition_precision",
    "source_grounding",
    "domain_placement",
    "vsm_relevance",
    "explanatory_value",
]

# Guidance shown to the LLM next to each built-in dimension name.
# Dimensions that are not listed here are rendered as a bare bullet.
_DIMENSION_GUIDANCE = {
    "definition_precision": (
        "Is the definition precise and non-circular? "
        "Does it capture a distinct concept rather than a vague umbrella term?"
    ),
    "source_grounding": (
        "Is this entity grounded in the actual source text, "
        "or does it introduce concepts the source does not clearly state?"
    ),
    "domain_placement": (
        "Is the economic/thematic domain assignment correct? "
        "Does the entity belong in a different conceptual category?"
    ),
    "vsm_relevance": (
        "Does this entity map naturally to one or more VSM "
        "systems (S1–S5), or is it VSM-neutral/too abstract to place?"
    ),
    "explanatory_value": (
        "Does this entity add genuine explanatory power — "
        "illuminating a mechanism or structural relation — or does it merely name "
        "a surface phenomenon?"
    ),
}

# The ``{dimensions_section}`` placeholder is filled per call so that the
# dimensions listed in the prompt always match the ones that were requested.
_PROMPT_TEMPLATE = """\
You are evaluating an entity from an infospace about "{topic}".

## Entity: {title}

**Slug:** {slug}
**Domain:** {domain}
**Source chapter:** {source_chapter}

### Definition
{definition}

### Context
{context}

## Background

This infospace maps concepts from the source corpus to Stafford Beer's
Viable System Model (VSM). The VSM has five systems: S1 (primary operations),
S2 (coordination/anti-oscillation), S3 (internal regulation/audit),
S4 (intelligence/environmental adaptation), S5 (identity/policy). Use this
to assess whether the entity has a natural VSM home.

## Dimensions

{dimensions_section}

## Instructions

Rate each dimension 1–5 (1 = poor, 5 = excellent). Provide a brief
rationale (1–2 sentences) for each score.

## Output format

DIMENSION: <name>
SCORE: <1-5>
RATIONALE: <explanation>

Repeat for each dimension.
"""


def _render_dimensions_section(dims: List[str]) -> str:
    """Render one markdown bullet per dimension, with guidance when known."""
    bullets = []
    for dim in dims:
        guidance = _DIMENSION_GUIDANCE.get(dim)
        bullets.append(f"- **{dim}**: {guidance}" if guidance else f"- **{dim}**")
    return "\n".join(bullets)


def build_evaluation_prompt(
    entity: EntityMeta,
    topic: str,
    dimensions: Optional[List[str]] = None,
) -> str:
    """Build an evaluation prompt for a single entity.

    Args:
        entity: Entity whose title, slug, domain, source chapter, definition
            and context are interpolated into the prompt. Missing (falsy)
            fields are replaced by a parenthesised placeholder.
        topic: Name of the infospace topic, quoted in the opening sentence.
        dimensions: Names of the dimensions the LLM should score. ``None``
            or an empty list selects ``_DEFAULT_DIMENSIONS``. Names without
            built-in guidance are listed as a bare bullet.

    Returns:
        The fully rendered prompt text.
    """
    dims = dimensions or _DEFAULT_DIMENSIONS
    return _PROMPT_TEMPLATE.format(
        topic=topic,
        title=entity.title,
        slug=entity.slug,
        domain=entity.domain or "(unspecified)",
        source_chapter=entity.source_chapter or "(unspecified)",
        definition=entity.definition or "(no definition)",
        context=entity.context or "(no context)",
        # Previously a ``dimensions_list`` keyword was passed here, but the
        # template had no matching placeholder, so custom dimensions were
        # silently dropped from the prompt.
        dimensions_section=_render_dimensions_section(dims),
    )
|
||
|
||
|
||
def content_digest(entity: EntityMeta) -> str:
    """Return a short fingerprint of an entity for incremental evaluation.

    The slug, definition, context and domain are joined with ``:`` and
    hashed with SHA-256; only the first 16 hex characters are kept.
    """
    fingerprint_source = "{}:{}:{}:{}".format(
        entity.slug,
        entity.definition,
        entity.context,
        entity.domain,
    )
    sha = hashlib.sha256(fingerprint_source.encode())
    return sha.hexdigest()[:16]
|
||
|
||
|
||
def _parse_score_value(raw: str) -> Optional[float]:
    """Read the numeric score from the text that follows ``SCORE:``.

    Accepts a bare number as well as decorated forms such as ``4/5``,
    ``**4**`` or ``4 (good)``. Returns ``None`` when no leading number
    can be read.
    """
    cleaned = raw.strip().strip("*").strip()
    # Keep only what precedes a "/" (as in "4/5"), then take the first token.
    tokens = cleaned.split("/", 1)[0].split()
    if not tokens:
        return None
    try:
        return float(tokens[0].rstrip(".,;:"))
    except ValueError:
        return None


def parse_evaluation_response(
    response_text: str,
    dimensions: Optional[List[str]] = None,
) -> List[ScoreEntry]:
    """Parse structured dimension scores from LLM response text.

    Expects blocks of::

        DIMENSION: <name>
        SCORE: <1-5>
        RATIONALE: <text>

    Labels are matched case-insensitively at the start of a stripped line.
    Non-empty lines that follow a parsed score and carry no label are
    appended to the rationale. A block without a readable score is dropped.

    Args:
        response_text: Raw text returned by the LLM.
        dimensions: Accepted for interface compatibility. Dimension names
            are returned exactly as they appear in the response and are
            not filtered against this list.

    Returns:
        One ``ScoreEntry`` (``max_value`` 5.0) per complete block, in the
        order the blocks appear.
    """
    scores: List[ScoreEntry] = []
    current_dim: Optional[str] = None
    current_score: Optional[float] = None
    current_rationale = ""

    def flush() -> None:
        # Only complete blocks (name + numeric score) are recorded.
        if current_dim is not None and current_score is not None:
            scores.append(ScoreEntry(
                name=current_dim,
                value=current_score,
                max_value=5.0,
                rationale=current_rationale.strip(),
            ))

    for line in response_text.splitlines():
        stripped = line.strip()
        label = stripped.upper()
        value = stripped.partition(":")[2].strip()

        if label.startswith("DIMENSION:"):
            # A new block begins: emit the previous one and reset state.
            flush()
            current_dim = value
            current_score = None
            current_rationale = ""
        elif label.startswith("SCORE:"):
            current_score = _parse_score_value(value)
        elif label.startswith("RATIONALE:"):
            current_rationale = value
        elif current_dim is not None and current_score is not None and stripped:
            # Continuation of a multi-line rationale.
            current_rationale += " " + stripped

    flush()
    return scores
|
||
|
||
|
||
def run_entity_evaluation(
    config: InfospaceConfig,
    entities: List[EntityMeta],
    adapter: LLMAdapter,
    run_config: Optional[RunConfig] = None,
    output_dir: Optional[Path] = None,
    previous_digests: Optional[Dict[str, str]] = None,
    progress_callback: Optional[Callable] = None,
    dimensions: Optional[List[str]] = None,
) -> BatchSummary:
    """Run per-entity evaluation using the batch evaluator.

    Evaluation files are written **incrementally** after each successful
    result, so a long run is resumable and safe to interrupt.

    Args:
        config: The infospace configuration.
        entities: Entities to evaluate.
        adapter: LLM adapter for evaluation.
        run_config: LLM execution configuration. Its ``model_name`` is
            recorded as the evaluator; ``"unknown"`` is used when absent.
        output_dir: Where to write evaluation results. Defaults to
            ``config.evaluations_dir`` relative to CWD.
        previous_digests: ``{slug: digest}`` for incremental skip
            (forwarded to the batch evaluator).
        progress_callback: Called after each item as
            ``progress_callback(done, total, result)``, after any file
            for that item has been written.
        dimensions: Custom evaluation dimensions.

    Returns:
        A :class:`BatchSummary` with per-entity results.
    """
    # Local import: only ``datetime`` is imported at module level.
    from datetime import timezone

    topic = config.topic.name
    evaluations_path = Path(output_dir or config.evaluations_dir)
    evaluator_name = run_config.model_name if run_config else "unknown"

    def _write_and_notify(done: int, total: int, result) -> None:
        # Write file immediately on success (incremental — run is resumable)
        if result.status == "success" and result.response is not None:
            scores = parse_evaluation_response(result.response.content, dimensions)
            evaluation = EntityEvaluation(
                entity_slug=result.key,
                evaluator=evaluator_name,
                scores=scores,
                # Naive UTC timestamp, i.e. the value ``datetime.utcnow()``
                # produced, obtained without the deprecated call.
                evaluated_at=datetime.now(timezone.utc).replace(tzinfo=None),
            )
            # Make sure the target directory exists before the first write.
            evaluations_path.mkdir(parents=True, exist_ok=True)
            eval_path = evaluations_path / f"{result.key}.md"
            write_entity_evaluation(evaluation, eval_path)

        if progress_callback is not None:
            progress_callback(done, total, result)

    items = [
        BatchItem(
            key=entity.slug,
            prompt=build_evaluation_prompt(entity, topic, dimensions),
            content_digest=content_digest(entity),
            metadata={"source_path": entity.source_path},
        )
        for entity in entities
    ]

    evaluator = BatchEvaluator(
        adapter=adapter,
        config=run_config,
        progress_callback=_write_and_notify,
        previous_digests=previous_digests,
    )
    return evaluator.evaluate(items)
|