feat(infospace): add eval-summary command and improve evaluate pipeline (S3.3)
- Fix evaluate dimensions to match template file: definition_precision, source_grounding, domain_placement, vsm_relevance, explanatory_value (was domain_relevance, discipline_alignment, conceptual_clarity)
- Add VSM background context to evaluation prompt so LLM can score vsm_relevance without macro injection
- Fix model_name bug: was sending literal "default" to API (HTTP 400)
- Refactor run_entity_evaluation to write files incrementally via callback rather than all at once after the batch — long runs are now resumable if interrupted
- Add incremental skip in CLI: entities with existing eval files are skipped automatically on re-run (acts as resume)
- Add eval-summary command: reads all eval files, shows per-dimension means, optionally writes per_entity_mean to metrics.yaml
- Fix record_check_results to merge rather than overwrite metrics.yaml so per_entity_mean survives subsequent check runs
- Add per_entity_mean viability threshold (min: 3.5) to infospace.yaml

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -25,9 +25,9 @@ from markitect.prompts.execution.models import RunConfig
|
||||
_DEFAULT_DIMENSIONS = [
|
||||
"definition_precision",
|
||||
"source_grounding",
|
||||
"domain_relevance",
|
||||
"discipline_alignment",
|
||||
"conceptual_clarity",
|
||||
"domain_placement",
|
||||
"vsm_relevance",
|
||||
"explanatory_value",
|
||||
]
|
||||
|
||||
_PROMPT_TEMPLATE = """\
|
||||
@@ -45,20 +45,35 @@ You are evaluating an entity from an infospace about "{topic}".
|
||||
### Context
|
||||
{context}
|
||||
|
||||
## Background
|
||||
|
||||
This infospace maps concepts from the source corpus to Stafford Beer's
|
||||
Viable System Model (VSM). The VSM has five systems: S1 (primary operations),
|
||||
S2 (coordination/anti-oscillation), S3 (internal regulation/audit),
|
||||
S4 (intelligence/environmental adaptation), S5 (identity/policy). Use this
|
||||
to assess whether the entity has a natural VSM home.
|
||||
|
||||
## Dimensions
|
||||
|
||||
- **definition_precision**: Is the definition precise and non-circular? \
|
||||
Does it capture a distinct concept rather than a vague umbrella term?
|
||||
- **source_grounding**: Is this entity grounded in the actual source text, \
|
||||
or does it introduce concepts the source does not clearly state?
|
||||
- **domain_placement**: Is the economic/thematic domain assignment correct? \
|
||||
Does the entity belong in a different conceptual category?
|
||||
- **vsm_relevance**: Does this entity map naturally to one or more VSM \
|
||||
systems (S1–S5), or is it VSM-neutral/too abstract to place?
|
||||
- **explanatory_value**: Does this entity add genuine explanatory power — \
|
||||
illuminating a mechanism or structural relation — or does it merely name \
|
||||
a surface phenomenon?
|
||||
|
||||
## Instructions
|
||||
|
||||
Rate this entity on each dimension below using a scale of 1-5 \
|
||||
(1 = poor, 5 = excellent). For each dimension, provide:
|
||||
1. A numeric score (1-5)
|
||||
2. A brief rationale (1-2 sentences)
|
||||
|
||||
### Dimensions to evaluate:
|
||||
{dimensions_list}
|
||||
Rate each dimension 1–5 (1 = poor, 5 = excellent). Provide a brief
|
||||
rationale (1–2 sentences) for each score.
|
||||
|
||||
## Output format
|
||||
|
||||
Return your evaluation as a structured list:
|
||||
|
||||
DIMENSION: <name>
|
||||
SCORE: <1-5>
|
||||
RATIONALE: <explanation>
|
||||
@@ -161,6 +176,9 @@ def run_entity_evaluation(
|
||||
) -> BatchSummary:
|
||||
"""Run per-entity evaluation using the batch evaluator.
|
||||
|
||||
Evaluation files are written **incrementally** after each successful
|
||||
result, so a long run is resumable and safe to interrupt.
|
||||
|
||||
Args:
|
||||
config: The infospace configuration.
|
||||
entities: Entities to evaluate.
|
||||
@@ -176,6 +194,25 @@ def run_entity_evaluation(
|
||||
A :class:`BatchSummary` with per-entity results.
|
||||
"""
|
||||
topic = config.topic.name
|
||||
evaluations_path = output_dir or Path(config.evaluations_dir)
|
||||
evaluator_name = (run_config.model_name if run_config else "unknown")
|
||||
|
||||
def _write_and_notify(done: int, total: int, result) -> None:
|
||||
# Write file immediately on success (incremental — run is resumable)
|
||||
if result.status == "success" and result.response is not None:
|
||||
scores = parse_evaluation_response(result.response.content, dimensions)
|
||||
evaluation = EntityEvaluation(
|
||||
entity_slug=result.key,
|
||||
evaluator=evaluator_name,
|
||||
scores=scores,
|
||||
evaluated_at=datetime.utcnow(),
|
||||
)
|
||||
eval_path = evaluations_path / f"{result.key}.md"
|
||||
write_entity_evaluation(evaluation, eval_path)
|
||||
|
||||
if progress_callback is not None:
|
||||
progress_callback(done, total, result)
|
||||
|
||||
items = [
|
||||
BatchItem(
|
||||
key=entity.slug,
|
||||
@@ -189,27 +226,7 @@ def run_entity_evaluation(
|
||||
evaluator = BatchEvaluator(
|
||||
adapter=adapter,
|
||||
config=run_config,
|
||||
progress_callback=progress_callback,
|
||||
progress_callback=_write_and_notify,
|
||||
previous_digests=previous_digests,
|
||||
)
|
||||
summary = evaluator.evaluate(items)
|
||||
|
||||
# Write successful results
|
||||
evaluations_path = output_dir or Path(config.evaluations_dir)
|
||||
evaluator_name = (run_config.model_name if run_config else "unknown")
|
||||
|
||||
for result in summary.results:
|
||||
if result.status != "success" or result.response is None:
|
||||
continue
|
||||
|
||||
scores = parse_evaluation_response(result.response.content, dimensions)
|
||||
evaluation = EntityEvaluation(
|
||||
entity_slug=result.key,
|
||||
evaluator=evaluator_name,
|
||||
scores=scores,
|
||||
evaluated_at=datetime.utcnow(),
|
||||
)
|
||||
eval_path = evaluations_path / f"{result.key}.md"
|
||||
write_entity_evaluation(evaluation, eval_path)
|
||||
|
||||
return summary
|
||||
return evaluator.evaluate(items)
|
||||
|
||||
Reference in New Issue
Block a user