feat(infospace): add eval-summary command and improve evaluate pipeline (S3.3)

- Fix evaluate dimensions to match template file: definition_precision,
  source_grounding, domain_placement, vsm_relevance, explanatory_value
  (was domain_relevance, discipline_alignment, conceptual_clarity)
- Add VSM background context to evaluation prompt so LLM can score
  vsm_relevance without macro injection
- Fix model_name bug: was sending literal "default" to API (HTTP 400)
- Refactor run_entity_evaluation to write files incrementally via callback
  rather than all at once after the batch — long runs are now resumable if
  interrupted
- Add incremental skip in CLI: entities with existing eval files are skipped
  automatically on re-run (acts as resume)
- Add eval-summary command: reads all eval files, shows per-dimension means,
  optionally writes per_entity_mean to metrics.yaml
- Fix record_check_results to merge rather than overwrite metrics.yaml so
  per_entity_mean survives subsequent check runs
- Add per_entity_mean viability threshold (min: 3.5) to infospace.yaml

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-23 01:26:45 +01:00
parent 574bb11db6
commit 7f1eecbdb2
7 changed files with 242 additions and 42 deletions
--- a/markitect/infospace/evaluate.py
+++ b/markitect/infospace/evaluate.py
@@ -25,9 +25,9 @@ from markitect.prompts.execution.models import RunConfig
 _DEFAULT_DIMENSIONS = [
    "definition_precision",
    "source_grounding",
-    "domain_relevance",
-    "discipline_alignment",
-    "conceptual_clarity",
+    "domain_placement",
+    "vsm_relevance",
+    "explanatory_value",
 ]

 _PROMPT_TEMPLATE = """\
@@ -45,20 +45,35 @@ You are evaluating an entity from an infospace about "{topic}".
 ### Context
 {context}

+## Background
+
+This infospace maps concepts from the source corpus to Stafford Beer's
+Viable System Model (VSM). The VSM has five systems: S1 (primary operations),
+S2 (coordination/anti-oscillation), S3 (internal regulation/audit),
+S4 (intelligence/environmental adaptation), S5 (identity/policy). Use this
+to assess whether the entity has a natural VSM home.
+
+## Dimensions
+
+- **definition_precision**: Is the definition precise and non-circular? \
+Does it capture a distinct concept rather than a vague umbrella term?
+- **source_grounding**: Is this entity grounded in the actual source text, \
+or does it introduce concepts the source does not clearly state?
+- **domain_placement**: Is the economic/thematic domain assignment correct? \
+Does the entity belong in a different conceptual category?
+- **vsm_relevance**: Does this entity map naturally to one or more VSM \
+systems (S1–S5), or is it VSM-neutral/too abstract to place?
+- **explanatory_value**: Does this entity add genuine explanatory power — \
+illuminating a mechanism or structural relation — or does it merely name \
+a surface phenomenon?
+
 ## Instructions

-Rate this entity on each dimension below using a scale of 1-5 \
-(1 = poor, 5 = excellent). For each dimension, provide:
-1. A numeric score (1-5)
-2. A brief rationale (1-2 sentences)
-
-### Dimensions to evaluate:
-{dimensions_list}
+Rate each dimension 1–5 (1 = poor, 5 = excellent). Provide a brief
+rationale (1–2 sentences) for each score.

 ## Output format

-Return your evaluation as a structured list:
-
 DIMENSION: <name>
 SCORE: <1-5>
 RATIONALE: <explanation>
@@ -161,6 +176,9 @@ def run_entity_evaluation(
 ) -> BatchSummary:
    """Run per-entity evaluation using the batch evaluator.

+    Evaluation files are written **incrementally** after each successful
+    result, so a long run is resumable and safe to interrupt.
+
    Args:
        config: The infospace configuration.
        entities: Entities to evaluate.
@@ -176,6 +194,25 @@ def run_entity_evaluation(
        A :class:`BatchSummary` with per-entity results.
    """
    topic = config.topic.name
+    evaluations_path = output_dir or Path(config.evaluations_dir)
+    evaluator_name = (run_config.model_name if run_config else "unknown")
+
+    def _write_and_notify(done: int, total: int, result) -> None:
+        # Write file immediately on success (incremental — run is resumable)
+        if result.status == "success" and result.response is not None:
+            scores = parse_evaluation_response(result.response.content, dimensions)
+            evaluation = EntityEvaluation(
+                entity_slug=result.key,
+                evaluator=evaluator_name,
+                scores=scores,
+                evaluated_at=datetime.utcnow(),
+            )
+            eval_path = evaluations_path / f"{result.key}.md"
+            write_entity_evaluation(evaluation, eval_path)
+
+        if progress_callback is not None:
+            progress_callback(done, total, result)
+
    items = [
        BatchItem(
            key=entity.slug,
@@ -189,27 +226,7 @@ def run_entity_evaluation(
    evaluator = BatchEvaluator(
        adapter=adapter,
        config=run_config,
-        progress_callback=progress_callback,
+        progress_callback=_write_and_notify,
        previous_digests=previous_digests,
    )
-    summary = evaluator.evaluate(items)
-
-    # Write successful results
-    evaluations_path = output_dir or Path(config.evaluations_dir)
-    evaluator_name = (run_config.model_name if run_config else "unknown")
-
-    for result in summary.results:
-        if result.status != "success" or result.response is None:
-            continue
-
-        scores = parse_evaluation_response(result.response.content, dimensions)
-        evaluation = EntityEvaluation(
-            entity_slug=result.key,
-            evaluator=evaluator_name,
-            scores=scores,
-            evaluated_at=datetime.utcnow(),
-        )
-        eval_path = evaluations_path / f"{result.key}.md"
-        write_entity_evaluation(evaluation, eval_path)
-
-    return summary
+    return evaluator.evaluate(items)