feat(infospace): add eval-summary command and improve evaluate pipeline (S3.3)

- Fix evaluate dimensions to match template file: definition_precision, source_grounding, domain_placement, vsm_relevance, explanatory_value (was domain_relevance, discipline_alignment, conceptual_clarity)
- Add VSM background context to evaluation prompt so LLM can score vsm_relevance without macro injection
- Fix model_name bug: was sending literal "default" to API (HTTP 400)
- Refactor run_entity_evaluation to write files incrementally via callback rather than all at once after the batch — long runs are now resumable if interrupted
- Add incremental skip in CLI: entities with existing eval files are skipped automatically on re-run (acts as resume)
- Add eval-summary command: reads all eval files, shows per-dimension means, optionally writes per_entity_mean to metrics.yaml
- Fix record_check_results to merge rather than overwrite metrics.yaml so per_entity_mean survives subsequent check runs
- Add per_entity_mean viability threshold (min: 3.5) to infospace.yaml

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-23 01:26:45 +01:00
parent 574bb11db6
commit 7f1eecbdb2
7 changed files with 242 additions and 42 deletions
--- a/examples/infospace-with-history/infospace.yaml
+++ b/examples/infospace-with-history/infospace.yaml
@@ -37,6 +37,8 @@ viability:
    max: 0
  granularity_entropy:
    min: 1.0
+  per_entity_mean:
+    min: 3.5  # LLM quality score across 5 dimensions (1-5 scale)

 pipeline:
  stages:
--- a/examples/infospace-with-history/output/metrics/history.yaml
+++ b/examples/infospace-with-history/output/metrics/history.yaml
@@ -934,3 +934,29 @@
    concern: C1
  metadata:
    source: collection-checks
+- snapshot_id: 090bb961
+  created_at: '2026-02-23T00:22:25.818146+00:00'
+  schema_name: default
+  entity_count: 988
+  entity_evaluations: []
+  collection_metrics:
+  - name: coherence_components
+    value: 0.0
+    concern: C3
+  - name: consistency_cycles
+    value: 0.0
+    concern: C4
+  - name: coverage_ratio
+    value: 0.6190476190476191
+    concern: C2
+  - name: granularity_entropy
+    value: 2.6747519428200657
+    concern: C5
+  - name: modularity
+    value: 0.0
+    concern: C3
+  - name: redundancy_ratio
+    value: 0.006072874493927126
+    concern: C1
+  metadata:
+    source: collection-checks
--- a/examples/infospace-with-history/output/metrics/metrics.yaml
+++ b/examples/infospace-with-history/output/metrics/metrics.yaml
@@ -1,6 +1,7 @@
 coherence_components: 0.0
 consistency_cycles: 0.0
-coverage_ratio: 0.442424
-granularity_entropy: 2.953326
+coverage_ratio: 0.619048
+granularity_entropy: 2.674752
 modularity: 0.0
-redundancy_ratio: 0.005877
+per_entity_mean: 4.42
+redundancy_ratio: 0.006073
--- a/examples/infospace-with-history/templates/evaluate-entity.md
+++ b/examples/infospace-with-history/templates/evaluate-entity.md
@@ -0,0 +1,70 @@
+# Evaluate Economic Entity
+
+You are a quality assessor evaluating a single economic entity extracted from
+Adam Smith's *The Wealth of Nations* and mapped to Stafford Beer's Viable
+System Model. Your task is to score the entity on five quality dimensions
+and produce a structured evaluation.
+
+## Entity Under Evaluation
+
+@{entity_content}
+
+## Source Chapter
+
+@{source_chapter}
+
+## VSM Framework Reference
+
+@{vsm_framework}
+
+## Quality Rubric
+
+@{quality_rubric}
+
+## Instructions
+
+1. Read the entity carefully, including its definition, source chapter,
+   context, economic domain, and any VSM mapping information provided.
+2. Locate the relevant passage in the source chapter to verify source grounding.
+3. Consult the VSM framework reference to assess VSM relevance.
+4. Score each dimension 1–5 using the rubric above. Use the full range:
+   reserve 5 for genuinely excellent entries and 1 for clear failures.
+5. For each dimension, write exactly one sentence justifying the score.
+6. Compute the overall score as the mean of the five dimension scores,
+   rounded to two decimal places.
+7. List any flags for issues that warrant attention (empty list if none).
+   Valid flags: `circular-definition`, `missing-citation`, `wrong-domain`,
+   `no-vsm-mapping`, `redundant-with-<slug>`, `overclaimed-strength`,
+   `underclaimed-strength`.
+
+## Output Format
+
+Output YAML front-matter (scores + flags) followed by a markdown section
+with per-dimension justifications. Do not include any other text outside
+this structure.
+
+```
+---
+entity: <slug of the entity, kebab-case>
+scores:
+  definition_precision: <1-5>
+  source_grounding: <1-5>
+  domain_placement: <1-5>
+  vsm_relevance: <1-5>
+  explanatory_value: <1-5>
+overall: <mean rounded to 2 decimal places>
+flags: []
+---
+
+## Justifications
+
+**Definition Precision (<score>/5):** <one sentence>
+
+**Source Grounding (<score>/5):** <one sentence>
+
+**Domain Placement (<score>/5):** <one sentence>
+
+**VSM Relevance (<score>/5):** <one sentence>
+
+**Explanatory Value (<score>/5):** <one sentence>
+```
--- a/markitect/infospace/cli.py
+++ b/markitect/infospace/cli.py
@@ -189,11 +189,26 @@ def evaluate(config_path, provider, model, entity_slug, chapter):
            click.echo(f"No entities found for chapter '{chapter}'.")
            return

+    # Skip entities that already have evaluation files (incremental resume)
+    from markitect.infospace.evaluate import run_entity_evaluation
+    output_dir = root / cfg.evaluations_dir
+    if not entity_slug and not chapter and output_dir.is_dir():
+        previous_digests = {
+            p.stem: ""  # non-empty sentinel → triggers skip in BatchEvaluator
+            for p in output_dir.glob("*.md")
+        }
+        entity_list = [e for e in entity_list if e.slug not in previous_digests]
+        if not entity_list:
+            click.echo("All entities already evaluated. Nothing to do.")
+            return
+        if previous_digests:
+            click.echo(f"Skipping {len(previous_digests)} already-evaluated entities.")
+
    # Create adapter
    from markitect.llm import create_adapter
    from markitect.prompts.execution.models import RunConfig
    adapter = create_adapter(provider, model=model)
-    run_config = RunConfig(model_name=model or "default", temperature=0.3, max_tokens=2000)
+    run_config = RunConfig(model_name=model, temperature=0.3, max_tokens=2000)

    # Progress callback
    def on_progress(done, total, result):
@@ -202,8 +217,6 @@ def evaluate(config_path, provider, model, entity_slug, chapter):

    click.echo(f"Evaluating {len(entity_list)} entities via {provider}...")

-    from markitect.infospace.evaluate import run_entity_evaluation
-    output_dir = root / cfg.evaluations_dir
    summary = run_entity_evaluation(
        config=cfg,
        entities=entity_list,
@@ -218,6 +231,74 @@ def evaluate(config_path, provider, model, entity_slug, chapter):
        click.echo(f"Tokens used: {summary.total_tokens}")


+# ── eval-summary ──────────────────────────────────────────────────────
+
+
+@infospace_commands.command(name="eval-summary")
+@click.option("--config", "config_path", default=None, help="Path to infospace.yaml.")
+@click.option("--update-metrics", is_flag=True, default=False,
+              help="Merge per_entity_mean into metrics.yaml for viability checks.")
+def eval_summary(config_path: Optional[str], update_metrics: bool):
+    """Show aggregate statistics from per-entity evaluation files."""
+    cfg, cfg_path = _load_config_or_exit(config_path)
+    root = cfg_path.parent
+
+    evaluations_dir = root / cfg.evaluations_dir
+    if not evaluations_dir.is_dir():
+        click.echo("No evaluations directory found. Run 'markitect infospace evaluate' first.")
+        return
+
+    from markitect.infospace.evaluation_io import read_entity_evaluation
+
+    eval_files = sorted(evaluations_dir.glob("*.md"))
+    if not eval_files:
+        click.echo("No evaluation files found.")
+        return
+
+    overall_scores: list = []
+    dim_scores: dict = {}
+    failed: list = []
+
+    for ef in eval_files:
+        try:
+            ev = read_entity_evaluation(ef)
+            overall_scores.append(ev.overall_score)
+            for s in ev.scores:
+                dim_scores.setdefault(s.name, []).append(s.value)
+        except Exception as exc:
+            failed.append((ef.stem, str(exc)))
+
+    n = len(overall_scores)
+    if n == 0:
+        click.echo("No evaluations could be read.")
+        return
+
+    mean_overall = sum(overall_scores) / n
+
+    click.echo(f"Evaluation summary — {n} entities evaluated")
+    if failed:
+        click.echo(f"  (failed to read: {len(failed)})")
+    click.echo()
+    click.echo(f"  {'Dimension':<30} {'Mean':>6}")
+    click.echo("  " + "-" * 38)
+    click.echo(f"  {'overall':<30} {mean_overall:>6.3f}")
+    for dim, vals in sorted(dim_scores.items()):
+        click.echo(f"  {dim:<30} {sum(vals)/len(vals):>6.3f}")
+
+    score_min = min(overall_scores)
+    score_max = max(overall_scores)
+    click.echo()
+    click.echo(f"  Range: {score_min:.2f} – {score_max:.2f}")
+
+    if update_metrics:
+        from markitect.infospace.history import read_metrics_file, write_metrics_file
+        metrics_file = root / cfg.metrics_dir / "metrics.yaml"
+        existing = read_metrics_file(metrics_file)
+        existing["per_entity_mean"] = round(mean_overall, 6)
+        write_metrics_file(existing, metrics_file)
+        click.echo(f"\nUpdated metrics.yaml: per_entity_mean = {mean_overall:.4f}")
+
+
 # ── viability ────────────────────────────────────────────────────────


--- a/markitect/infospace/evaluate.py
+++ b/markitect/infospace/evaluate.py
@@ -25,9 +25,9 @@ from markitect.prompts.execution.models import RunConfig
 _DEFAULT_DIMENSIONS = [
    "definition_precision",
    "source_grounding",
-    "domain_relevance",
-    "discipline_alignment",
-    "conceptual_clarity",
+    "domain_placement",
+    "vsm_relevance",
+    "explanatory_value",
 ]

 _PROMPT_TEMPLATE = """\
@@ -45,20 +45,35 @@ You are evaluating an entity from an infospace about "{topic}".
 ### Context
 {context}

+## Background
+
+This infospace maps concepts from the source corpus to Stafford Beer's
+Viable System Model (VSM). The VSM has five systems: S1 (primary operations),
+S2 (coordination/anti-oscillation), S3 (internal regulation/audit),
+S4 (intelligence/environmental adaptation), S5 (identity/policy). Use this
+to assess whether the entity has a natural VSM home.
+
+## Dimensions
+
+- **definition_precision**: Is the definition precise and non-circular? \
+Does it capture a distinct concept rather than a vague umbrella term?
+- **source_grounding**: Is this entity grounded in the actual source text, \
+or does it introduce concepts the source does not clearly state?
+- **domain_placement**: Is the economic/thematic domain assignment correct? \
+Does the entity belong in a different conceptual category?
+- **vsm_relevance**: Does this entity map naturally to one or more VSM \
+systems (S1–S5), or is it VSM-neutral/too abstract to place?
+- **explanatory_value**: Does this entity add genuine explanatory power — \
+illuminating a mechanism or structural relation — or does it merely name \
+a surface phenomenon?
+
 ## Instructions

-Rate this entity on each dimension below using a scale of 1-5 \
-(1 = poor, 5 = excellent). For each dimension, provide:
-1. A numeric score (1-5)
-2. A brief rationale (1-2 sentences)
-
-### Dimensions to evaluate:
-{dimensions_list}
+Rate each dimension 1–5 (1 = poor, 5 = excellent). Provide a brief
+rationale (1–2 sentences) for each score.

 ## Output format

-Return your evaluation as a structured list:
-
 DIMENSION: <name>
 SCORE: <1-5>
 RATIONALE: <explanation>
@@ -161,6 +176,9 @@ def run_entity_evaluation(
 ) -> BatchSummary:
    """Run per-entity evaluation using the batch evaluator.

+    Evaluation files are written **incrementally** after each successful
+    result, so a long run is resumable and safe to interrupt.
+
    Args:
        config: The infospace configuration.
        entities: Entities to evaluate.
@@ -176,6 +194,25 @@ def run_entity_evaluation(
        A :class:`BatchSummary` with per-entity results.
    """
    topic = config.topic.name
+    evaluations_path = output_dir or Path(config.evaluations_dir)
+    evaluator_name = (run_config.model_name if run_config else "unknown")
+
+    def _write_and_notify(done: int, total: int, result) -> None:
+        # Write file immediately on success (incremental — run is resumable)
+        if result.status == "success" and result.response is not None:
+            scores = parse_evaluation_response(result.response.content, dimensions)
+            evaluation = EntityEvaluation(
+                entity_slug=result.key,
+                evaluator=evaluator_name,
+                scores=scores,
+                evaluated_at=datetime.utcnow(),
+            )
+            eval_path = evaluations_path / f"{result.key}.md"
+            write_entity_evaluation(evaluation, eval_path)
+
+        if progress_callback is not None:
+            progress_callback(done, total, result)
+
    items = [
        BatchItem(
            key=entity.slug,
@@ -189,27 +226,7 @@ def run_entity_evaluation(
    evaluator = BatchEvaluator(
        adapter=adapter,
        config=run_config,
-        progress_callback=progress_callback,
+        progress_callback=_write_and_notify,
        previous_digests=previous_digests,
    )
-    summary = evaluator.evaluate(items)
-
-    # Write successful results
-    evaluations_path = output_dir or Path(config.evaluations_dir)
-    evaluator_name = (run_config.model_name if run_config else "unknown")
-
-    for result in summary.results:
-        if result.status != "success" or result.response is None:
-            continue
-
-        scores = parse_evaluation_response(result.response.content, dimensions)
-        evaluation = EntityEvaluation(
-            entity_slug=result.key,
-            evaluator=evaluator_name,
-            scores=scores,
-            evaluated_at=datetime.utcnow(),
-        )
-        eval_path = evaluations_path / f"{result.key}.md"
-        write_entity_evaluation(evaluation, eval_path)
-
-    return summary
+    return evaluator.evaluate(items)
--- a/markitect/infospace/history.py
+++ b/markitect/infospace/history.py
@@ -131,8 +131,11 @@ def record_check_results(
    metrics_dir = root / config.metrics_dir
    metrics = check_report.metrics()

-    # Save latest metrics
-    write_metrics_file(metrics, metrics_dir / "metrics.yaml")
+    # Save latest metrics — merge with existing so other metric sources
+    # (e.g. per-entity evaluation summary) are preserved across check runs.
+    existing = read_metrics_file(metrics_dir / "metrics.yaml")
+    merged = {**existing, **metrics}  # check results overwrite on key conflict
+    write_metrics_file(merged, metrics_dir / "metrics.yaml")

    # Create and append snapshot
    snapshot = snapshot_from_checks(