feat(infospace): add eval-summary command and improve evaluate pipeline (S3.3)

- Fix evaluate dimensions to match template file: definition_precision, source_grounding, domain_placement, vsm_relevance, explanatory_value (was domain_relevance, discipline_alignment, conceptual_clarity)
- Add VSM background context to evaluation prompt so LLM can score vsm_relevance without macro injection
- Fix model_name bug: was sending literal "default" to API (HTTP 400)
- Refactor run_entity_evaluation to write files incrementally via callback rather than all at once after the batch — long runs are now resumable if interrupted
- Add incremental skip in CLI: entities with existing eval files are skipped automatically on re-run (acts as resume)
- Add eval-summary command: reads all eval files, shows per-dimension means, optionally writes per_entity_mean to metrics.yaml
- Fix record_check_results to merge rather than overwrite metrics.yaml so per_entity_mean survives subsequent check runs
- Add per_entity_mean viability threshold (min: 3.5) to infospace.yaml

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-23 01:26:45 +01:00
parent 574bb11db6
commit 7f1eecbdb2
7 changed files with 242 additions and 42 deletions
--- a/examples/infospace-with-history/infospace.yaml
+++ b/examples/infospace-with-history/infospace.yaml
@@ -37,6 +37,8 @@ viability:
    max: 0
  granularity_entropy:
    min: 1.0
  per_entity_mean:
    min: 3.5  # LLM quality score across 5 dimensions (1-5 scale)
 pipeline:
  stages:
--- a/examples/infospace-with-history/output/metrics/history.yaml
+++ b/examples/infospace-with-history/output/metrics/history.yaml
@@ -934,3 +934,29 @@
    concern: C1
  metadata:
    source: collection-checks
 - snapshot_id: 090bb961
  created_at: '2026-02-23T00:22:25.818146+00:00'
  schema_name: default
  entity_count: 988
  entity_evaluations: []
  collection_metrics:
  - name: coherence_components
    value: 0.0
    concern: C3
  - name: consistency_cycles
    value: 0.0
    concern: C4
  - name: coverage_ratio
    value: 0.6190476190476191
    concern: C2
  - name: granularity_entropy
    value: 2.6747519428200657
    concern: C5
  - name: modularity
    value: 0.0
    concern: C3
  - name: redundancy_ratio
    value: 0.006072874493927126
    concern: C1
  metadata:
    source: collection-checks
--- a/examples/infospace-with-history/output/metrics/metrics.yaml
+++ b/examples/infospace-with-history/output/metrics/metrics.yaml
@@ -1,6 +1,7 @@
 coherence_components: 0.0
 consistency_cycles: 0.0
-coverage_ratio: 0.442424
+coverage_ratio: 0.619048
-granularity_entropy: 2.953326
+granularity_entropy: 2.674752
 modularity: 0.0
-redundancy_ratio: 0.005877
+per_entity_mean: 4.42
 redundancy_ratio: 0.006073
--- a/examples/infospace-with-history/templates/evaluate-entity.md
+++ b/examples/infospace-with-history/templates/evaluate-entity.md
@@ -0,0 +1,70 @@
 # Evaluate Economic Entity
 You are a quality assessor evaluating a single economic entity extracted from
 Adam Smith's *The Wealth of Nations* and mapped to Stafford Beer's Viable
 System Model. Your task is to score the entity on five quality dimensions
 and produce a structured evaluation.
 ## Entity Under Evaluation
@{entity_content}
 ## Source Chapter
@{source_chapter}
 ## VSM Framework Reference
@{vsm_framework}
 ## Quality Rubric
@{quality_rubric}
 ## Instructions
 1. Read the entity carefully, including its definition, source chapter,
   context, economic domain, and any VSM mapping information provided.
 2. Locate the relevant passage in the source chapter to verify source grounding.
 3. Consult the VSM framework reference to assess VSM relevance.
 4. Score each dimension 1–5 using the rubric above. Use the full range:
   reserve 5 for genuinely excellent entries and 1 for clear failures.
 5. For each dimension, write exactly one sentence justifying the score.
 6. Compute the overall score as the mean of the five dimension scores,
   rounded to two decimal places.
 7. List any flags for issues that warrant attention (empty list if none).
   Valid flags: `circular-definition`, `missing-citation`, `wrong-domain`,
   `no-vsm-mapping`, `redundant-with-<slug>`, `overclaimed-strength`,
   `underclaimed-strength`.
 ## Output Format
 Output YAML front-matter (scores + flags) followed by a markdown section
 with per-dimension justifications. Do not include any other text outside
 this structure.
 ```
 ---
 entity: <slug of the entity, kebab-case>
 scores:
  definition_precision: <1-5>
  source_grounding: <1-5>
  domain_placement: <1-5>
  vsm_relevance: <1-5>
  explanatory_value: <1-5>
 overall: <mean rounded to 2 decimal places>
 flags: []
 ---
 ## Justifications
 **Definition Precision (<score>/5):** <one sentence>
 **Source Grounding (<score>/5):** <one sentence>
 **Domain Placement (<score>/5):** <one sentence>
 **VSM Relevance (<score>/5):** <one sentence>
 **Explanatory Value (<score>/5):** <one sentence>
 ```
--- a/markitect/infospace/cli.py
+++ b/markitect/infospace/cli.py
@@ -189,11 +189,26 @@ def evaluate(config_path, provider, model, entity_slug, chapter):
            click.echo(f"No entities found for chapter '{chapter}'.")
            return
    # Skip entities that already have evaluation files (incremental resume)
    from markitect.infospace.evaluate import run_entity_evaluation
    output_dir = root / cfg.evaluations_dir
    if not entity_slug and not chapter and output_dir.is_dir():
        previous_digests = {
            p.stem: ""  # placeholder value (empty string, not a real digest); only the keys are used by the skip filter below
            for p in output_dir.glob("*.md")
        }
        entity_list = [e for e in entity_list if e.slug not in previous_digests]
        if not entity_list:
            click.echo("All entities already evaluated. Nothing to do.")
            return
        if previous_digests:
            click.echo(f"Skipping {len(previous_digests)} already-evaluated entities.")
    # Create adapter
    from markitect.llm import create_adapter
    from markitect.prompts.execution.models import RunConfig
    adapter = create_adapter(provider, model=model)
-    run_config = RunConfig(model_name=model or "default", temperature=0.3, max_tokens=2000)
+    run_config = RunConfig(model_name=model, temperature=0.3, max_tokens=2000)
    # Progress callback
    def on_progress(done, total, result):
@@ -202,8 +217,6 @@ def evaluate(config_path, provider, model, entity_slug, chapter):
    click.echo(f"Evaluating {len(entity_list)} entities via {provider}...")
    from markitect.infospace.evaluate import run_entity_evaluation
    output_dir = root / cfg.evaluations_dir
    summary = run_entity_evaluation(
        config=cfg,
        entities=entity_list,
@@ -218,6 +231,74 @@ def evaluate(config_path, provider, model, entity_slug, chapter):
        click.echo(f"Tokens used: {summary.total_tokens}")
 # ── eval-summary ──────────────────────────────────────────────────────
@infospace_commands.command(name="eval-summary")
@click.option("--config", "config_path", default=None, help="Path to infospace.yaml.")
@click.option("--update-metrics", is_flag=True, default=False,
              help="Merge per_entity_mean into metrics.yaml for viability checks.")
def eval_summary(config_path: Optional[str], update_metrics: bool):
    """Show aggregate statistics from per-entity evaluation files.

    Reads every ``*.md`` file in the configured evaluations directory and
    prints the mean overall score, the mean of each scoring dimension and the
    range of the overall scores.  A file that cannot be read is counted as
    failed, its error is reported on stderr, and it contributes nothing to
    the statistics.

    Args:
        config_path: Path to ``infospace.yaml``; passed unchanged to
            ``_load_config_or_exit``.
        update_metrics: When true, store the mean overall score under
            ``per_entity_mean`` in ``metrics.yaml``, keeping the entries
            already present in that file.
    """
    cfg, cfg_path = _load_config_or_exit(config_path)
    root = cfg_path.parent
    evaluations_dir = root / cfg.evaluations_dir
    if not evaluations_dir.is_dir():
        click.echo("No evaluations directory found. Run 'markitect infospace evaluate' first.")
        return

    from markitect.infospace.evaluation_io import read_entity_evaluation

    eval_files = sorted(evaluations_dir.glob("*.md"))
    if not eval_files:
        click.echo("No evaluation files found.")
        return

    overall_scores: list = []
    dim_scores: dict = {}
    failed: list = []
    for ef in eval_files:
        # Stage the values of one file first so that a failure halfway
        # through never leaves a partially counted entity in the totals.
        try:
            ev = read_entity_evaluation(ef)
            overall = ev.overall_score
            per_dimension = [(s.name, s.value) for s in ev.scores]
        except Exception as exc:  # best effort: one bad file must not abort the summary
            failed.append((ef.stem, str(exc)))
            continue
        overall_scores.append(overall)
        for dim_name, dim_value in per_dimension:
            dim_scores.setdefault(dim_name, []).append(dim_value)

    def _report_failures() -> None:
        # Diagnostics go to stderr so the summary table on stdout stays clean.
        for slug, reason in failed:
            click.echo(f"  could not read {slug}: {reason}", err=True)

    n = len(overall_scores)
    if n == 0:
        click.echo("No evaluations could be read.")
        _report_failures()
        return

    mean_overall = sum(overall_scores) / n
    click.echo(f"Evaluation summary — {n} entities evaluated")
    if failed:
        click.echo(f"  (failed to read: {len(failed)})")
        _report_failures()
    click.echo()
    click.echo(f"  {'Dimension':<30} {'Mean':>6}")
    click.echo("  " + "-" * 38)
    click.echo(f"  {'overall':<30} {mean_overall:>6.3f}")
    for dim, vals in sorted(dim_scores.items()):
        click.echo(f"  {dim:<30} {sum(vals)/len(vals):>6.3f}")

    score_min = min(overall_scores)
    score_max = max(overall_scores)
    click.echo()
    click.echo(f"  Range: {score_min:.2f} – {score_max:.2f}")

    if update_metrics:
        from markitect.infospace.history import read_metrics_file, write_metrics_file

        metrics_file = root / cfg.metrics_dir / "metrics.yaml"
        # Read-modify-write so metrics produced by other commands survive.
        existing = read_metrics_file(metrics_file)
        existing["per_entity_mean"] = round(mean_overall, 6)
        write_metrics_file(existing, metrics_file)
        click.echo(f"\nUpdated metrics.yaml: per_entity_mean = {mean_overall:.4f}")
 # ── viability ────────────────────────────────────────────────────────
--- a/markitect/infospace/evaluate.py
+++ b/markitect/infospace/evaluate.py
@@ -25,9 +25,9 @@ from markitect.prompts.execution.models import RunConfig
 _DEFAULT_DIMENSIONS = [
    "definition_precision",
    "source_grounding",
-    "domain_relevance",
+    "domain_placement",
-    "discipline_alignment",
+    "vsm_relevance",
-    "conceptual_clarity",
+    "explanatory_value",
 ]
 _PROMPT_TEMPLATE = """\
@@ -45,20 +45,35 @@ You are evaluating an entity from an infospace about "{topic}".
 ### Context
 {context}
 ## Background
 This infospace maps concepts from the source corpus to Stafford Beer's
 Viable System Model (VSM). The VSM has five systems: S1 (primary operations),
 S2 (coordination/anti-oscillation), S3 (internal regulation/audit),
 S4 (intelligence/environmental adaptation), S5 (identity/policy). Use this
 to assess whether the entity has a natural VSM home.
 ## Dimensions
 - **definition_precision**: Is the definition precise and non-circular? \
 Does it capture a distinct concept rather than a vague umbrella term?
 - **source_grounding**: Is this entity grounded in the actual source text, \
 or does it introduce concepts the source does not clearly state?
 - **domain_placement**: Is the economic/thematic domain assignment correct? \
 Does the entity belong in a different conceptual category?
 - **vsm_relevance**: Does this entity map naturally to one or more VSM \
 systems (S1–S5), or is it VSM-neutral/too abstract to place?
 - **explanatory_value**: Does this entity add genuine explanatory power — \
 illuminating a mechanism or structural relation — or does it merely name \
 a surface phenomenon?
 ## Instructions
-Rate this entity on each dimension below using a scale of 1-5 \
+Rate each dimension 1–5 (1 = poor, 5 = excellent). Provide a brief
-(1 = poor, 5 = excellent). For each dimension, provide:
+rationale (1–2 sentences) for each score.
 1. A numeric score (1-5)
 2. A brief rationale (1-2 sentences)
 ### Dimensions to evaluate:
 {dimensions_list}
 ## Output format
 Return your evaluation as a structured list:
 DIMENSION: <name>
 SCORE: <1-5>
 RATIONALE: <explanation>
@@ -161,6 +176,9 @@ def run_entity_evaluation(
 ) -> BatchSummary:
    """Run per-entity evaluation using the batch evaluator.
    Evaluation files are written **incrementally** after each successful
    result, so a long run is resumable and safe to interrupt.
    Args:
        config: The infospace configuration.
        entities: Entities to evaluate.
@@ -176,6 +194,25 @@ def run_entity_evaluation(
        A :class:`BatchSummary` with per-entity results.
    """
    topic = config.topic.name
    evaluations_path = output_dir or Path(config.evaluations_dir)
    evaluator_name = (run_config.model_name if run_config else "unknown")
    def _write_and_notify(done: int, total: int, result) -> None:
        """Persist a successful result right away, then forward the progress event."""
        succeeded = result.status == "success" and result.response is not None
        if succeeded:
            # Writing per result (instead of after the whole batch) keeps
            # finished evaluations on disk if the run is interrupted.
            write_entity_evaluation(
                EntityEvaluation(
                    entity_slug=result.key,
                    evaluator=evaluator_name,
                    scores=parse_evaluation_response(result.response.content, dimensions),
                    evaluated_at=datetime.utcnow(),
                ),
                evaluations_path / f"{result.key}.md",
            )
        if progress_callback is None:
            return
        progress_callback(done, total, result)
    items = [
        BatchItem(
            key=entity.slug,
@@ -189,27 +226,7 @@ def run_entity_evaluation(
    evaluator = BatchEvaluator(
        adapter=adapter,
        config=run_config,
-        progress_callback=progress_callback,
+        progress_callback=_write_and_notify,
        previous_digests=previous_digests,
    )
-    summary = evaluator.evaluate(items)
+    return evaluator.evaluate(items)
    # Write successful results
    evaluations_path = output_dir or Path(config.evaluations_dir)
    evaluator_name = (run_config.model_name if run_config else "unknown")
    for result in summary.results:
        if result.status != "success" or result.response is None:
            continue
        scores = parse_evaluation_response(result.response.content, dimensions)
        evaluation = EntityEvaluation(
            entity_slug=result.key,
            evaluator=evaluator_name,
            scores=scores,
            evaluated_at=datetime.utcnow(),
        )
        eval_path = evaluations_path / f"{result.key}.md"
        write_entity_evaluation(evaluation, eval_path)
    return summary
--- a/markitect/infospace/history.py
+++ b/markitect/infospace/history.py
@@ -131,8 +131,11 @@ def record_check_results(
    metrics_dir = root / config.metrics_dir
    metrics = check_report.metrics()
-    # Save latest metrics
+    # Save latest metrics — merge with existing so other metric sources
-    write_metrics_file(metrics, metrics_dir / "metrics.yaml")
+    # (e.g. per-entity evaluation summary) are preserved across check runs.
    existing = read_metrics_file(metrics_dir / "metrics.yaml")
    merged = {**existing, **metrics}  # check results overwrite on key conflict
    write_metrics_file(merged, metrics_dir / "metrics.yaml")
    # Create and append snapshot
    snapshot = snapshot_from_checks(