feat(infospace): add eval-summary command and improve evaluate pipeline (S3.3)
- Fix evaluate dimensions to match template file: definition_precision, source_grounding, domain_placement, vsm_relevance, explanatory_value (was domain_relevance, discipline_alignment, conceptual_clarity)
- Add VSM background context to evaluation prompt so LLM can score vsm_relevance without macro injection
- Fix model_name bug: was sending literal "default" to API (HTTP 400)
- Refactor run_entity_evaluation to write files incrementally via callback rather than all at once after the batch — long runs are now resumable if interrupted
- Add incremental skip in CLI: entities with existing eval files are skipped automatically on re-run (acts as resume)
- Add eval-summary command: reads all eval files, shows per-dimension means, optionally writes per_entity_mean to metrics.yaml
- Fix record_check_results to merge rather than overwrite metrics.yaml so per_entity_mean survives subsequent check runs
- Add per_entity_mean viability threshold (min: 3.5) to infospace.yaml

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -189,11 +189,26 @@ def evaluate(config_path, provider, model, entity_slug, chapter):
|
||||
click.echo(f"No entities found for chapter '{chapter}'.")
|
||||
return
|
||||
|
||||
# Skip entities that already have evaluation files (incremental resume)
|
||||
from markitect.infospace.evaluate import run_entity_evaluation
|
||||
output_dir = root / cfg.evaluations_dir
|
||||
if not entity_slug and not chapter and output_dir.is_dir():
|
||||
previous_digests = {
|
||||
p.stem: "" # non-empty sentinel → triggers skip in BatchEvaluator
|
||||
for p in output_dir.glob("*.md")
|
||||
}
|
||||
entity_list = [e for e in entity_list if e.slug not in previous_digests]
|
||||
if not entity_list:
|
||||
click.echo("All entities already evaluated. Nothing to do.")
|
||||
return
|
||||
if previous_digests:
|
||||
click.echo(f"Skipping {len(previous_digests)} already-evaluated entities.")
|
||||
|
||||
# Create adapter
|
||||
from markitect.llm import create_adapter
|
||||
from markitect.prompts.execution.models import RunConfig
|
||||
adapter = create_adapter(provider, model=model)
|
||||
run_config = RunConfig(model_name=model or "default", temperature=0.3, max_tokens=2000)
|
||||
run_config = RunConfig(model_name=model, temperature=0.3, max_tokens=2000)
|
||||
|
||||
# Progress callback
|
||||
def on_progress(done, total, result):
|
||||
@@ -202,8 +217,6 @@ def evaluate(config_path, provider, model, entity_slug, chapter):
|
||||
|
||||
click.echo(f"Evaluating {len(entity_list)} entities via {provider}...")
|
||||
|
||||
from markitect.infospace.evaluate import run_entity_evaluation
|
||||
output_dir = root / cfg.evaluations_dir
|
||||
summary = run_entity_evaluation(
|
||||
config=cfg,
|
||||
entities=entity_list,
|
||||
@@ -218,6 +231,74 @@ def evaluate(config_path, provider, model, entity_slug, chapter):
|
||||
click.echo(f"Tokens used: {summary.total_tokens}")
|
||||
|
||||
|
||||
# ── eval-summary ──────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _echo_eval_read_failures(failed):
    """Print one line per unreadable evaluation file to stderr."""
    for stem, reason in failed:
        click.echo(f"  could not read {stem}: {reason}", err=True)


# NOTE: click shows the function docstring as the command's --help text, so the
# docstring is kept to its original single line; details are in the comments.
@infospace_commands.command(name="eval-summary")
@click.option("--config", "config_path", default=None, help="Path to infospace.yaml.")
@click.option("--update-metrics", is_flag=True, default=False,
              help="Merge per_entity_mean into metrics.yaml for viability checks.")
def eval_summary(config_path: Optional[str], update_metrics: bool):
    """Show aggregate statistics from per-entity evaluation files."""
    # config_path:    path to infospace.yaml, or None to let
    #                 _load_config_or_exit decide.
    # update_metrics: when set, the overall mean is written to metrics.yaml
    #                 under the "per_entity_mean" key, keeping existing keys.
    cfg, cfg_path = _load_config_or_exit(config_path)
    root = cfg_path.parent

    evaluations_dir = root / cfg.evaluations_dir
    if not evaluations_dir.is_dir():
        click.echo("No evaluations directory found. Run 'markitect infospace evaluate' first.")
        return

    from markitect.infospace.evaluation_io import read_entity_evaluation

    # Sorted so that the processing order (and failure listing) is stable.
    eval_files = sorted(evaluations_dir.glob("*.md"))
    if not eval_files:
        click.echo("No evaluation files found.")
        return

    overall_scores: list = []
    dim_scores: dict = {}
    failed: list = []

    for ef in eval_files:
        try:
            ev = read_entity_evaluation(ef)
            # Stage everything first: a file that breaks halfway through must
            # not leave partial values in the aggregates.
            overall = ev.overall_score
            per_dimension = [(s.name, s.value) for s in ev.scores]
        except Exception as exc:
            # Deliberately broad: one malformed file should not abort the
            # summary. The reason is kept and reported below.
            failed.append((ef.stem, str(exc)))
            continue

        overall_scores.append(overall)
        for dim_name, dim_value in per_dimension:
            dim_scores.setdefault(dim_name, []).append(dim_value)

    n = len(overall_scores)
    if n == 0:
        click.echo("No evaluations could be read.")
        _echo_eval_read_failures(failed)
        return

    mean_overall = sum(overall_scores) / n

    click.echo(f"Evaluation summary — {n} entities evaluated")
    if failed:
        click.echo(f" (failed to read: {len(failed)})")
        # Details go to stderr so the stdout report keeps its previous shape.
        _echo_eval_read_failures(failed)
    click.echo()
    click.echo(f" {'Dimension':<30} {'Mean':>6}")
    click.echo(" " + "-" * 38)
    click.echo(f" {'overall':<30} {mean_overall:>6.3f}")
    for dim, vals in sorted(dim_scores.items()):
        click.echo(f" {dim:<30} {sum(vals)/len(vals):>6.3f}")

    score_min = min(overall_scores)
    score_max = max(overall_scores)
    click.echo()
    click.echo(f" Range: {score_min:.2f} – {score_max:.2f}")

    if update_metrics:
        from markitect.infospace.history import read_metrics_file, write_metrics_file

        metrics_file = root / cfg.metrics_dir / "metrics.yaml"
        # Read-modify-write so keys already present in metrics.yaml survive.
        existing = read_metrics_file(metrics_file)
        existing["per_entity_mean"] = round(mean_overall, 6)
        write_metrics_file(existing, metrics_file)
        click.echo(f"\nUpdated metrics.yaml: per_entity_mean = {mean_overall:.4f}")
|
||||
|
||||
|
||||
# ── viability ────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user