diff --git a/examples/infospace-with-history/infospace.yaml b/examples/infospace-with-history/infospace.yaml index 84396a4f..882931f6 100644 --- a/examples/infospace-with-history/infospace.yaml +++ b/examples/infospace-with-history/infospace.yaml @@ -37,6 +37,8 @@ viability: max: 0 granularity_entropy: min: 1.0 + per_entity_mean: + min: 3.5 # LLM quality score across 5 dimensions (1-5 scale) pipeline: stages: diff --git a/examples/infospace-with-history/output/metrics/history.yaml b/examples/infospace-with-history/output/metrics/history.yaml index fa4ce6af..f85b1784 100644 --- a/examples/infospace-with-history/output/metrics/history.yaml +++ b/examples/infospace-with-history/output/metrics/history.yaml @@ -934,3 +934,29 @@ concern: C1 metadata: source: collection-checks +- snapshot_id: 090bb961 + created_at: '2026-02-23T00:22:25.818146+00:00' + schema_name: default + entity_count: 988 + entity_evaluations: [] + collection_metrics: + - name: coherence_components + value: 0.0 + concern: C3 + - name: consistency_cycles + value: 0.0 + concern: C4 + - name: coverage_ratio + value: 0.6190476190476191 + concern: C2 + - name: granularity_entropy + value: 2.6747519428200657 + concern: C5 + - name: modularity + value: 0.0 + concern: C3 + - name: redundancy_ratio + value: 0.006072874493927126 + concern: C1 + metadata: + source: collection-checks diff --git a/examples/infospace-with-history/output/metrics/metrics.yaml b/examples/infospace-with-history/output/metrics/metrics.yaml index 4aa69657..2d3ceb48 100644 --- a/examples/infospace-with-history/output/metrics/metrics.yaml +++ b/examples/infospace-with-history/output/metrics/metrics.yaml @@ -1,6 +1,7 @@ coherence_components: 0.0 consistency_cycles: 0.0 -coverage_ratio: 0.442424 -granularity_entropy: 2.953326 +coverage_ratio: 0.619048 +granularity_entropy: 2.674752 modularity: 0.0 -redundancy_ratio: 0.005877 +per_entity_mean: 4.42 +redundancy_ratio: 0.006073 diff --git 
a/examples/infospace-with-history/templates/evaluate-entity.md b/examples/infospace-with-history/templates/evaluate-entity.md new file mode 100644 index 00000000..7eda6c37 --- /dev/null +++ b/examples/infospace-with-history/templates/evaluate-entity.md @@ -0,0 +1,70 @@ +# Evaluate Economic Entity + +You are a quality assessor evaluating a single economic entity extracted from +Adam Smith's *The Wealth of Nations* and mapped to Stafford Beer's Viable +System Model. Your task is to score the entity on five quality dimensions +and produce a structured evaluation. + +## Entity Under Evaluation + +@{entity_content} + +## Source Chapter + +@{source_chapter} + +## VSM Framework Reference + +@{vsm_framework} + +## Quality Rubric + +@{quality_rubric} + +## Instructions + +1. Read the entity carefully, including its definition, source chapter, + context, economic domain, and any VSM mapping information provided. +2. Locate the relevant passage in the source chapter to verify source grounding. +3. Consult the VSM framework reference to assess VSM relevance. +4. Score each dimension 1–5 using the rubric above. Use the full range: + reserve 5 for genuinely excellent entries and 1 for clear failures. +5. For each dimension, write exactly one sentence justifying the score. +6. Compute the overall score as the mean of the five dimension scores, + rounded to two decimal places. +7. List any flags for issues that warrant attention (empty list if none). + Valid flags: `circular-definition`, `missing-citation`, `wrong-domain`, + `no-vsm-mapping`, `redundant-with-<slug>`, `overclaimed-strength`, + `underclaimed-strength`. + +## Output Format + +Output YAML front-matter (scores + flags) followed by a markdown section +with per-dimension justifications. Do not include any other text outside +this structure. 
+ +``` +--- +entity: <entity-slug> +scores: + definition_precision: <1-5> + source_grounding: <1-5> + domain_placement: <1-5> + vsm_relevance: <1-5> + explanatory_value: <1-5> +overall: <mean of the five scores, two decimal places> +flags: [] +--- + +## Justifications + +**Definition Precision (<score>/5):** <one sentence> + +**Source Grounding (<score>/5):** <one sentence> + +**Domain Placement (<score>/5):** <one sentence> + +**VSM Relevance (<score>/5):** <one sentence> + +**Explanatory Value (<score>/5):** <one sentence> +``` diff --git a/markitect/infospace/cli.py b/markitect/infospace/cli.py index 6f72571d..6b3f25a0 100644 --- a/markitect/infospace/cli.py +++ b/markitect/infospace/cli.py @@ -189,11 +189,26 @@ def evaluate(config_path, provider, model, entity_slug, chapter): click.echo(f"No entities found for chapter '{chapter}'.") return + # Skip entities that already have evaluation files (incremental resume) + from markitect.infospace.evaluate import run_entity_evaluation + output_dir = root / cfg.evaluations_dir + if not entity_slug and not chapter and output_dir.is_dir(): + previous_digests = { + p.stem: "" # non-empty sentinel → triggers skip in BatchEvaluator + for p in output_dir.glob("*.md") + } + entity_list = [e for e in entity_list if e.slug not in previous_digests] + if not entity_list: + click.echo("All entities already evaluated. 
Nothing to do.") + return + if previous_digests: + click.echo(f"Skipping {len(previous_digests)} already-evaluated entities.") + # Create adapter from markitect.llm import create_adapter from markitect.prompts.execution.models import RunConfig adapter = create_adapter(provider, model=model) - run_config = RunConfig(model_name=model or "default", temperature=0.3, max_tokens=2000) + run_config = RunConfig(model_name=model, temperature=0.3, max_tokens=2000) # Progress callback def on_progress(done, total, result): @@ -202,8 +217,6 @@ def evaluate(config_path, provider, model, entity_slug, chapter): click.echo(f"Evaluating {len(entity_list)} entities via {provider}...") - from markitect.infospace.evaluate import run_entity_evaluation - output_dir = root / cfg.evaluations_dir summary = run_entity_evaluation( config=cfg, entities=entity_list, @@ -218,6 +231,74 @@ def evaluate(config_path, provider, model, entity_slug, chapter): click.echo(f"Tokens used: {summary.total_tokens}") +# ── eval-summary ────────────────────────────────────────────────────── + + +@infospace_commands.command(name="eval-summary") +@click.option("--config", "config_path", default=None, help="Path to infospace.yaml.") +@click.option("--update-metrics", is_flag=True, default=False, + help="Merge per_entity_mean into metrics.yaml for viability checks.") +def eval_summary(config_path: Optional[str], update_metrics: bool): + """Show aggregate statistics from per-entity evaluation files.""" + cfg, cfg_path = _load_config_or_exit(config_path) + root = cfg_path.parent + + evaluations_dir = root / cfg.evaluations_dir + if not evaluations_dir.is_dir(): + click.echo("No evaluations directory found. 
Run 'markitect infospace evaluate' first.") + return + + from markitect.infospace.evaluation_io import read_entity_evaluation + + eval_files = sorted(evaluations_dir.glob("*.md")) + if not eval_files: + click.echo("No evaluation files found.") + return + + overall_scores: list = [] + dim_scores: dict = {} + failed: list = [] + + for ef in eval_files: + try: + ev = read_entity_evaluation(ef) + overall_scores.append(ev.overall_score) + for s in ev.scores: + dim_scores.setdefault(s.name, []).append(s.value) + except Exception as exc: + failed.append((ef.stem, str(exc))) + + n = len(overall_scores) + if n == 0: + click.echo("No evaluations could be read.") + return + + mean_overall = sum(overall_scores) / n + + click.echo(f"Evaluation summary — {n} entities evaluated") + if failed: + click.echo(f" (failed to read: {len(failed)})") + click.echo() + click.echo(f" {'Dimension':<30} {'Mean':>6}") + click.echo(" " + "-" * 38) + click.echo(f" {'overall':<30} {mean_overall:>6.3f}") + for dim, vals in sorted(dim_scores.items()): + click.echo(f" {dim:<30} {sum(vals)/len(vals):>6.3f}") + + score_min = min(overall_scores) + score_max = max(overall_scores) + click.echo() + click.echo(f" Range: {score_min:.2f} – {score_max:.2f}") + + if update_metrics: + from markitect.infospace.history import read_metrics_file, write_metrics_file + metrics_file = root / cfg.metrics_dir / "metrics.yaml" + existing = read_metrics_file(metrics_file) + existing["per_entity_mean"] = round(mean_overall, 6) + write_metrics_file(existing, metrics_file) + click.echo(f"\nUpdated metrics.yaml: per_entity_mean = {mean_overall:.4f}") + + # ── viability ──────────────────────────────────────────────────────── diff --git a/markitect/infospace/evaluate.py b/markitect/infospace/evaluate.py index b33e9118..b755f49d 100644 --- a/markitect/infospace/evaluate.py +++ b/markitect/infospace/evaluate.py @@ -25,9 +25,9 @@ from markitect.prompts.execution.models import RunConfig _DEFAULT_DIMENSIONS = [ 
"definition_precision", "source_grounding", - "domain_relevance", - "discipline_alignment", - "conceptual_clarity", + "domain_placement", + "vsm_relevance", + "explanatory_value", ] _PROMPT_TEMPLATE = """\ @@ -45,20 +45,35 @@ You are evaluating an entity from an infospace about "{topic}". ### Context {context} +## Background + +This infospace maps concepts from the source corpus to Stafford Beer's +Viable System Model (VSM). The VSM has five systems: S1 (primary operations), +S2 (coordination/anti-oscillation), S3 (internal regulation/audit), +S4 (intelligence/environmental adaptation), S5 (identity/policy). Use this +to assess whether the entity has a natural VSM home. + +## Dimensions + +- **definition_precision**: Is the definition precise and non-circular? \ +Does it capture a distinct concept rather than a vague umbrella term? +- **source_grounding**: Is this entity grounded in the actual source text, \ +or does it introduce concepts the source does not clearly state? +- **domain_placement**: Is the economic/thematic domain assignment correct? \ +Does the entity belong in a different conceptual category? +- **vsm_relevance**: Does this entity map naturally to one or more VSM \ +systems (S1–S5), or is it VSM-neutral/too abstract to place? +- **explanatory_value**: Does this entity add genuine explanatory power — \ +illuminating a mechanism or structural relation — or does it merely name \ +a surface phenomenon? + ## Instructions -Rate this entity on each dimension below using a scale of 1-5 \ -(1 = poor, 5 = excellent). For each dimension, provide: -1. A numeric score (1-5) -2. A brief rationale (1-2 sentences) - -### Dimensions to evaluate: -{dimensions_list} +Rate each dimension 1–5 (1 = poor, 5 = excellent). Provide a brief +rationale (1–2 sentences) for each score. 
## Output format -Return your evaluation as a structured list: - DIMENSION: SCORE: <1-5> RATIONALE: @@ -161,6 +176,9 @@ def run_entity_evaluation( ) -> BatchSummary: """Run per-entity evaluation using the batch evaluator. + Evaluation files are written **incrementally** after each successful + result, so a long run is resumable and safe to interrupt. + Args: config: The infospace configuration. entities: Entities to evaluate. @@ -176,6 +194,25 @@ def run_entity_evaluation( A :class:`BatchSummary` with per-entity results. """ topic = config.topic.name + evaluations_path = output_dir or Path(config.evaluations_dir) + evaluator_name = (run_config.model_name if run_config else "unknown") + + def _write_and_notify(done: int, total: int, result) -> None: + # Write file immediately on success (incremental — run is resumable) + if result.status == "success" and result.response is not None: + scores = parse_evaluation_response(result.response.content, dimensions) + evaluation = EntityEvaluation( + entity_slug=result.key, + evaluator=evaluator_name, + scores=scores, + evaluated_at=datetime.utcnow(), + ) + eval_path = evaluations_path / f"{result.key}.md" + write_entity_evaluation(evaluation, eval_path) + + if progress_callback is not None: + progress_callback(done, total, result) + items = [ BatchItem( key=entity.slug, @@ -189,27 +226,7 @@ def run_entity_evaluation( evaluator = BatchEvaluator( adapter=adapter, config=run_config, - progress_callback=progress_callback, + progress_callback=_write_and_notify, previous_digests=previous_digests, ) - summary = evaluator.evaluate(items) - - # Write successful results - evaluations_path = output_dir or Path(config.evaluations_dir) - evaluator_name = (run_config.model_name if run_config else "unknown") - - for result in summary.results: - if result.status != "success" or result.response is None: - continue - - scores = parse_evaluation_response(result.response.content, dimensions) - evaluation = EntityEvaluation( - 
entity_slug=result.key, - evaluator=evaluator_name, - scores=scores, - evaluated_at=datetime.utcnow(), - ) - eval_path = evaluations_path / f"{result.key}.md" - write_entity_evaluation(evaluation, eval_path) - - return summary + return evaluator.evaluate(items) diff --git a/markitect/infospace/history.py b/markitect/infospace/history.py index 12854d01..87c49fff 100644 --- a/markitect/infospace/history.py +++ b/markitect/infospace/history.py @@ -131,8 +131,11 @@ def record_check_results( metrics_dir = root / config.metrics_dir metrics = check_report.metrics() - # Save latest metrics - write_metrics_file(metrics, metrics_dir / "metrics.yaml") + # Save latest metrics — merge with existing so other metric sources + # (e.g. per-entity evaluation summary) are preserved across check runs. + existing = read_metrics_file(metrics_dir / "metrics.yaml") + merged = {**existing, **metrics} # check results overwrite on key conflict + write_metrics_file(merged, metrics_dir / "metrics.yaml") # Create and append snapshot snapshot = snapshot_from_checks(