feat(infospace): add eval-summary command and improve evaluate pipeline (S3.3)

- Fix evaluate dimensions to match template file:
  definition_precision, source_grounding, domain_placement,
  vsm_relevance, explanatory_value (was domain_relevance,
  discipline_alignment, conceptual_clarity)
- Add VSM background context to evaluation prompt so LLM can
  score vsm_relevance without macro injection
- Fix model_name bug: was sending literal "default" to API (HTTP 400)
- Refactor run_entity_evaluation to write files incrementally via
  callback rather than all at once after the batch — long runs are
  now resumable if interrupted
- Add incremental skip in CLI: entities with existing eval files
  are skipped automatically on re-run (acts as resume)
- Add eval-summary command: reads all eval files, shows per-dimension
  means, optionally writes per_entity_mean to metrics.yaml
- Fix record_check_results to merge rather than overwrite metrics.yaml
  so per_entity_mean survives subsequent check runs
- Add per_entity_mean viability threshold (min: 3.5) to infospace.yaml

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-02-23 01:26:45 +01:00
parent 574bb11db6
commit 7f1eecbdb2
7 changed files with 242 additions and 42 deletions

View File

@@ -37,6 +37,8 @@ viability:
max: 0
granularity_entropy:
min: 1.0
per_entity_mean:
min: 3.5 # LLM quality score across 5 dimensions (1-5 scale)
pipeline:
stages:

View File

@@ -934,3 +934,29 @@
concern: C1
metadata:
source: collection-checks
- snapshot_id: 090bb961
created_at: '2026-02-23T00:22:25.818146+00:00'
schema_name: default
entity_count: 988
entity_evaluations: []
collection_metrics:
- name: coherence_components
value: 0.0
concern: C3
- name: consistency_cycles
value: 0.0
concern: C4
- name: coverage_ratio
value: 0.6190476190476191
concern: C2
- name: granularity_entropy
value: 2.6747519428200657
concern: C5
- name: modularity
value: 0.0
concern: C3
- name: redundancy_ratio
value: 0.006072874493927126
concern: C1
metadata:
source: collection-checks

View File

@@ -1,6 +1,7 @@
coherence_components: 0.0
consistency_cycles: 0.0
coverage_ratio: 0.442424
granularity_entropy: 2.953326
coverage_ratio: 0.619048
granularity_entropy: 2.674752
modularity: 0.0
redundancy_ratio: 0.005877
per_entity_mean: 4.42
redundancy_ratio: 0.006073

View File

@@ -0,0 +1,70 @@
# Evaluate Economic Entity
You are a quality assessor evaluating a single economic entity extracted from
Adam Smith's *The Wealth of Nations* and mapped to Stafford Beer's Viable
System Model. Your task is to score the entity on five quality dimensions
and produce a structured evaluation.
## Entity Under Evaluation
@{entity_content}
## Source Chapter
@{source_chapter}
## VSM Framework Reference
@{vsm_framework}
## Quality Rubric
@{quality_rubric}
## Instructions
1. Read the entity carefully, including its definition, source chapter,
context, economic domain, and any VSM mapping information provided.
2. Locate the relevant passage in the source chapter to verify source grounding.
3. Consult the VSM framework reference to assess VSM relevance.
4. Score each dimension 1–5 using the rubric above. Use the full range:
reserve 5 for genuinely excellent entries and 1 for clear failures.
5. For each dimension, write exactly one sentence justifying the score.
6. Compute the overall score as the mean of the five dimension scores,
rounded to two decimal places.
7. List any flags for issues that warrant attention (empty list if none).
Valid flags: `circular-definition`, `missing-citation`, `wrong-domain`,
`no-vsm-mapping`, `redundant-with-<slug>`, `overclaimed-strength`,
`underclaimed-strength`.
## Output Format
Output YAML front-matter (scores + flags) followed by a markdown section
with per-dimension justifications. Do not include any other text outside
this structure.
```
---
entity: <slug of the entity, kebab-case>
scores:
definition_precision: <1-5>
source_grounding: <1-5>
domain_placement: <1-5>
vsm_relevance: <1-5>
explanatory_value: <1-5>
overall: <mean rounded to 2 decimal places>
flags: []
---
## Justifications
**Definition Precision (<score>/5):** <one sentence>
**Source Grounding (<score>/5):** <one sentence>
**Domain Placement (<score>/5):** <one sentence>
**VSM Relevance (<score>/5):** <one sentence>
**Explanatory Value (<score>/5):** <one sentence>
```

View File

@@ -189,11 +189,26 @@ def evaluate(config_path, provider, model, entity_slug, chapter):
click.echo(f"No entities found for chapter '{chapter}'.")
return
# Skip entities that already have evaluation files (incremental resume)
from markitect.infospace.evaluate import run_entity_evaluation
output_dir = root / cfg.evaluations_dir
if not entity_slug and not chapter and output_dir.is_dir():
previous_digests = {
p.stem: "" # non-empty sentinel → triggers skip in BatchEvaluator
for p in output_dir.glob("*.md")
}
entity_list = [e for e in entity_list if e.slug not in previous_digests]
if not entity_list:
click.echo("All entities already evaluated. Nothing to do.")
return
if previous_digests:
click.echo(f"Skipping {len(previous_digests)} already-evaluated entities.")
# Create adapter
from markitect.llm import create_adapter
from markitect.prompts.execution.models import RunConfig
adapter = create_adapter(provider, model=model)
run_config = RunConfig(model_name=model or "default", temperature=0.3, max_tokens=2000)
run_config = RunConfig(model_name=model, temperature=0.3, max_tokens=2000)
# Progress callback
def on_progress(done, total, result):
@@ -202,8 +217,6 @@ def evaluate(config_path, provider, model, entity_slug, chapter):
click.echo(f"Evaluating {len(entity_list)} entities via {provider}...")
from markitect.infospace.evaluate import run_entity_evaluation
output_dir = root / cfg.evaluations_dir
summary = run_entity_evaluation(
config=cfg,
entities=entity_list,
@@ -218,6 +231,74 @@ def evaluate(config_path, provider, model, entity_slug, chapter):
click.echo(f"Tokens used: {summary.total_tokens}")
# ── eval-summary ──────────────────────────────────────────────────────
@infospace_commands.command(name="eval-summary")
@click.option("--config", "config_path", default=None, help="Path to infospace.yaml.")
@click.option("--update-metrics", is_flag=True, default=False,
              help="Merge per_entity_mean into metrics.yaml for viability checks.")
def eval_summary(config_path: Optional[str], update_metrics: bool):
    """Show aggregate statistics from per-entity evaluation files."""
    # NOTE: the docstring above doubles as the CLI help text, so the longer
    # description lives in comments.
    #
    # Reads every ``*.md`` file in the configured evaluations directory,
    # prints the mean overall score, the mean of each scored dimension and
    # the min/max of the overall scores.  With ``--update-metrics`` the mean
    # overall score is stored under ``per_entity_mean`` in metrics.yaml,
    # keeping whatever other keys that file already holds.
    cfg, cfg_path = _load_config_or_exit(config_path)
    root = cfg_path.parent
    evaluations_dir = root / cfg.evaluations_dir

    if not evaluations_dir.is_dir():
        click.echo("No evaluations directory found. Run 'markitect infospace evaluate' first.")
        return

    from markitect.infospace.evaluation_io import read_entity_evaluation

    eval_files = sorted(evaluations_dir.glob("*.md"))
    if not eval_files:
        click.echo("No evaluation files found.")
        return

    overall_scores: list = []
    dim_scores: dict = {}
    failed: list = []

    for ef in eval_files:
        # Extract everything from the file *before* touching the aggregates,
        # so a file that fails halfway never contributes partial data while
        # also being counted as a failure.
        try:
            ev = read_entity_evaluation(ef)
            overall = ev.overall_score
            per_dimension = [(s.name, s.value) for s in ev.scores]
        except Exception as exc:  # best-effort: one bad file must not abort the summary
            failed.append((ef.stem, str(exc)))
            continue
        overall_scores.append(overall)
        for dim_name, dim_value in per_dimension:
            dim_scores.setdefault(dim_name, []).append(dim_value)

    if not overall_scores:
        click.echo("No evaluations could be read.")
        # Still surface why, otherwise the user has nothing to act on.
        for slug, reason in failed:
            click.echo(f"  {slug}: {reason}", err=True)
        return

    n = len(overall_scores)
    mean_overall = sum(overall_scores) / n

    click.echo(f"Evaluation summary — {n} entities evaluated")
    if failed:
        click.echo(f" (failed to read: {len(failed)})")
        # Report which files failed and why (on stderr, so the summary on
        # stdout stays clean); previously only the count was shown.
        for slug, reason in failed:
            click.echo(f"   {slug}: {reason}", err=True)
    click.echo()
    click.echo(f" {'Dimension':<30} {'Mean':>6}")
    click.echo(" " + "-" * 38)
    click.echo(f" {'overall':<30} {mean_overall:>6.3f}")
    for dim, vals in sorted(dim_scores.items()):
        click.echo(f" {dim:<30} {sum(vals)/len(vals):>6.3f}")

    score_min = min(overall_scores)
    score_max = max(overall_scores)
    click.echo()
    # Explicit separator between the two bounds of the range.
    click.echo(f" Range: {score_min:.2f} – {score_max:.2f}")

    if update_metrics:
        from markitect.infospace.history import read_metrics_file, write_metrics_file

        metrics_file = root / cfg.metrics_dir / "metrics.yaml"
        # Read-modify-write so metrics produced by other commands survive.
        existing = read_metrics_file(metrics_file)
        existing["per_entity_mean"] = round(mean_overall, 6)
        write_metrics_file(existing, metrics_file)
        click.echo(f"\nUpdated metrics.yaml: per_entity_mean = {mean_overall:.4f}")
# ── viability ────────────────────────────────────────────────────────

View File

@@ -25,9 +25,9 @@ from markitect.prompts.execution.models import RunConfig
_DEFAULT_DIMENSIONS = [
"definition_precision",
"source_grounding",
"domain_relevance",
"discipline_alignment",
"conceptual_clarity",
"domain_placement",
"vsm_relevance",
"explanatory_value",
]
_PROMPT_TEMPLATE = """\
@@ -45,20 +45,35 @@ You are evaluating an entity from an infospace about "{topic}".
### Context
{context}
## Background
This infospace maps concepts from the source corpus to Stafford Beer's
Viable System Model (VSM). The VSM has five systems: S1 (primary operations),
S2 (coordination/anti-oscillation), S3 (internal regulation/audit),
S4 (intelligence/environmental adaptation), S5 (identity/policy). Use this
to assess whether the entity has a natural VSM home.
## Dimensions
- **definition_precision**: Is the definition precise and non-circular? \
Does it capture a distinct concept rather than a vague umbrella term?
- **source_grounding**: Is this entity grounded in the actual source text, \
or does it introduce concepts the source does not clearly state?
- **domain_placement**: Is the economic/thematic domain assignment correct? \
Does the entity belong in a different conceptual category?
- **vsm_relevance**: Does this entity map naturally to one or more VSM \
systems (S1–S5), or is it VSM-neutral/too abstract to place?
- **explanatory_value**: Does this entity add genuine explanatory power — \
illuminating a mechanism or structural relation — or does it merely name \
a surface phenomenon?
## Instructions
Rate this entity on each dimension below using a scale of 1-5 \
(1 = poor, 5 = excellent). For each dimension, provide:
1. A numeric score (1-5)
2. A brief rationale (1-2 sentences)
### Dimensions to evaluate:
{dimensions_list}
Rate each dimension 1–5 (1 = poor, 5 = excellent). Provide a brief
rationale (1–2 sentences) for each score.
## Output format
Return your evaluation as a structured list:
DIMENSION: <name>
SCORE: <1-5>
RATIONALE: <explanation>
@@ -161,6 +176,9 @@ def run_entity_evaluation(
) -> BatchSummary:
"""Run per-entity evaluation using the batch evaluator.
Evaluation files are written **incrementally** after each successful
result, so a long run is resumable and safe to interrupt.
Args:
config: The infospace configuration.
entities: Entities to evaluate.
@@ -176,6 +194,25 @@ def run_entity_evaluation(
A :class:`BatchSummary` with per-entity results.
"""
topic = config.topic.name
evaluations_path = output_dir or Path(config.evaluations_dir)
evaluator_name = (run_config.model_name if run_config else "unknown")
def _write_and_notify(done: int, total: int, result) -> None:
    """Persist a successful result immediately, then forward the progress event.

    Writing inside the per-item callback (instead of after the whole batch)
    is what makes an interrupted run resumable: every finished entity
    already has its evaluation file on disk.
    """
    has_response = result.status == "success" and result.response is not None
    if has_response:
        parsed_scores = parse_evaluation_response(result.response.content, dimensions)
        write_entity_evaluation(
            EntityEvaluation(
                entity_slug=result.key,
                evaluator=evaluator_name,
                scores=parsed_scores,
                evaluated_at=datetime.utcnow(),
            ),
            evaluations_path / f"{result.key}.md",
        )

    # The caller's own callback (if any) is notified for every result,
    # successful or not.
    if progress_callback is None:
        return
    progress_callback(done, total, result)
items = [
BatchItem(
key=entity.slug,
@@ -189,27 +226,7 @@ def run_entity_evaluation(
evaluator = BatchEvaluator(
adapter=adapter,
config=run_config,
progress_callback=progress_callback,
progress_callback=_write_and_notify,
previous_digests=previous_digests,
)
summary = evaluator.evaluate(items)
# Write successful results
evaluations_path = output_dir or Path(config.evaluations_dir)
evaluator_name = (run_config.model_name if run_config else "unknown")
for result in summary.results:
if result.status != "success" or result.response is None:
continue
scores = parse_evaluation_response(result.response.content, dimensions)
evaluation = EntityEvaluation(
entity_slug=result.key,
evaluator=evaluator_name,
scores=scores,
evaluated_at=datetime.utcnow(),
)
eval_path = evaluations_path / f"{result.key}.md"
write_entity_evaluation(evaluation, eval_path)
return summary
return evaluator.evaluate(items)

View File

@@ -131,8 +131,11 @@ def record_check_results(
metrics_dir = root / config.metrics_dir
metrics = check_report.metrics()
# Save latest metrics
write_metrics_file(metrics, metrics_dir / "metrics.yaml")
# Save latest metrics — merge with existing so other metric sources
# (e.g. per-entity evaluation summary) are preserved across check runs.
existing = read_metrics_file(metrics_dir / "metrics.yaml")
merged = {**existing, **metrics} # check results overwrite on key conflict
write_metrics_file(merged, metrics_dir / "metrics.yaml")
# Create and append snapshot
snapshot = snapshot_from_checks(