feat(infospace): add eval-summary command and improve evaluate pipeline (S3.3)
- Fix evaluate dimensions to match template file: definition_precision, source_grounding, domain_placement, vsm_relevance, explanatory_value (was domain_relevance, discipline_alignment, conceptual_clarity)
- Add VSM background context to evaluation prompt so LLM can score vsm_relevance without macro injection
- Fix model_name bug: was sending literal "default" to API (HTTP 400)
- Refactor run_entity_evaluation to write files incrementally via callback rather than all at once after the batch — long runs are now resumable if interrupted
- Add incremental skip in CLI: entities with existing eval files are skipped automatically on re-run (acts as resume)
- Add eval-summary command: reads all eval files, shows per-dimension means, optionally writes per_entity_mean to metrics.yaml
- Fix record_check_results to merge rather than overwrite metrics.yaml so per_entity_mean survives subsequent check runs
- Add per_entity_mean viability threshold (min: 3.5) to infospace.yaml

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -37,6 +37,8 @@ viability:
|
|||||||
max: 0
|
max: 0
|
||||||
granularity_entropy:
|
granularity_entropy:
|
||||||
min: 1.0
|
min: 1.0
|
||||||
|
per_entity_mean:
|
||||||
|
min: 3.5 # LLM quality score across 5 dimensions (1-5 scale)
|
||||||
|
|
||||||
pipeline:
|
pipeline:
|
||||||
stages:
|
stages:
|
||||||
|
|||||||
@@ -934,3 +934,29 @@
|
|||||||
concern: C1
|
concern: C1
|
||||||
metadata:
|
metadata:
|
||||||
source: collection-checks
|
source: collection-checks
|
||||||
|
- snapshot_id: 090bb961
|
||||||
|
created_at: '2026-02-23T00:22:25.818146+00:00'
|
||||||
|
schema_name: default
|
||||||
|
entity_count: 988
|
||||||
|
entity_evaluations: []
|
||||||
|
collection_metrics:
|
||||||
|
- name: coherence_components
|
||||||
|
value: 0.0
|
||||||
|
concern: C3
|
||||||
|
- name: consistency_cycles
|
||||||
|
value: 0.0
|
||||||
|
concern: C4
|
||||||
|
- name: coverage_ratio
|
||||||
|
value: 0.6190476190476191
|
||||||
|
concern: C2
|
||||||
|
- name: granularity_entropy
|
||||||
|
value: 2.6747519428200657
|
||||||
|
concern: C5
|
||||||
|
- name: modularity
|
||||||
|
value: 0.0
|
||||||
|
concern: C3
|
||||||
|
- name: redundancy_ratio
|
||||||
|
value: 0.006072874493927126
|
||||||
|
concern: C1
|
||||||
|
metadata:
|
||||||
|
source: collection-checks
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
coherence_components: 0.0
|
coherence_components: 0.0
|
||||||
consistency_cycles: 0.0
|
consistency_cycles: 0.0
|
||||||
coverage_ratio: 0.442424
|
coverage_ratio: 0.619048
|
||||||
granularity_entropy: 2.953326
|
granularity_entropy: 2.674752
|
||||||
modularity: 0.0
|
modularity: 0.0
|
||||||
redundancy_ratio: 0.005877
|
per_entity_mean: 4.42
|
||||||
|
redundancy_ratio: 0.006073
|
||||||
|
|||||||
70
examples/infospace-with-history/templates/evaluate-entity.md
Normal file
70
examples/infospace-with-history/templates/evaluate-entity.md
Normal file
@@ -0,0 +1,70 @@
|
|||||||
|
# Evaluate Economic Entity
|
||||||
|
|
||||||
|
You are a quality assessor evaluating a single economic entity extracted from
|
||||||
|
Adam Smith's *The Wealth of Nations* and mapped to Stafford Beer's Viable
|
||||||
|
System Model. Your task is to score the entity on five quality dimensions
|
||||||
|
and produce a structured evaluation.
|
||||||
|
|
||||||
|
## Entity Under Evaluation
|
||||||
|
|
||||||
|
@{entity_content}
|
||||||
|
|
||||||
|
## Source Chapter
|
||||||
|
|
||||||
|
@{source_chapter}
|
||||||
|
|
||||||
|
## VSM Framework Reference
|
||||||
|
|
||||||
|
@{vsm_framework}
|
||||||
|
|
||||||
|
## Quality Rubric
|
||||||
|
|
||||||
|
@{quality_rubric}
|
||||||
|
|
||||||
|
## Instructions
|
||||||
|
|
||||||
|
1. Read the entity carefully, including its definition, source chapter,
|
||||||
|
context, economic domain, and any VSM mapping information provided.
|
||||||
|
2. Locate the relevant passage in the source chapter to verify source grounding.
|
||||||
|
3. Consult the VSM framework reference to assess VSM relevance.
|
||||||
|
4. Score each dimension 1–5 using the rubric above. Use the full range:
|
||||||
|
reserve 5 for genuinely excellent entries and 1 for clear failures.
|
||||||
|
5. For each dimension, write exactly one sentence justifying the score.
|
||||||
|
6. Compute the overall score as the mean of the five dimension scores,
|
||||||
|
rounded to two decimal places.
|
||||||
|
7. List any flags for issues that warrant attention (empty list if none).
|
||||||
|
Valid flags: `circular-definition`, `missing-citation`, `wrong-domain`,
|
||||||
|
`no-vsm-mapping`, `redundant-with-<slug>`, `overclaimed-strength`,
|
||||||
|
`underclaimed-strength`.
|
||||||
|
|
||||||
|
## Output Format
|
||||||
|
|
||||||
|
Output YAML front-matter (scores + flags) followed by a markdown section
|
||||||
|
with per-dimension justifications. Do not include any other text outside
|
||||||
|
this structure.
|
||||||
|
|
||||||
|
```
|
||||||
|
---
|
||||||
|
entity: <slug of the entity, kebab-case>
|
||||||
|
scores:
|
||||||
|
definition_precision: <1-5>
|
||||||
|
source_grounding: <1-5>
|
||||||
|
domain_placement: <1-5>
|
||||||
|
vsm_relevance: <1-5>
|
||||||
|
explanatory_value: <1-5>
|
||||||
|
overall: <mean rounded to 2 decimal places>
|
||||||
|
flags: []
|
||||||
|
---
|
||||||
|
|
||||||
|
## Justifications
|
||||||
|
|
||||||
|
**Definition Precision (<score>/5):** <one sentence>
|
||||||
|
|
||||||
|
**Source Grounding (<score>/5):** <one sentence>
|
||||||
|
|
||||||
|
**Domain Placement (<score>/5):** <one sentence>
|
||||||
|
|
||||||
|
**VSM Relevance (<score>/5):** <one sentence>
|
||||||
|
|
||||||
|
**Explanatory Value (<score>/5):** <one sentence>
|
||||||
|
```
|
||||||
@@ -189,11 +189,26 @@ def evaluate(config_path, provider, model, entity_slug, chapter):
|
|||||||
click.echo(f"No entities found for chapter '{chapter}'.")
|
click.echo(f"No entities found for chapter '{chapter}'.")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# Skip entities that already have evaluation files (incremental resume)
|
||||||
|
from markitect.infospace.evaluate import run_entity_evaluation
|
||||||
|
output_dir = root / cfg.evaluations_dir
|
||||||
|
if not entity_slug and not chapter and output_dir.is_dir():
|
||||||
|
previous_digests = {
|
||||||
|
p.stem: "" # non-empty sentinel → triggers skip in BatchEvaluator
|
||||||
|
for p in output_dir.glob("*.md")
|
||||||
|
}
|
||||||
|
entity_list = [e for e in entity_list if e.slug not in previous_digests]
|
||||||
|
if not entity_list:
|
||||||
|
click.echo("All entities already evaluated. Nothing to do.")
|
||||||
|
return
|
||||||
|
if previous_digests:
|
||||||
|
click.echo(f"Skipping {len(previous_digests)} already-evaluated entities.")
|
||||||
|
|
||||||
# Create adapter
|
# Create adapter
|
||||||
from markitect.llm import create_adapter
|
from markitect.llm import create_adapter
|
||||||
from markitect.prompts.execution.models import RunConfig
|
from markitect.prompts.execution.models import RunConfig
|
||||||
adapter = create_adapter(provider, model=model)
|
adapter = create_adapter(provider, model=model)
|
||||||
run_config = RunConfig(model_name=model or "default", temperature=0.3, max_tokens=2000)
|
run_config = RunConfig(model_name=model, temperature=0.3, max_tokens=2000)
|
||||||
|
|
||||||
# Progress callback
|
# Progress callback
|
||||||
def on_progress(done, total, result):
|
def on_progress(done, total, result):
|
||||||
@@ -202,8 +217,6 @@ def evaluate(config_path, provider, model, entity_slug, chapter):
|
|||||||
|
|
||||||
click.echo(f"Evaluating {len(entity_list)} entities via {provider}...")
|
click.echo(f"Evaluating {len(entity_list)} entities via {provider}...")
|
||||||
|
|
||||||
from markitect.infospace.evaluate import run_entity_evaluation
|
|
||||||
output_dir = root / cfg.evaluations_dir
|
|
||||||
summary = run_entity_evaluation(
|
summary = run_entity_evaluation(
|
||||||
config=cfg,
|
config=cfg,
|
||||||
entities=entity_list,
|
entities=entity_list,
|
||||||
@@ -218,6 +231,74 @@ def evaluate(config_path, provider, model, entity_slug, chapter):
|
|||||||
click.echo(f"Tokens used: {summary.total_tokens}")
|
click.echo(f"Tokens used: {summary.total_tokens}")
|
||||||
|
|
||||||
|
|
||||||
|
# ── eval-summary ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@infospace_commands.command(name="eval-summary")
@click.option("--config", "config_path", default=None, help="Path to infospace.yaml.")
@click.option("--update-metrics", is_flag=True, default=False,
              help="Merge per_entity_mean into metrics.yaml for viability checks.")
def eval_summary(config_path: Optional[str], update_metrics: bool):
    """Show aggregate statistics from per-entity evaluation files.

    Reads every ``*.md`` file in the configured evaluations directory and
    prints the mean overall score, the mean of each scored dimension, and the
    range of overall scores.  Files that cannot be read are left out of the
    statistics entirely and are listed, with the error text, on stderr.

    Args:
        config_path: Value of ``--config``; passed through unchanged to
            ``_load_config_or_exit``.
        update_metrics: When true, store the mean overall score under the
            ``per_entity_mean`` key of ``metrics.yaml``, keeping whatever
            other keys the file already holds.
    """
    cfg, cfg_path = _load_config_or_exit(config_path)
    root = cfg_path.parent

    evaluations_dir = root / cfg.evaluations_dir
    if not evaluations_dir.is_dir():
        click.echo("No evaluations directory found. Run 'markitect infospace evaluate' first.")
        return

    from markitect.infospace.evaluation_io import read_entity_evaluation

    eval_files = sorted(evaluations_dir.glob("*.md"))
    if not eval_files:
        click.echo("No evaluation files found.")
        return

    overall_scores: list = []
    dim_scores: dict = {}
    failed: list = []

    for ef in eval_files:
        # Extract everything needed from the file *before* touching the
        # accumulators, so a file that fails halfway contributes nothing
        # instead of leaving a partial entry in the statistics.
        try:
            ev = read_entity_evaluation(ef)
            overall = ev.overall_score
            per_dimension = [(s.name, s.value) for s in ev.scores]
        except Exception as exc:  # best-effort: one bad file must not abort the summary
            failed.append((ef.stem, str(exc)))
            continue

        overall_scores.append(overall)
        for dim_name, dim_value in per_dimension:
            dim_scores.setdefault(dim_name, []).append(dim_value)

    def _report_failures() -> None:
        # Diagnostics go to stderr so stdout keeps the original summary layout.
        for slug, reason in failed:
            click.echo(f"    {slug}: {reason}", err=True)

    n = len(overall_scores)
    if n == 0:
        click.echo("No evaluations could be read.")
        _report_failures()
        return

    mean_overall = sum(overall_scores) / n

    click.echo(f"Evaluation summary — {n} entities evaluated")
    if failed:
        click.echo(f"  (failed to read: {len(failed)})")
        _report_failures()
    click.echo()
    click.echo(f"  {'Dimension':<30} {'Mean':>6}")
    click.echo("  " + "-" * 38)
    click.echo(f"  {'overall':<30} {mean_overall:>6.3f}")
    for dim, vals in sorted(dim_scores.items()):
        click.echo(f"  {dim:<30} {sum(vals)/len(vals):>6.3f}")

    score_min = min(overall_scores)
    score_max = max(overall_scores)
    click.echo()
    click.echo(f"  Range: {score_min:.2f} – {score_max:.2f}")

    if update_metrics:
        from markitect.infospace.history import read_metrics_file, write_metrics_file
        metrics_file = root / cfg.metrics_dir / "metrics.yaml"
        # Read-modify-write: only per_entity_mean is replaced, other metrics survive.
        existing = read_metrics_file(metrics_file)
        existing["per_entity_mean"] = round(mean_overall, 6)
        write_metrics_file(existing, metrics_file)
        click.echo(f"\nUpdated metrics.yaml: per_entity_mean = {mean_overall:.4f}")
|
||||||
|
|
||||||
|
|
||||||
# ── viability ────────────────────────────────────────────────────────
|
# ── viability ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -25,9 +25,9 @@ from markitect.prompts.execution.models import RunConfig
|
|||||||
_DEFAULT_DIMENSIONS = [
|
_DEFAULT_DIMENSIONS = [
|
||||||
"definition_precision",
|
"definition_precision",
|
||||||
"source_grounding",
|
"source_grounding",
|
||||||
"domain_relevance",
|
"domain_placement",
|
||||||
"discipline_alignment",
|
"vsm_relevance",
|
||||||
"conceptual_clarity",
|
"explanatory_value",
|
||||||
]
|
]
|
||||||
|
|
||||||
_PROMPT_TEMPLATE = """\
|
_PROMPT_TEMPLATE = """\
|
||||||
@@ -45,20 +45,35 @@ You are evaluating an entity from an infospace about "{topic}".
|
|||||||
### Context
|
### Context
|
||||||
{context}
|
{context}
|
||||||
|
|
||||||
|
## Background
|
||||||
|
|
||||||
|
This infospace maps concepts from the source corpus to Stafford Beer's
|
||||||
|
Viable System Model (VSM). The VSM has five systems: S1 (primary operations),
|
||||||
|
S2 (coordination/anti-oscillation), S3 (internal regulation/audit),
|
||||||
|
S4 (intelligence/environmental adaptation), S5 (identity/policy). Use this
|
||||||
|
to assess whether the entity has a natural VSM home.
|
||||||
|
|
||||||
|
## Dimensions
|
||||||
|
|
||||||
|
- **definition_precision**: Is the definition precise and non-circular? \
|
||||||
|
Does it capture a distinct concept rather than a vague umbrella term?
|
||||||
|
- **source_grounding**: Is this entity grounded in the actual source text, \
|
||||||
|
or does it introduce concepts the source does not clearly state?
|
||||||
|
- **domain_placement**: Is the economic/thematic domain assignment correct? \
|
||||||
|
Does the entity belong in a different conceptual category?
|
||||||
|
- **vsm_relevance**: Does this entity map naturally to one or more VSM \
|
||||||
|
systems (S1–S5), or is it VSM-neutral/too abstract to place?
|
||||||
|
- **explanatory_value**: Does this entity add genuine explanatory power — \
|
||||||
|
illuminating a mechanism or structural relation — or does it merely name \
|
||||||
|
a surface phenomenon?
|
||||||
|
|
||||||
## Instructions
|
## Instructions
|
||||||
|
|
||||||
Rate this entity on each dimension below using a scale of 1-5 \
|
Rate each dimension 1–5 (1 = poor, 5 = excellent). Provide a brief
|
||||||
(1 = poor, 5 = excellent). For each dimension, provide:
|
rationale (1–2 sentences) for each score.
|
||||||
1. A numeric score (1-5)
|
|
||||||
2. A brief rationale (1-2 sentences)
|
|
||||||
|
|
||||||
### Dimensions to evaluate:
|
|
||||||
{dimensions_list}
|
|
||||||
|
|
||||||
## Output format
|
## Output format
|
||||||
|
|
||||||
Return your evaluation as a structured list:
|
|
||||||
|
|
||||||
DIMENSION: <name>
|
DIMENSION: <name>
|
||||||
SCORE: <1-5>
|
SCORE: <1-5>
|
||||||
RATIONALE: <explanation>
|
RATIONALE: <explanation>
|
||||||
@@ -161,6 +176,9 @@ def run_entity_evaluation(
|
|||||||
) -> BatchSummary:
|
) -> BatchSummary:
|
||||||
"""Run per-entity evaluation using the batch evaluator.
|
"""Run per-entity evaluation using the batch evaluator.
|
||||||
|
|
||||||
|
Evaluation files are written **incrementally** after each successful
|
||||||
|
result, so a long run is resumable and safe to interrupt.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
config: The infospace configuration.
|
config: The infospace configuration.
|
||||||
entities: Entities to evaluate.
|
entities: Entities to evaluate.
|
||||||
@@ -176,6 +194,25 @@ def run_entity_evaluation(
|
|||||||
A :class:`BatchSummary` with per-entity results.
|
A :class:`BatchSummary` with per-entity results.
|
||||||
"""
|
"""
|
||||||
topic = config.topic.name
|
topic = config.topic.name
|
||||||
|
evaluations_path = output_dir or Path(config.evaluations_dir)
|
||||||
|
evaluator_name = (run_config.model_name if run_config else "unknown")
|
||||||
|
|
||||||
|
def _write_and_notify(done: int, total: int, result) -> None:
    """Persist a successful result immediately, then forward the progress event.

    Writing inside the per-item callback (rather than after the whole batch)
    means each finished evaluation is already on disk when the next one starts.
    """
    succeeded = result.status == "success" and result.response is not None
    if succeeded:
        record = EntityEvaluation(
            entity_slug=result.key,
            evaluator=evaluator_name,
            scores=parse_evaluation_response(result.response.content, dimensions),
            evaluated_at=datetime.utcnow(),
        )
        target = evaluations_path / f"{result.key}.md"
        write_entity_evaluation(record, target)

    # The caller's own callback (if any) still sees every result, failed or not.
    if progress_callback is None:
        return
    progress_callback(done, total, result)
|
||||||
|
|
||||||
items = [
|
items = [
|
||||||
BatchItem(
|
BatchItem(
|
||||||
key=entity.slug,
|
key=entity.slug,
|
||||||
@@ -189,27 +226,7 @@ def run_entity_evaluation(
|
|||||||
evaluator = BatchEvaluator(
|
evaluator = BatchEvaluator(
|
||||||
adapter=adapter,
|
adapter=adapter,
|
||||||
config=run_config,
|
config=run_config,
|
||||||
progress_callback=progress_callback,
|
progress_callback=_write_and_notify,
|
||||||
previous_digests=previous_digests,
|
previous_digests=previous_digests,
|
||||||
)
|
)
|
||||||
summary = evaluator.evaluate(items)
|
return evaluator.evaluate(items)
|
||||||
|
|
||||||
# Write successful results
|
|
||||||
evaluations_path = output_dir or Path(config.evaluations_dir)
|
|
||||||
evaluator_name = (run_config.model_name if run_config else "unknown")
|
|
||||||
|
|
||||||
for result in summary.results:
|
|
||||||
if result.status != "success" or result.response is None:
|
|
||||||
continue
|
|
||||||
|
|
||||||
scores = parse_evaluation_response(result.response.content, dimensions)
|
|
||||||
evaluation = EntityEvaluation(
|
|
||||||
entity_slug=result.key,
|
|
||||||
evaluator=evaluator_name,
|
|
||||||
scores=scores,
|
|
||||||
evaluated_at=datetime.utcnow(),
|
|
||||||
)
|
|
||||||
eval_path = evaluations_path / f"{result.key}.md"
|
|
||||||
write_entity_evaluation(evaluation, eval_path)
|
|
||||||
|
|
||||||
return summary
|
|
||||||
|
|||||||
@@ -131,8 +131,11 @@ def record_check_results(
|
|||||||
metrics_dir = root / config.metrics_dir
|
metrics_dir = root / config.metrics_dir
|
||||||
metrics = check_report.metrics()
|
metrics = check_report.metrics()
|
||||||
|
|
||||||
# Save latest metrics
|
# Save latest metrics — merge with existing so other metric sources
|
||||||
write_metrics_file(metrics, metrics_dir / "metrics.yaml")
|
# (e.g. per-entity evaluation summary) are preserved across check runs.
|
||||||
|
existing = read_metrics_file(metrics_dir / "metrics.yaml")
|
||||||
|
merged = {**existing, **metrics} # check results overwrite on key conflict
|
||||||
|
write_metrics_file(merged, metrics_dir / "metrics.yaml")
|
||||||
|
|
||||||
# Create and append snapshot
|
# Create and append snapshot
|
||||||
snapshot = snapshot_from_checks(
|
snapshot = snapshot_from_checks(
|
||||||
|
|||||||
Reference in New Issue
Block a user