eval history and metrics

This commit is contained in:
2026-05-14 15:35:04 +02:00
parent d0c1f82863
commit 7f54dec585
9 changed files with 870 additions and 16 deletions

View File

@@ -20,6 +20,7 @@ Start with:
- `docs/markitect-main-scope-assessment.md`
- `docs/markitect-tool-adapter.md`
- `docs/entity-relation-model.md`
- `docs/evaluation-history-and-metrics.md`
- `docs/orthogonal-successor-roadmap.md`
- `docs/legacy-infospace-feature-inventory.md`
- `docs/successor-boundary-interface-map.md`

View File

@@ -0,0 +1,43 @@
# Evaluation History And Metrics
`infospace-bench` keeps evaluation history as committed, inspectable files under
each infospace root. This replaces the legacy `markitect-project` history
workflow while retaining the useful behaviors: Markdown evaluation files,
append-only snapshot history, metric merging, and viability checks.
## Files
- `output/evaluations/*.md`: per-artifact evaluation files with YAML
frontmatter and a human-readable Markdown body.
- `output/metrics/metrics.yaml`: latest merged metrics. Collection metrics,
evaluation-derived metrics, and structured non-numeric values are preserved.
- `output/metrics/history.yaml`: append-only list of evaluation snapshots.
- `output/metrics/snapshots/<snapshot-id>.yaml`: named snapshot copies for
reproducible diffs.
- `output/metrics/viability.yaml`: structured viability report generated from
`infospace.yaml` thresholds and the current metrics file.
## Replacement Mapping
The old infospace history code used entity-oriented names such as
`entity_count`, `entity_evaluations`, and `entity_slug`. The successor model
uses artifact-oriented names:
- `artifact_count` replaces `entity_count`
- `artifact_evaluations` replaces `entity_evaluations`
- `artifact_id` replaces `entity_slug`
Readers accept the old snapshot aliases where practical so legacy fixtures can
be inspected, but new files should use the artifact-oriented vocabulary.
## CLI
```bash
python3 -m infospace_bench metrics infospaces/bootstrap-pilot
python3 -m infospace_bench history infospaces/bootstrap-pilot
python3 -m infospace_bench history infospaces/bootstrap-pilot --metric coverage_ratio
python3 -m infospace_bench history-diff infospaces/bootstrap-pilot snap-a snap-b
```
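Snapshot references may be exact snapshot IDs or ISO-like dates such as
`2026-05-14`. Exact IDs are matched first. Any other reference is parsed as an
ISO date or timestamp (a bare date means midnight, and a value without a UTC
offset is treated as UTC) and resolves to the snapshot whose creation time is
nearest to it.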

View File

@@ -1,5 +1,22 @@
from .errors import InfospaceError
from .evaluation import EntityEvaluation, EvaluationSnapshot, MetricValue, ScoreEntry
from .evaluation_io import (
append_to_history,
read_entity_evaluation,
read_history,
read_snapshot,
write_entity_evaluation,
write_snapshot,
)
from .history import (
find_snapshot,
get_history,
get_latest_snapshot,
metric_trend,
read_metrics_file,
record_check_results,
write_metrics_file,
)
from .lifecycle import add_artifact, create_infospace, load_infospace
from .models import (
DisciplineBinding,
@@ -26,8 +43,21 @@ __all__ = [
"TopicConfig",
"ViabilityThreshold",
"add_artifact",
"append_to_history",
"create_infospace",
"find_snapshot",
"get_history",
"get_latest_snapshot",
"list_entities",
"list_relations",
"load_infospace",
"metric_trend",
"read_entity_evaluation",
"read_history",
"read_metrics_file",
"read_snapshot",
"record_check_results",
"write_entity_evaluation",
"write_metrics_file",
"write_snapshot",
]

View File

@@ -5,7 +5,9 @@ import json
import sys
from pathlib import Path
from .checks import run_collection_checks
from .errors import InfospaceError
from .history import find_snapshot, get_history, metric_trend, record_check_results
from .lifecycle import add_artifact, create_infospace, load_infospace
from .markdown_adapter import validate_infospace_artifacts
from .semantics import list_entities, list_relations
@@ -42,6 +44,24 @@ def build_parser() -> argparse.ArgumentParser:
relations = sub.add_parser("relations", help="List parsed relation artifacts")
relations.add_argument("root")
history = sub.add_parser("history", help="List evaluation snapshot history")
history.add_argument("root")
history.add_argument("--metric", default="")
history_diff = sub.add_parser(
"history-diff",
help="Diff two evaluation snapshots by snapshot ID or date",
)
history_diff.add_argument("root")
history_diff.add_argument("before")
history_diff.add_argument("after")
metrics = sub.add_parser(
"metrics",
help="Run collection checks and persist metrics/history",
)
metrics.add_argument("root")
return parser
@@ -96,6 +116,40 @@ def main(argv: list[str] | None = None) -> int:
]
}
)
elif args.command == "history":
history = get_history(Path(args.root))
if args.metric:
_write_json(
{
"metric": args.metric,
"trend": metric_trend(history, args.metric),
}
)
else:
_write_json({"history": [item.to_dict() for item in history]})
elif args.command == "history-diff":
history = get_history(Path(args.root))
before = find_snapshot(history, args.before)
after = find_snapshot(history, args.after)
if before is None or after is None:
missing = []
if before is None:
missing.append(args.before)
if after is None:
missing.append(args.after)
raise InfospaceError(
"missing_snapshot",
"Could not resolve requested snapshot reference",
{"missing_refs": missing},
)
_write_json({"diff": before.diff(after).to_dict()})
elif args.command == "metrics":
infospace = load_infospace(Path(args.root))
result = record_check_results(
infospace.root,
run_collection_checks(infospace.artifacts),
)
_write_json(result.to_dict())
else:
parser.error(f"Unhandled command: {args.command}")
except InfospaceError as exc:

View File

@@ -40,6 +40,11 @@ class EntityEvaluation:
evaluated_at: datetime
notes: list[str] = field(default_factory=list)
@property
def entity_slug(self) -> str:
    """Expose ``artifact_id`` under its former entity-oriented name."""
    artifact_id = self.artifact_id
    return artifact_id
@property
def overall_score(self) -> float:
if not self.scores:
@@ -102,6 +107,16 @@ class EvaluationSnapshot:
collection_metrics: list[MetricValue] = field(default_factory=list)
metadata: dict[str, Any] = field(default_factory=dict)
@property
def entity_count(self) -> int:
    """Expose ``artifact_count`` under its former entity-oriented name."""
    count = self.artifact_count
    return count
@property
def entity_evaluations(self) -> list[EntityEvaluation]:
    """Expose ``artifact_evaluations`` under its former entity-oriented name."""
    evaluations = self.artifact_evaluations
    return evaluations
def to_dict(self) -> dict[str, Any]:
return {
"snapshot_id": self.snapshot_id,
@@ -122,11 +137,14 @@ class EvaluationSnapshot:
return cls(
snapshot_id=str(data["snapshot_id"]),
created_at=datetime.fromisoformat(str(data["created_at"])),
schema_name=str(data["schema_name"]),
artifact_count=int(data["artifact_count"]),
schema_name=str(data.get("schema_name") or "default"),
artifact_count=int(data.get("artifact_count", data.get("entity_count", 0))),
artifact_evaluations=[
EntityEvaluation.from_dict(item)
for item in data.get("artifact_evaluations", [])
for item in data.get(
"artifact_evaluations",
data.get("entity_evaluations", []),
)
],
collection_metrics=[
MetricValue.from_dict(item) for item in data.get("collection_metrics", [])
@@ -134,6 +152,9 @@ class EvaluationSnapshot:
metadata=dict(data.get("metadata") or {}),
)
def diff(self, after: "EvaluationSnapshot") -> "SnapshotDiff":
    """Compare this snapshot (as "before") with *after* via ``diff_snapshots``."""
    comparison = diff_snapshots(self, after)
    return comparison
@dataclass(frozen=True)
class ScoreChange:
@@ -146,6 +167,20 @@ class ScoreChange:
def delta(self) -> float:
return self.after - self.before
@property
def entity_slug(self) -> str:
    """Expose ``artifact_id`` under the name older diff consumers used."""
    artifact_id = self.artifact_id
    return artifact_id
def to_dict(self) -> dict[str, Any]:
    """Return the score change as a plain mapping, including the computed delta."""
    field_names = ("artifact_id", "dimension", "before", "after", "delta")
    return {field_name: getattr(self, field_name) for field_name in field_names}
@dataclass(frozen=True)
class MetricChange:
@@ -157,6 +192,14 @@ class MetricChange:
def delta(self) -> float:
return self.after - self.before
def to_dict(self) -> dict[str, Any]:
    """Return the metric change as a plain mapping, including the computed delta."""
    field_names = ("name", "before", "after", "delta")
    return {field_name: getattr(self, field_name) for field_name in field_names}
@dataclass(frozen=True)
class SnapshotDiff:
@@ -167,6 +210,51 @@ class SnapshotDiff:
score_changes: list[ScoreChange] = field(default_factory=list)
metric_changes: list[MetricChange] = field(default_factory=list)
@property
def added_entities(self) -> list[str]:
    """Expose ``added_artifacts`` under the name used by old diff output."""
    added = self.added_artifacts
    return added
@property
def removed_entities(self) -> list[str]:
    """Expose ``removed_artifacts`` under the name used by old diff output."""
    removed = self.removed_artifacts
    return removed
def to_dict(self) -> dict[str, Any]:
    """Return the diff as a plain mapping with nested changes serialized."""
    payload: dict[str, Any] = {
        "before_id": self.before_id,
        "after_id": self.after_id,
        "added_artifacts": self.added_artifacts,
        "removed_artifacts": self.removed_artifacts,
    }
    payload["score_changes"] = [item.to_dict() for item in self.score_changes]
    payload["metric_changes"] = [item.to_dict() for item in self.metric_changes]
    return payload
def summary(self) -> str:
    """Render the diff as newline-joined, human-readable text.

    The first line names both snapshots. When nothing was added, removed,
    or changed, a single "No changes." line follows; otherwise one line is
    emitted per added artifact, removed artifact, score change, and metric
    change, in that order.
    """
    header = f"Snapshot diff: {self.before_id} -> {self.after_id}"
    has_changes = any(
        (
            self.added_artifacts,
            self.removed_artifacts,
            self.score_changes,
            self.metric_changes,
        )
    )
    if not has_changes:
        return f"{header}\nNo changes."

    report = [header]
    report.extend(f"Added artifact: {added}" for added in self.added_artifacts)
    report.extend(f"Removed artifact: {removed}" for removed in self.removed_artifacts)
    report.extend(
        f"Score {item.artifact_id} {item.dimension}: "
        f"{item.before} -> {item.after} ({item.delta:+.4f})"
        for item in self.score_changes
    )
    report.extend(
        f"Metric {item.name}: "
        f"{item.before} -> {item.after} ({item.delta:+.4f})"
        for item in self.metric_changes
    )
    return "\n".join(report)
def diff_snapshots(
before: EvaluationSnapshot,
@@ -174,22 +262,29 @@ def diff_snapshots(
) -> SnapshotDiff:
before_scores = _score_index(before)
after_scores = _score_index(after)
before_artifacts = {artifact_id for artifact_id, _ in before_scores}
after_artifacts = {artifact_id for artifact_id, _ in after_scores}
before_artifacts = {
evaluation.artifact_id for evaluation in before.artifact_evaluations
}
after_artifacts = {evaluation.artifact_id for evaluation in after.artifact_evaluations}
score_changes = [
ScoreChange(artifact_id, dimension, before_scores[key], after_scores[key])
for key in sorted(before_scores.keys() & after_scores.keys())
ScoreChange(
artifact_id,
dimension,
before_scores.get(key, 0.0),
after_scores.get(key, 0.0),
)
for key in sorted(before_scores.keys() | after_scores.keys())
for artifact_id, dimension in [key]
if before_scores[key] != after_scores[key]
if before_scores.get(key) != after_scores.get(key)
]
before_metrics = {metric.name: metric.value for metric in before.collection_metrics}
after_metrics = {metric.name: metric.value for metric in after.collection_metrics}
metric_changes = [
MetricChange(name, before_metrics[name], after_metrics[name])
for name in sorted(before_metrics.keys() & after_metrics.keys())
if before_metrics[name] != after_metrics[name]
MetricChange(name, before_metrics.get(name, 0.0), after_metrics.get(name, 0.0))
for name in sorted(before_metrics.keys() | after_metrics.keys())
if before_metrics.get(name) != after_metrics.get(name)
]
return SnapshotDiff(

View File

@@ -0,0 +1,181 @@
from __future__ import annotations
from datetime import datetime
from pathlib import Path
from typing import Any
import yaml
from .errors import InfospaceError
from .evaluation import (
EntityEvaluation,
EvaluationSnapshot,
ScoreEntry,
diff_snapshots,
)
FRONTMATTER_MARKER = "---"
def write_entity_evaluation(
    evaluation: EntityEvaluation,
    path: str | Path,
) -> None:
    """Write *evaluation* to *path* as Markdown with YAML frontmatter.

    The frontmatter carries the artifact id, evaluator, ISO timestamp,
    overall score rounded to four decimals, the serialized scores, and the
    notes when there are any. The body has a title heading and one ``##``
    section per score, followed by that score's rationale when it is set.
    Missing parent directories are created.
    """
    header: dict[str, Any] = {
        "artifact_id": evaluation.artifact_id,
        "evaluator": evaluation.evaluator,
        "evaluated_at": evaluation.evaluated_at.isoformat(),
        "overall_score": round(evaluation.overall_score, 4),
        "scores": [entry.to_dict() for entry in evaluation.scores],
    }
    if evaluation.notes:
        header["notes"] = evaluation.notes

    sections: list[str] = []
    for entry in evaluation.scores:
        sections += [f"## {entry.name} - {entry.value} / {entry.max_value}", ""]
        if entry.rationale:
            sections += [entry.rationale, ""]

    document = [
        FRONTMATTER_MARKER,
        yaml.safe_dump(header, sort_keys=False).rstrip(),
        FRONTMATTER_MARKER,
        "",
        f"# Evaluation: {evaluation.artifact_id}",
        "",
        *sections,
    ]

    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text("\n".join(document), encoding="utf-8")
def read_entity_evaluation(path: str | Path) -> EntityEvaluation:
    """Load an evaluation from a Markdown file with YAML frontmatter.

    Rationales found under the body's ``##`` headings take precedence over
    the ones stored in the frontmatter scores. The legacy ``entity_slug``
    key is accepted when ``artifact_id`` is absent or empty.

    Raises:
        InfospaceError: with code ``invalid_evaluation_file`` when the
            artifact id, ``evaluator`` or ``evaluated_at`` is missing, or
            when ``evaluated_at`` is not an ISO timestamp.
    """
    source = Path(path)
    frontmatter, body = _read_frontmatter_markdown(source)

    # ``or`` rather than a .get() default so that a null/empty artifact_id
    # still falls back to the legacy key.
    artifact_id = frontmatter.get("artifact_id") or frontmatter.get("entity_slug")
    if not artifact_id:
        raise InfospaceError(
            "invalid_evaluation_file",
            f"Missing artifact_id in evaluation file: {source}",
            {"path": str(source)},
        )

    # Report absent required keys the same way as a missing artifact_id
    # instead of leaking a bare KeyError to the caller.
    missing = [
        key for key in ("evaluator", "evaluated_at") if frontmatter.get(key) is None
    ]
    if missing:
        raise InfospaceError(
            "invalid_evaluation_file",
            f"Missing {', '.join(missing)} in evaluation file: {source}",
            {"path": str(source), "missing": missing},
        )

    try:
        evaluated_at = datetime.fromisoformat(str(frontmatter["evaluated_at"]))
    except ValueError as exc:
        raise InfospaceError(
            "invalid_evaluation_file",
            f"Invalid evaluated_at in evaluation file: {source}",
            {"path": str(source)},
        ) from exc

    rationales = _parse_rationales(body)
    scores = [
        _score_with_body_rationale(ScoreEntry.from_dict(item), rationales)
        # ``or []`` tolerates an explicit ``scores: null`` in the frontmatter.
        for item in frontmatter.get("scores") or []
    ]
    return EntityEvaluation(
        artifact_id=str(artifact_id),
        evaluator=str(frontmatter["evaluator"]),
        scores=scores,
        evaluated_at=evaluated_at,
        notes=list(frontmatter.get("notes") or []),
    )
def write_snapshot(snapshot: EvaluationSnapshot, path: str | Path) -> None:
    """Serialize *snapshot* as YAML at *path*, creating parent directories."""
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    document = yaml.safe_dump(snapshot.to_dict(), sort_keys=False)
    destination.write_text(document, encoding="utf-8")
def read_snapshot(path: str | Path) -> EvaluationSnapshot:
    """Load one snapshot from the YAML file at *path*.

    Raises:
        InfospaceError: with code ``invalid_snapshot_file`` when the file's
            top-level YAML value is not a mapping.
    """
    source = Path(path)
    payload = yaml.safe_load(source.read_text(encoding="utf-8"))
    if isinstance(payload, dict):
        return EvaluationSnapshot.from_dict(payload)
    raise InfospaceError(
        "invalid_snapshot_file",
        f"Expected mapping in snapshot file: {source}",
        {"path": str(source)},
    )
def append_to_history(snapshot: EvaluationSnapshot, history_path: str | Path) -> None:
    """Add *snapshot* to the end of the YAML history list at *history_path*.

    The existing history is read, re-serialized together with the new
    snapshot, and the whole file is rewritten. Parent directories are
    created when missing.
    """
    destination = Path(history_path)
    # read_history() already yields [] when the path is not a regular file,
    # which covers the "no history yet" case.
    entries = [recorded.to_dict() for recorded in read_history(destination)]
    entries.append(snapshot.to_dict())
    destination.parent.mkdir(parents=True, exist_ok=True)
    document = yaml.safe_dump(entries, sort_keys=False)
    destination.write_text(document, encoding="utf-8")
def read_history(history_path: str | Path) -> list[EvaluationSnapshot]:
    """Load every snapshot stored in the YAML history file.

    Returns an empty list when the path is not a regular file or the file
    is empty.

    Raises:
        InfospaceError: with code ``invalid_history_file`` when the
            top-level YAML value is not a list, or when any entry in that
            list is not a mapping.
    """
    source = Path(history_path)
    if not source.is_file():
        return []
    data = yaml.safe_load(source.read_text(encoding="utf-8"))
    if data is None:
        return []
    if not isinstance(data, list):
        raise InfospaceError(
            "invalid_history_file",
            f"Expected list in history file: {source}",
            {"path": str(source)},
        )

    snapshots: list[EvaluationSnapshot] = []
    for index, item in enumerate(data):
        # Reject malformed entries with a structured error that names the
        # offending position rather than failing inside from_dict().
        if not isinstance(item, dict):
            raise InfospaceError(
                "invalid_history_file",
                f"Expected mapping for history entry {index} in: {source}",
                {"path": str(source), "index": index},
            )
        snapshots.append(EvaluationSnapshot.from_dict(item))
    return snapshots
def _read_frontmatter_markdown(path: Path) -> tuple[dict[str, Any], str]:
    """Split a Markdown file into its YAML frontmatter mapping and its body.

    The file must begin with a marker line; the frontmatter runs up to the
    next marker line. An empty frontmatter block and a closing marker that
    ends the file without a trailing newline are both recognized as closed.

    Raises:
        InfospaceError: with code ``invalid_evaluation_file`` when the
            frontmatter is missing, unclosed, not valid YAML, or does not
            parse to a mapping.
    """
    text = path.read_text(encoding="utf-8")
    opener = f"{FRONTMATTER_MARKER}\n"
    if not text.startswith(opener):
        raise InfospaceError(
            "invalid_evaluation_file",
            f"Missing YAML frontmatter in evaluation file: {path}",
            {"path": str(path)},
        )

    closer = f"\n{FRONTMATTER_MARKER}"
    # Begin at the opener's own newline so that an empty block
    # ("---\n---\n") is recognized as closed.
    end = text.find(f"{closer}\n", len(FRONTMATTER_MARKER))
    if end == -1 and text.endswith(closer):
        # Closing marker is the last line and has no trailing newline.
        end = len(text) - len(closer)
    if end == -1:
        raise InfospaceError(
            "invalid_evaluation_file",
            f"Unclosed YAML frontmatter in evaluation file: {path}",
            {"path": str(path)},
        )

    raw = text[len(opener) : end]
    body = text[end + len(closer) + 1 :]
    try:
        data = yaml.safe_load(raw)
    except yaml.YAMLError as exc:
        raise InfospaceError(
            "invalid_evaluation_file",
            f"Invalid YAML frontmatter in evaluation file: {path}",
            {"path": str(path)},
        ) from exc
    if not isinstance(data, dict):
        raise InfospaceError(
            "invalid_evaluation_file",
            f"Expected mapping frontmatter in evaluation file: {path}",
            {"path": str(path)},
        )
    return data, body
def _parse_rationales(body: str) -> dict[str, str]:
    """Map each ``## <name> - <value> / <max>`` heading to the text below it.

    The name is everything before the *last* `` - `` in the heading, so a
    score name that itself contains `` - `` is recovered intact. A heading
    without that separator is used whole. Text before the first ``##``
    heading is ignored, and each rationale is stripped of surrounding
    whitespace.
    """
    rationales: dict[str, str] = {}
    current_name: str | None = None
    current_lines: list[str] = []

    for line in body.splitlines():
        if line.startswith("## "):
            if current_name is not None:
                rationales[current_name] = "\n".join(current_lines).strip()
            heading = line[3:].strip()
            # rsplit: only the trailing "<value> / <max>" part is dropped.
            current_name = heading.rsplit(" - ", maxsplit=1)[0].strip()
            current_lines = []
        elif current_name is not None:
            current_lines.append(line)

    if current_name is not None:
        rationales[current_name] = "\n".join(current_lines).strip()
    return rationales
def _score_with_body_rationale(
    score: ScoreEntry,
    rationales: dict[str, str],
) -> ScoreEntry:
    """Return a copy of *score* whose rationale comes from *rationales*.

    When *rationales* has no entry for the score's name, the score's own
    rationale is kept.
    """
    if score.name in rationales:
        chosen = rationales[score.name]
    else:
        chosen = score.rationale
    return ScoreEntry(
        name=score.name,
        value=score.value,
        max_value=score.max_value,
        rationale=chosen,
    )

View File

@@ -0,0 +1,254 @@
from __future__ import annotations
import uuid
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
import yaml
from .checks import CollectionCheckReport
from .evaluation import EntityEvaluation, EvaluationSnapshot, MetricValue
from .evaluation_io import append_to_history, read_history, write_snapshot
from .lifecycle import load_infospace
from .viability import evaluate_viability
METRICS_PATH = Path("output/metrics/metrics.yaml")
HISTORY_PATH = Path("output/metrics/history.yaml")
VIABILITY_PATH = Path("output/metrics/viability.yaml")
SNAPSHOT_DIR = Path("output/metrics/snapshots")
@dataclass(frozen=True)
class RecordedCheckResult:
    """Bundle of what one recorded check run produced."""

    # Snapshot that was built for this run.
    snapshot: EvaluationSnapshot
    # Merged metrics mapping for this run.
    metrics: dict[str, Any]
    # Viability report mapping, or None when none was produced.
    viability: dict[str, Any] | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return a plain mapping with the snapshot serialized via ``to_dict``."""
        payload: dict[str, Any] = {"snapshot": self.snapshot.to_dict()}
        payload["metrics"] = self.metrics
        payload["viability"] = self.viability
        return payload
def snapshot_from_checks(
    check_report: CollectionCheckReport,
    artifact_count: int,
    *,
    schema_name: str = "default",
    metadata: dict[str, Any] | None = None,
    artifact_evaluations: list[EntityEvaluation] | None = None,
) -> EvaluationSnapshot:
    """Build a new snapshot from a check report and optional evaluations.

    The report's numeric metrics become collection metrics (sorted by name,
    tagged with their concern), followed by the evaluation-derived metrics
    tagged ``"evaluation"``. The snapshot id is the first eight characters
    of a random UUID and the creation time is the current UTC time.
    """
    evaluations = artifact_evaluations or []

    numeric = _numeric_metrics(check_report.metrics)
    collected = [
        MetricValue(
            name=metric_name,
            value=metric_value,
            concern=_concern_for_metric(metric_name),
        )
        for metric_name, metric_value in sorted(numeric.items())
    ]
    derived = _evaluation_metrics(evaluations)
    for metric_name, metric_value in sorted(derived.items()):
        collected.append(
            MetricValue(name=metric_name, value=metric_value, concern="evaluation")
        )

    return EvaluationSnapshot(
        snapshot_id=str(uuid.uuid4())[:8],
        created_at=datetime.now(timezone.utc),
        schema_name=schema_name,
        artifact_count=artifact_count,
        artifact_evaluations=evaluations,
        collection_metrics=collected,
        metadata=metadata or {},
    )
def write_metrics_file(metrics: dict[str, Any], path: str | Path) -> None:
    """Write *metrics* to *path* as key-sorted YAML, creating parent directories.

    Each top-level value is passed through ``_normalize_metric_value``
    before being dumped.
    """
    normalized = {
        key: _normalize_metric_value(metrics[key]) for key in sorted(metrics)
    }
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    document = yaml.safe_dump(normalized, sort_keys=True)
    destination.write_text(document, encoding="utf-8")
def read_metrics_file(path: str | Path) -> dict[str, Any]:
    """Load the metrics mapping stored at *path*.

    Returns an empty dict when the path is not a regular file or when the
    file's top-level YAML value is not a mapping.
    """
    source = Path(path)
    if not source.is_file():
        return {}
    loaded = yaml.safe_load(source.read_text(encoding="utf-8"))
    if isinstance(loaded, dict):
        return loaded
    return {}
def record_check_results(
    root: str | Path,
    check_report: CollectionCheckReport,
    *,
    artifact_evaluations: list[EntityEvaluation] | None = None,
    schema_name: str = "default",
    metadata: dict[str, Any] | None = None,
) -> RecordedCheckResult:
    """Persist a check run: metrics file, history, snapshot copy, viability.

    Steps, in order: build a snapshot from the report; merge the report's
    metrics and the evaluation-derived metrics over whatever the metrics
    file already holds and write the result back; append the snapshot to
    the history file; write a per-snapshot copy named after its id; build
    and write the viability report from the merged metrics.
    """
    infospace = load_infospace(root)
    base = infospace.root

    reported_count = check_report.details.get(
        "artifact_count", len(infospace.artifacts)
    )
    snapshot_metadata = {"source": "collection-checks", **(metadata or {})}
    snapshot = snapshot_from_checks(
        check_report,
        int(reported_count),
        schema_name=schema_name,
        metadata=snapshot_metadata,
        artifact_evaluations=artifact_evaluations,
    )

    metrics_file = base / METRICS_PATH
    # Later sources win: stored values < check metrics < evaluation metrics.
    merged = dict(read_metrics_file(metrics_file))
    merged.update(check_report.metrics)
    merged.update(_evaluation_metrics(artifact_evaluations or []))
    write_metrics_file(merged, metrics_file)

    append_to_history(snapshot, base / HISTORY_PATH)
    snapshot_file = base / SNAPSHOT_DIR / f"{snapshot.snapshot_id}.yaml"
    write_snapshot(snapshot, snapshot_file)

    viability = build_viability_report(base, merged)
    write_viability_report(base, viability)
    return RecordedCheckResult(snapshot=snapshot, metrics=merged, viability=viability)
def get_history(root: str | Path) -> list[EvaluationSnapshot]:
    """Read the snapshot history stored under the infospace at *root*."""
    history_file = Path(root) / HISTORY_PATH
    return read_history(history_file)
def get_latest_snapshot(root: str | Path) -> EvaluationSnapshot | None:
    """Return the last snapshot in the history, or None when it is empty."""
    snapshots = get_history(root)
    if not snapshots:
        return None
    return snapshots[-1]
def find_snapshot(
    history: list[EvaluationSnapshot],
    ref: str,
) -> EvaluationSnapshot | None:
    """Resolve *ref* to a snapshot in *history*.

    The first snapshot whose id equals *ref* wins; when none matches, *ref*
    is handed to ``find_snapshot_by_date``.
    """
    exact = next(
        (candidate for candidate in history if candidate.snapshot_id == ref),
        None,
    )
    if exact is not None:
        return exact
    return find_snapshot_by_date(history, ref)
def find_snapshot_by_date(
    history: list[EvaluationSnapshot],
    date_ref: str,
) -> EvaluationSnapshot | None:
    """Return the snapshot whose creation time is nearest to *date_ref*.

    *date_ref* is parsed with ``datetime.fromisoformat``. A bare date such
    as ``2026-05-14`` therefore means midnight of that day, and timestamps
    may separate date and time with either ``T`` or a space. Values without
    a UTC offset, on either side of the comparison, are treated as UTC.

    Returns None when *history* is empty or *date_ref* cannot be parsed.
    """
    if not history:
        return None
    try:
        # fromisoformat already reads date-only input as midnight, so no
        # "T00:00:00" suffix is needed; appending one broke references that
        # use a space between date and time.
        target = datetime.fromisoformat(date_ref.strip())
    except ValueError:
        return None
    if target.tzinfo is None:
        target = target.replace(tzinfo=timezone.utc)

    def delta(snapshot: EvaluationSnapshot) -> float:
        """Absolute distance in seconds between the snapshot and the target."""
        created_at = snapshot.created_at
        if created_at.tzinfo is None:
            created_at = created_at.replace(tzinfo=timezone.utc)
        return abs((created_at - target).total_seconds())

    return min(history, key=delta)
def metric_trend(
    history: list[EvaluationSnapshot],
    metric_name: str,
) -> list[dict[str, Any]]:
    """List the values of one collection metric across *history*.

    Each snapshot that carries a metric named *metric_name* contributes one
    ``{"date", "value"}`` entry, taken from the first matching metric.
    Snapshots without that metric are left out.
    """
    points: list[dict[str, Any]] = []
    for recorded in history:
        match = next(
            (item for item in recorded.collection_metrics if item.name == metric_name),
            None,
        )
        if match is None:
            continue
        points.append({"date": recorded.created_at.isoformat(), "value": match.value})
    return points
def build_viability_report(
    root: str | Path,
    metrics: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Evaluate metrics against the infospace's viability configuration.

    When *metrics* is None the stored metrics file is read instead. Only
    numeric values are passed to the evaluation. The result holds the
    overall ``passed`` flag and, per result name, the metric, value,
    serialized threshold, and pass flag.
    """
    infospace = load_infospace(root)
    if metrics is None:
        current = read_metrics_file(infospace.root / METRICS_PATH)
    else:
        current = metrics

    outcome = evaluate_viability(_numeric_metrics(current), infospace.config.viability)

    details: dict[str, Any] = {}
    for result_name, entry in outcome.results.items():
        details[result_name] = {
            "metric": entry.metric,
            "value": entry.value,
            "threshold": entry.threshold.to_dict(),
            "passed": entry.passed,
        }
    return {"passed": outcome.passed, "results": details}
def write_viability_report(root: str | Path, report: dict[str, Any]) -> None:
    """Write *report* as YAML to the viability path under *root*."""
    destination = Path(root) / VIABILITY_PATH
    destination.parent.mkdir(parents=True, exist_ok=True)
    document = yaml.safe_dump(report, sort_keys=False)
    destination.write_text(document, encoding="utf-8")
def _evaluation_metrics(evaluations: list[EntityEvaluation]) -> dict[str, float | int]:
    """Derive summary metrics from *evaluations*.

    Returns an empty dict for no evaluations; otherwise the evaluation
    count and the mean of their overall scores.
    """
    total = len(evaluations)
    if total == 0:
        return {}
    score_sum = sum(entry.overall_score for entry in evaluations)
    return {
        "evaluated_artifact_count": total,
        "per_artifact_mean": score_sum / total,
    }
def _numeric_metrics(metrics: dict[str, Any]) -> dict[str, float]:
    """Keep only int/float metrics (booleans excluded), converted to float.

    Keys are converted to ``str``.
    """
    numeric: dict[str, float] = {}
    for key, raw in metrics.items():
        # bool is a subclass of int, so it has to be ruled out explicitly.
        if isinstance(raw, bool) or not isinstance(raw, (int, float)):
            continue
        numeric[str(key)] = float(raw)
    return numeric
def _normalize_metric_value(value: Any) -> Any:
    """Round float values to six decimals; return anything else unchanged."""
    # bool is not a float subclass, so booleans pass through untouched.
    return round(value, 6) if isinstance(value, float) else value
def _concern_for_metric(name: str) -> str:
    """Return the concern code for a known metric name, else an empty string."""
    concerns = dict(
        redundancy_ratio="C1",
        coverage_ratio="C2",
        coherence_components="C3",
        consistency_cycles="C4",
        granularity_entropy="C5",
    )
    try:
        return concerns[name]
    except KeyError:
        return ""

View File

@@ -0,0 +1,196 @@
import json
import os
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path
import yaml
from infospace_bench import add_artifact, create_infospace, load_infospace
from infospace_bench.checks import run_collection_checks
from infospace_bench.evaluation import (
EntityEvaluation,
EvaluationSnapshot,
MetricValue,
ScoreEntry,
)
from infospace_bench.evaluation_io import (
append_to_history,
read_entity_evaluation,
read_history,
read_snapshot,
write_entity_evaluation,
write_snapshot,
)
from infospace_bench.history import (
find_snapshot,
get_history,
metric_trend,
read_metrics_file,
record_check_results,
write_metrics_file,
)
NOW = datetime(2026, 5, 14, 10, 30, tzinfo=timezone.utc)
def cli_env() -> dict[str, str]:
    """Return a copy of the environment with PYTHONPATH set up for CLI runs.

    PYTHONPATH lists the local ``src`` directory first, then the
    markitect-tool sources, then whatever PYTHONPATH the test process
    inherited. The markitect-tool location defaults to the original
    developer path and can be overridden with ``MARKITECT_TOOL_SRC``.
    Entries are joined with ``os.pathsep`` so the value is valid on every
    platform.
    """
    env = os.environ.copy()
    tool_src = env.get("MARKITECT_TOOL_SRC") or "/home/worsch/markitect-tool/src"
    entries = ["src", tool_src]
    inherited = env.get("PYTHONPATH")
    if inherited:
        # Keep paths the surrounding environment already relies on.
        entries.append(inherited)
    env["PYTHONPATH"] = os.pathsep.join(entries)
    return env
def evaluation(
    artifact_id: str = "entity/division.md",
    value: float = 4.0,
) -> EntityEvaluation:
    """Build a two-score evaluation fixture stamped with ``NOW``.

    *value* sets the ``definition_precision`` score; ``source_grounding``
    is always 5.0.
    """
    precision = ScoreEntry("definition_precision", value, rationale="Grounded and clear.")
    grounding = ScoreEntry("source_grounding", 5.0)
    return EntityEvaluation(
        artifact_id=artifact_id,
        evaluator="test",
        evaluated_at=NOW,
        scores=[precision, grounding],
        notes=["reviewed"],
    )
def snapshot(snapshot_id: str, value: float, day: int) -> EvaluationSnapshot:
    """Build a one-artifact snapshot fixture created on May *day*, 2026, 09:00 UTC.

    *value* drives both the evaluation score and the ``coverage_ratio``
    metric (``value / 5.0``).
    """
    created = datetime(2026, 5, day, 9, tzinfo=timezone.utc)
    coverage = MetricValue("coverage_ratio", value / 5.0, concern="C2")
    return EvaluationSnapshot(
        snapshot_id=snapshot_id,
        created_at=created,
        schema_name="baseline",
        artifact_count=1,
        artifact_evaluations=[evaluation(value=value)],
        collection_metrics=[coverage],
        metadata={"source": "test"},
    )
def test_entity_evaluation_file_preserves_frontmatter_and_markdown_body(
    tmp_path: Path,
) -> None:
    """An evaluation written to Markdown reads back equal to the original."""
    target = tmp_path / "output" / "evaluations" / "division.md"

    write_entity_evaluation(evaluation(), target)
    restored = read_entity_evaluation(target)
    written = target.read_text(encoding="utf-8")

    # File layout: frontmatter first, then the Markdown body.
    assert written.startswith("---\n")
    assert "artifact_id: entity/division.md" in written
    assert "# Evaluation: entity/division.md" in written
    assert "## definition_precision" in written
    # Round trip.
    assert restored == evaluation()
    assert restored.scores[0].rationale == "Grounded and clear."
def test_snapshot_and_history_round_trip_and_diff_from_named_refs(tmp_path: Path) -> None:
    """Snapshots and history survive a write/read cycle and can be diffed by ref."""
    older = snapshot("snap-a", 3.0, 1)
    newer = snapshot("snap-b", 4.0, 2)
    metrics_dir = tmp_path / "output" / "metrics"
    snapshot_path = metrics_dir / "snap-a.yaml"
    history_path = metrics_dir / "history.yaml"

    write_snapshot(older, snapshot_path)
    for item in (older, newer):
        append_to_history(item, history_path)
    history = read_history(history_path)

    # One side is resolved by snapshot id, the other by date.
    before = find_snapshot(history, "snap-a")
    after = find_snapshot(history, "2026-05-02")
    diff = before.diff(after)

    assert read_snapshot(snapshot_path) == older
    assert [item.snapshot_id for item in history] == ["snap-a", "snap-b"]
    assert get_history(tmp_path) == history
    assert metric_trend(history, "coverage_ratio") == [
        {"date": "2026-05-01T09:00:00+00:00", "value": 0.6},
        {"date": "2026-05-02T09:00:00+00:00", "value": 0.8},
    ]
    assert diff.score_changes[0].artifact_id == "entity/division.md"
    assert diff.metric_changes[0].name == "coverage_ratio"
def test_metrics_file_preserves_structured_values_and_recording_merges(
    tmp_path: Path,
) -> None:
    """Recording check results merges into, rather than replaces, stored metrics."""
    infospace = create_infospace(tmp_path, "pilot", name="Pilot")
    chapter = tmp_path / "chapter.md"
    chapter.write_text("# Chapter\n\nSource text.\n", encoding="utf-8")
    add_artifact(infospace.root, chapter, kind="source", title="Chapter")

    # Add viability thresholds to the infospace config.
    config_path = infospace.root / "infospace.yaml"
    config = yaml.safe_load(config_path.read_text(encoding="utf-8"))
    config["viability"] = {
        "coverage_ratio": {"min": 0.5},
        "redundancy_ratio": {"max": 0.1},
    }
    config_path.write_text(yaml.safe_dump(config, sort_keys=False), encoding="utf-8")

    # Seed the metrics file with values the check run does not produce.
    metrics_path = infospace.root / "output" / "metrics" / "metrics.yaml"
    seeded = {
        "type_distribution": {"source": 1},
        "manual_metric": 7,
        "rounded": 1.1234567,
    }
    write_metrics_file(seeded, metrics_path)

    checks = run_collection_checks(load_infospace(infospace.root).artifacts)
    report = record_check_results(
        infospace.root,
        checks,
        artifact_evaluations=[evaluation()],
    )
    metrics = read_metrics_file(metrics_path)

    assert metrics["type_distribution"] == {"source": 1}
    assert metrics["manual_metric"] == 7
    assert metrics["rounded"] == 1.123457
    assert metrics["per_artifact_mean"] == 4.5
    assert metrics["coverage_ratio"] == 1.0
    assert report.viability is not None
    assert report.viability["passed"] is True
def test_cli_history_history_diff_and_metrics_json(tmp_path: Path) -> None:
    """The history, history-diff and metrics subcommands exit 0 and print JSON."""
    infospace = create_infospace(tmp_path, "pilot", name="Pilot")
    chapter = tmp_path / "chapter.md"
    chapter.write_text("# Chapter\n\nSource text.\n", encoding="utf-8")
    add_artifact(infospace.root, chapter, kind="source", title="Chapter")
    history_path = infospace.root / "output" / "metrics" / "history.yaml"
    append_to_history(snapshot("snap-a", 3.0, 1), history_path)
    append_to_history(snapshot("snap-b", 4.0, 2), history_path)

    def run_cli(*arguments: str):
        """Run ``python -m infospace_bench`` with *arguments* and capture output."""
        return subprocess.run(
            [sys.executable, "-m", "infospace_bench", *arguments],
            check=False,
            env=cli_env(),
            text=True,
            capture_output=True,
        )

    root = str(infospace.root)
    history = run_cli("history", root)
    diff = run_cli("history-diff", root, "snap-a", "snap-b")
    metrics = run_cli("metrics", root)

    assert history.returncode == 0, history.stderr
    assert diff.returncode == 0, diff.stderr
    assert metrics.returncode == 0, metrics.stderr
    assert json.loads(history.stdout)["history"][1]["snapshot_id"] == "snap-b"
    assert json.loads(diff.stdout)["diff"]["after_id"] == "snap-b"
    assert json.loads(metrics.stdout)["snapshot"]["artifact_count"] == 1

View File

@@ -4,7 +4,7 @@ type: workplan
title: "Evaluation History And Metrics Parity"
domain: markitect
repo: infospace-bench
status: planned
status: completed
owner: markitect
topic_slug: markitect
created: "2026-05-14"
@@ -26,7 +26,7 @@ infospace evaluation history and metrics behavior.
```task
id: IB-WP-0008-T01
status: todo
status: done
priority: high
state_hub_task_id: "95b48ad3-c4d1-442c-9bc0-7591d948d23e"
```
@@ -39,7 +39,7 @@ state_hub_task_id: "95b48ad3-c4d1-442c-9bc0-7591d948d23e"
```task
id: IB-WP-0008-T02
status: todo
status: done
priority: high
state_hub_task_id: "b4800ba8-5b86-44bb-bf47-e893bae36b22"
```
@@ -52,7 +52,7 @@ state_hub_task_id: "b4800ba8-5b86-44bb-bf47-e893bae36b22"
```task
id: IB-WP-0008-T03
status: todo
status: done
priority: high
state_hub_task_id: "7abcbd63-0147-4ae8-85f0-4af51882f476"
```
@@ -65,7 +65,7 @@ state_hub_task_id: "7abcbd63-0147-4ae8-85f0-4af51882f476"
```task
id: IB-WP-0008-T04
status: todo
status: done
priority: medium
state_hub_task_id: "675d1d45-39d9-4ddd-9ab7-5d7de8a0f601"
```