From 7f54dec585d844e22eadda69119f02241ad893e6 Mon Sep 17 00:00:00 2001 From: tegwick Date: Thu, 14 May 2026 15:35:04 +0200 Subject: [PATCH] eval history and metrics --- README.md | 1 + docs/evaluation-history-and-metrics.md | 43 +++ src/infospace_bench/__init__.py | 30 +++ src/infospace_bench/cli.py | 54 ++++ src/infospace_bench/evaluation.py | 117 +++++++- src/infospace_bench/evaluation_io.py | 181 +++++++++++++ src/infospace_bench/history.py | 254 ++++++++++++++++++ tests/test_evaluation_history.py | 196 ++++++++++++++ ...-0008-evaluation-history-metrics-parity.md | 10 +- 9 files changed, 870 insertions(+), 16 deletions(-) create mode 100644 docs/evaluation-history-and-metrics.md create mode 100644 src/infospace_bench/evaluation_io.py create mode 100644 src/infospace_bench/history.py create mode 100644 tests/test_evaluation_history.py diff --git a/README.md b/README.md index f9bd9ba..af1a13f 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,7 @@ Start with: - `docs/markitect-main-scope-assessment.md` - `docs/markitect-tool-adapter.md` - `docs/entity-relation-model.md` +- `docs/evaluation-history-and-metrics.md` - `docs/orthogonal-successor-roadmap.md` - `docs/legacy-infospace-feature-inventory.md` - `docs/successor-boundary-interface-map.md` diff --git a/docs/evaluation-history-and-metrics.md b/docs/evaluation-history-and-metrics.md new file mode 100644 index 0000000..ac25c0d --- /dev/null +++ b/docs/evaluation-history-and-metrics.md @@ -0,0 +1,43 @@ +# Evaluation History And Metrics + +`infospace-bench` keeps evaluation history as committed, inspectable files under +each infospace root. This replaces the legacy `markitect-project` history +workflow while retaining the useful behaviors: Markdown evaluation files, +append-only snapshot history, metric merging, and viability checks. + +## Files + +- `output/evaluations/*.md`: per-artifact evaluation files with YAML + frontmatter and a human-readable Markdown body. 
+- `output/metrics/metrics.yaml`: latest merged metrics. Collection metrics, + evaluation-derived metrics, and structured non-numeric values are preserved. +- `output/metrics/history.yaml`: append-only list of evaluation snapshots. +- `output/metrics/snapshots/<snapshot_id>.yaml`: named snapshot copies for + reproducible diffs. +- `output/metrics/viability.yaml`: structured viability report generated from + `infospace.yaml` thresholds and the current metrics file. + +## Replacement Mapping + +The old infospace history code used entity-oriented names such as +`entity_count`, `entity_evaluations`, and `entity_slug`. The successor model +uses artifact-oriented names: + +- `artifact_count` replaces `entity_count` +- `artifact_evaluations` replaces `entity_evaluations` +- `artifact_id` replaces `entity_slug` + +Readers accept the old snapshot aliases where practical so legacy fixtures can +be inspected, but new files should use the artifact-oriented vocabulary. + +## CLI + +```bash +python3 -m infospace_bench metrics infospaces/bootstrap-pilot +python3 -m infospace_bench history infospaces/bootstrap-pilot +python3 -m infospace_bench history infospaces/bootstrap-pilot --metric coverage_ratio +python3 -m infospace_bench history-diff infospaces/bootstrap-pilot snap-a snap-b +``` + +Snapshot references may be exact snapshot IDs or ISO-like dates such as +`2026-05-14`. Date references resolve to the nearest snapshot in the history. 
diff --git a/src/infospace_bench/__init__.py b/src/infospace_bench/__init__.py index 9a8ffbf..5523211 100644 --- a/src/infospace_bench/__init__.py +++ b/src/infospace_bench/__init__.py @@ -1,5 +1,22 @@ from .errors import InfospaceError from .evaluation import EntityEvaluation, EvaluationSnapshot, MetricValue, ScoreEntry +from .evaluation_io import ( + append_to_history, + read_entity_evaluation, + read_history, + read_snapshot, + write_entity_evaluation, + write_snapshot, +) +from .history import ( + find_snapshot, + get_history, + get_latest_snapshot, + metric_trend, + read_metrics_file, + record_check_results, + write_metrics_file, +) from .lifecycle import add_artifact, create_infospace, load_infospace from .models import ( DisciplineBinding, @@ -26,8 +43,21 @@ __all__ = [ "TopicConfig", "ViabilityThreshold", "add_artifact", + "append_to_history", "create_infospace", + "find_snapshot", + "get_history", + "get_latest_snapshot", "list_entities", "list_relations", "load_infospace", + "metric_trend", + "read_entity_evaluation", + "read_history", + "read_metrics_file", + "read_snapshot", + "record_check_results", + "write_entity_evaluation", + "write_metrics_file", + "write_snapshot", ] diff --git a/src/infospace_bench/cli.py b/src/infospace_bench/cli.py index 7f47df8..eb1f247 100644 --- a/src/infospace_bench/cli.py +++ b/src/infospace_bench/cli.py @@ -5,7 +5,9 @@ import json import sys from pathlib import Path +from .checks import run_collection_checks from .errors import InfospaceError +from .history import find_snapshot, get_history, metric_trend, record_check_results from .lifecycle import add_artifact, create_infospace, load_infospace from .markdown_adapter import validate_infospace_artifacts from .semantics import list_entities, list_relations @@ -42,6 +44,24 @@ def build_parser() -> argparse.ArgumentParser: relations = sub.add_parser("relations", help="List parsed relation artifacts") relations.add_argument("root") + history = sub.add_parser("history", 
help="List evaluation snapshot history") + history.add_argument("root") + history.add_argument("--metric", default="") + + history_diff = sub.add_parser( + "history-diff", + help="Diff two evaluation snapshots by snapshot ID or date", + ) + history_diff.add_argument("root") + history_diff.add_argument("before") + history_diff.add_argument("after") + + metrics = sub.add_parser( + "metrics", + help="Run collection checks and persist metrics/history", + ) + metrics.add_argument("root") + return parser @@ -96,6 +116,40 @@ def main(argv: list[str] | None = None) -> int: ] } ) + elif args.command == "history": + history = get_history(Path(args.root)) + if args.metric: + _write_json( + { + "metric": args.metric, + "trend": metric_trend(history, args.metric), + } + ) + else: + _write_json({"history": [item.to_dict() for item in history]}) + elif args.command == "history-diff": + history = get_history(Path(args.root)) + before = find_snapshot(history, args.before) + after = find_snapshot(history, args.after) + if before is None or after is None: + missing = [] + if before is None: + missing.append(args.before) + if after is None: + missing.append(args.after) + raise InfospaceError( + "missing_snapshot", + "Could not resolve requested snapshot reference", + {"missing_refs": missing}, + ) + _write_json({"diff": before.diff(after).to_dict()}) + elif args.command == "metrics": + infospace = load_infospace(Path(args.root)) + result = record_check_results( + infospace.root, + run_collection_checks(infospace.artifacts), + ) + _write_json(result.to_dict()) else: parser.error(f"Unhandled command: {args.command}") except InfospaceError as exc: diff --git a/src/infospace_bench/evaluation.py b/src/infospace_bench/evaluation.py index 11ed2d7..0149a9b 100644 --- a/src/infospace_bench/evaluation.py +++ b/src/infospace_bench/evaluation.py @@ -40,6 +40,11 @@ class EntityEvaluation: evaluated_at: datetime notes: list[str] = field(default_factory=list) + @property + def entity_slug(self) -> 
str: + """Legacy alias for readers moving from entity-oriented history files.""" + return self.artifact_id + @property def overall_score(self) -> float: if not self.scores: @@ -102,6 +107,16 @@ class EvaluationSnapshot: collection_metrics: list[MetricValue] = field(default_factory=list) metadata: dict[str, Any] = field(default_factory=dict) + @property + def entity_count(self) -> int: + """Legacy alias retained for old infospace history readers.""" + return self.artifact_count + + @property + def entity_evaluations(self) -> list[EntityEvaluation]: + """Legacy alias retained for old infospace history readers.""" + return self.artifact_evaluations + def to_dict(self) -> dict[str, Any]: return { "snapshot_id": self.snapshot_id, @@ -122,11 +137,14 @@ class EvaluationSnapshot: return cls( snapshot_id=str(data["snapshot_id"]), created_at=datetime.fromisoformat(str(data["created_at"])), - schema_name=str(data["schema_name"]), - artifact_count=int(data["artifact_count"]), + schema_name=str(data.get("schema_name") or "default"), + artifact_count=int(data.get("artifact_count", data.get("entity_count", 0))), artifact_evaluations=[ EntityEvaluation.from_dict(item) - for item in data.get("artifact_evaluations", []) + for item in data.get( + "artifact_evaluations", + data.get("entity_evaluations", []), + ) ], collection_metrics=[ MetricValue.from_dict(item) for item in data.get("collection_metrics", []) @@ -134,6 +152,9 @@ class EvaluationSnapshot: metadata=dict(data.get("metadata") or {}), ) + def diff(self, after: "EvaluationSnapshot") -> "SnapshotDiff": + return diff_snapshots(self, after) + @dataclass(frozen=True) class ScoreChange: @@ -146,6 +167,20 @@ class ScoreChange: def delta(self) -> float: return self.after - self.before + @property + def entity_slug(self) -> str: + """Legacy alias for old diff consumers.""" + return self.artifact_id + + def to_dict(self) -> dict[str, Any]: + return { + "artifact_id": self.artifact_id, + "dimension": self.dimension, + "before": 
self.before, + "after": self.after, + "delta": self.delta, + } + @dataclass(frozen=True) class MetricChange: @@ -157,6 +192,14 @@ class MetricChange: def delta(self) -> float: return self.after - self.before + def to_dict(self) -> dict[str, Any]: + return { + "name": self.name, + "before": self.before, + "after": self.after, + "delta": self.delta, + } + @dataclass(frozen=True) class SnapshotDiff: @@ -167,6 +210,51 @@ class SnapshotDiff: score_changes: list[ScoreChange] = field(default_factory=list) metric_changes: list[MetricChange] = field(default_factory=list) + @property + def added_entities(self) -> list[str]: + """Legacy alias for old history diff output.""" + return self.added_artifacts + + @property + def removed_entities(self) -> list[str]: + """Legacy alias for old history diff output.""" + return self.removed_artifacts + + def to_dict(self) -> dict[str, Any]: + return { + "before_id": self.before_id, + "after_id": self.after_id, + "added_artifacts": self.added_artifacts, + "removed_artifacts": self.removed_artifacts, + "score_changes": [change.to_dict() for change in self.score_changes], + "metric_changes": [change.to_dict() for change in self.metric_changes], + } + + def summary(self) -> str: + lines = [f"Snapshot diff: {self.before_id} -> {self.after_id}"] + if not ( + self.added_artifacts + or self.removed_artifacts + or self.score_changes + or self.metric_changes + ): + return "\n".join([*lines, "No changes."]) + for artifact_id in self.added_artifacts: + lines.append(f"Added artifact: {artifact_id}") + for artifact_id in self.removed_artifacts: + lines.append(f"Removed artifact: {artifact_id}") + for change in self.score_changes: + lines.append( + f"Score {change.artifact_id} {change.dimension}: " + f"{change.before} -> {change.after} ({change.delta:+.4f})" + ) + for change in self.metric_changes: + lines.append( + f"Metric {change.name}: " + f"{change.before} -> {change.after} ({change.delta:+.4f})" + ) + return "\n".join(lines) + def 
diff_snapshots( before: EvaluationSnapshot, @@ -174,22 +262,29 @@ def diff_snapshots( ) -> SnapshotDiff: before_scores = _score_index(before) after_scores = _score_index(after) - before_artifacts = {artifact_id for artifact_id, _ in before_scores} - after_artifacts = {artifact_id for artifact_id, _ in after_scores} + before_artifacts = { + evaluation.artifact_id for evaluation in before.artifact_evaluations + } + after_artifacts = {evaluation.artifact_id for evaluation in after.artifact_evaluations} score_changes = [ - ScoreChange(artifact_id, dimension, before_scores[key], after_scores[key]) - for key in sorted(before_scores.keys() & after_scores.keys()) + ScoreChange( + artifact_id, + dimension, + before_scores.get(key, 0.0), + after_scores.get(key, 0.0), + ) + for key in sorted(before_scores.keys() | after_scores.keys()) for artifact_id, dimension in [key] - if before_scores[key] != after_scores[key] + if before_scores.get(key) != after_scores.get(key) ] before_metrics = {metric.name: metric.value for metric in before.collection_metrics} after_metrics = {metric.name: metric.value for metric in after.collection_metrics} metric_changes = [ - MetricChange(name, before_metrics[name], after_metrics[name]) - for name in sorted(before_metrics.keys() & after_metrics.keys()) - if before_metrics[name] != after_metrics[name] + MetricChange(name, before_metrics.get(name, 0.0), after_metrics.get(name, 0.0)) + for name in sorted(before_metrics.keys() | after_metrics.keys()) + if before_metrics.get(name) != after_metrics.get(name) ] return SnapshotDiff( diff --git a/src/infospace_bench/evaluation_io.py b/src/infospace_bench/evaluation_io.py new file mode 100644 index 0000000..d6fc287 --- /dev/null +++ b/src/infospace_bench/evaluation_io.py @@ -0,0 +1,181 @@ +from __future__ import annotations + +from datetime import datetime +from pathlib import Path +from typing import Any + +import yaml + +from .errors import InfospaceError +from .evaluation import ( + EntityEvaluation, + 
EvaluationSnapshot, + ScoreEntry, + diff_snapshots, +) + +FRONTMATTER_MARKER = "---" + + +def write_entity_evaluation( + evaluation: EntityEvaluation, + path: str | Path, +) -> None: + target = Path(path) + frontmatter: dict[str, Any] = { + "artifact_id": evaluation.artifact_id, + "evaluator": evaluation.evaluator, + "evaluated_at": evaluation.evaluated_at.isoformat(), + "overall_score": round(evaluation.overall_score, 4), + "scores": [score.to_dict() for score in evaluation.scores], + } + if evaluation.notes: + frontmatter["notes"] = evaluation.notes + + lines = [ + FRONTMATTER_MARKER, + yaml.safe_dump(frontmatter, sort_keys=False).rstrip(), + FRONTMATTER_MARKER, + "", + f"# Evaluation: {evaluation.artifact_id}", + "", + ] + for score in evaluation.scores: + lines.append(f"## {score.name} - {score.value} / {score.max_value}") + lines.append("") + if score.rationale: + lines.append(score.rationale) + lines.append("") + + target.parent.mkdir(parents=True, exist_ok=True) + target.write_text("\n".join(lines), encoding="utf-8") + + +def read_entity_evaluation(path: str | Path) -> EntityEvaluation: + source = Path(path) + frontmatter, body = _read_frontmatter_markdown(source) + rationales = _parse_rationales(body) + scores = [ + _score_with_body_rationale(ScoreEntry.from_dict(item), rationales) + for item in frontmatter.get("scores", []) + ] + artifact_id = frontmatter.get("artifact_id", frontmatter.get("entity_slug")) + if not artifact_id: + raise InfospaceError( + "invalid_evaluation_file", + f"Missing artifact_id in evaluation file: {source}", + {"path": str(source)}, + ) + return EntityEvaluation( + artifact_id=str(artifact_id), + evaluator=str(frontmatter["evaluator"]), + scores=scores, + evaluated_at=datetime.fromisoformat(str(frontmatter["evaluated_at"])), + notes=list(frontmatter.get("notes") or []), + ) + + +def write_snapshot(snapshot: EvaluationSnapshot, path: str | Path) -> None: + target = Path(path) + target.parent.mkdir(parents=True, exist_ok=True) + 
target.write_text( + yaml.safe_dump(snapshot.to_dict(), sort_keys=False), + encoding="utf-8", + ) + + +def read_snapshot(path: str | Path) -> EvaluationSnapshot: + source = Path(path) + data = yaml.safe_load(source.read_text(encoding="utf-8")) + if not isinstance(data, dict): + raise InfospaceError( + "invalid_snapshot_file", + f"Expected mapping in snapshot file: {source}", + {"path": str(source)}, + ) + return EvaluationSnapshot.from_dict(data) + + +def append_to_history(snapshot: EvaluationSnapshot, history_path: str | Path) -> None: + target = Path(history_path) + existing = [item.to_dict() for item in read_history(target)] if target.exists() else [] + existing.append(snapshot.to_dict()) + target.parent.mkdir(parents=True, exist_ok=True) + target.write_text( + yaml.safe_dump(existing, sort_keys=False), + encoding="utf-8", + ) + + +def read_history(history_path: str | Path) -> list[EvaluationSnapshot]: + source = Path(history_path) + if not source.is_file(): + return [] + data = yaml.safe_load(source.read_text(encoding="utf-8")) + if data is None: + return [] + if not isinstance(data, list): + raise InfospaceError( + "invalid_history_file", + f"Expected list in history file: {source}", + {"path": str(source)}, + ) + return [EvaluationSnapshot.from_dict(item) for item in data] + + +def _read_frontmatter_markdown(path: Path) -> tuple[dict[str, Any], str]: + text = path.read_text(encoding="utf-8") + if not text.startswith(f"{FRONTMATTER_MARKER}\n"): + raise InfospaceError( + "invalid_evaluation_file", + f"Missing YAML frontmatter in evaluation file: {path}", + {"path": str(path)}, + ) + end = text.find(f"\n{FRONTMATTER_MARKER}\n", len(FRONTMATTER_MARKER) + 1) + if end == -1: + raise InfospaceError( + "invalid_evaluation_file", + f"Unclosed YAML frontmatter in evaluation file: {path}", + {"path": str(path)}, + ) + raw = text[len(FRONTMATTER_MARKER) + 1 : end] + body = text[end + len(FRONTMATTER_MARKER) + 2 :] + data = yaml.safe_load(raw) + if not isinstance(data, 
dict): + raise InfospaceError( + "invalid_evaluation_file", + f"Expected mapping frontmatter in evaluation file: {path}", + {"path": str(path)}, + ) + return data, body + + +def _parse_rationales(body: str) -> dict[str, str]: + rationales: dict[str, str] = {} + current_name: str | None = None + current_lines: list[str] = [] + for line in body.splitlines(): + if line.startswith("## "): + if current_name is not None: + rationales[current_name] = "\n".join(current_lines).strip() + heading = line[3:].strip() + current_name = heading.split(" - ", maxsplit=1)[0].strip() + current_lines = [] + elif current_name is not None: + current_lines.append(line) + if current_name is not None: + rationales[current_name] = "\n".join(current_lines).strip() + return rationales + + +def _score_with_body_rationale( + score: ScoreEntry, + rationales: dict[str, str], +) -> ScoreEntry: + rationale = rationales.get(score.name, score.rationale) + return ScoreEntry( + name=score.name, + value=score.value, + max_value=score.max_value, + rationale=rationale, + ) diff --git a/src/infospace_bench/history.py b/src/infospace_bench/history.py new file mode 100644 index 0000000..22d665e --- /dev/null +++ b/src/infospace_bench/history.py @@ -0,0 +1,254 @@ +from __future__ import annotations + +import uuid +from dataclasses import dataclass +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +import yaml + +from .checks import CollectionCheckReport +from .evaluation import EntityEvaluation, EvaluationSnapshot, MetricValue +from .evaluation_io import append_to_history, read_history, write_snapshot +from .lifecycle import load_infospace +from .viability import evaluate_viability + +METRICS_PATH = Path("output/metrics/metrics.yaml") +HISTORY_PATH = Path("output/metrics/history.yaml") +VIABILITY_PATH = Path("output/metrics/viability.yaml") +SNAPSHOT_DIR = Path("output/metrics/snapshots") + + +@dataclass(frozen=True) +class RecordedCheckResult: + snapshot: 
EvaluationSnapshot + metrics: dict[str, Any] + viability: dict[str, Any] | None = None + + def to_dict(self) -> dict[str, Any]: + return { + "snapshot": self.snapshot.to_dict(), + "metrics": self.metrics, + "viability": self.viability, + } + + +def snapshot_from_checks( + check_report: CollectionCheckReport, + artifact_count: int, + *, + schema_name: str = "default", + metadata: dict[str, Any] | None = None, + artifact_evaluations: list[EntityEvaluation] | None = None, +) -> EvaluationSnapshot: + metrics = _numeric_metrics(check_report.metrics) + collection_metrics = [ + MetricValue(name=name, value=value, concern=_concern_for_metric(name)) + for name, value in sorted(metrics.items()) + ] + collection_metrics.extend( + MetricValue(name=name, value=value, concern="evaluation") + for name, value in sorted( + _evaluation_metrics(artifact_evaluations or []).items() + ) + ) + return EvaluationSnapshot( + snapshot_id=str(uuid.uuid4())[:8], + created_at=datetime.now(timezone.utc), + schema_name=schema_name, + artifact_count=artifact_count, + artifact_evaluations=artifact_evaluations or [], + collection_metrics=collection_metrics, + metadata=metadata or {}, + ) + + +def write_metrics_file(metrics: dict[str, Any], path: str | Path) -> None: + target = Path(path) + target.parent.mkdir(parents=True, exist_ok=True) + target.write_text( + yaml.safe_dump( + { + key: _normalize_metric_value(value) + for key, value in sorted(metrics.items()) + }, + sort_keys=True, + ), + encoding="utf-8", + ) + + +def read_metrics_file(path: str | Path) -> dict[str, Any]: + source = Path(path) + if not source.is_file(): + return {} + data = yaml.safe_load(source.read_text(encoding="utf-8")) + return data if isinstance(data, dict) else {} + + +def record_check_results( + root: str | Path, + check_report: CollectionCheckReport, + *, + artifact_evaluations: list[EntityEvaluation] | None = None, + schema_name: str = "default", + metadata: dict[str, Any] | None = None, +) -> RecordedCheckResult: + 
infospace = load_infospace(root) + artifact_count = int( + check_report.details.get("artifact_count", len(infospace.artifacts)) + ) + snapshot = snapshot_from_checks( + check_report, + artifact_count, + schema_name=schema_name, + metadata={"source": "collection-checks", **(metadata or {})}, + artifact_evaluations=artifact_evaluations, + ) + metrics_file = infospace.root / METRICS_PATH + existing = read_metrics_file(metrics_file) + merged = { + **existing, + **check_report.metrics, + **_evaluation_metrics(artifact_evaluations or []), + } + write_metrics_file(merged, metrics_file) + + history_path = infospace.root / HISTORY_PATH + append_to_history(snapshot, history_path) + write_snapshot( + snapshot, + infospace.root / SNAPSHOT_DIR / f"{snapshot.snapshot_id}.yaml", + ) + + viability = build_viability_report(infospace.root, merged) + write_viability_report(infospace.root, viability) + return RecordedCheckResult(snapshot=snapshot, metrics=merged, viability=viability) + + +def get_history(root: str | Path) -> list[EvaluationSnapshot]: + return read_history(Path(root) / HISTORY_PATH) + + +def get_latest_snapshot(root: str | Path) -> EvaluationSnapshot | None: + history = get_history(root) + return history[-1] if history else None + + +def find_snapshot( + history: list[EvaluationSnapshot], + ref: str, +) -> EvaluationSnapshot | None: + for snapshot in history: + if snapshot.snapshot_id == ref: + return snapshot + return find_snapshot_by_date(history, ref) + + +def find_snapshot_by_date( + history: list[EvaluationSnapshot], + date_ref: str, +) -> EvaluationSnapshot | None: + if not history: + return None + try: + target = datetime.fromisoformat( + date_ref if "T" in date_ref else f"{date_ref}T00:00:00" + ) + except ValueError: + return None + if target.tzinfo is None: + target = target.replace(tzinfo=timezone.utc) + + def delta(snapshot: EvaluationSnapshot) -> float: + created_at = snapshot.created_at + if created_at.tzinfo is None: + created_at = 
created_at.replace(tzinfo=timezone.utc) + return abs((created_at - target).total_seconds()) + + return min(history, key=delta) + + +def metric_trend( + history: list[EvaluationSnapshot], + metric_name: str, +) -> list[dict[str, Any]]: + trend: list[dict[str, Any]] = [] + for snapshot in history: + for metric in snapshot.collection_metrics: + if metric.name == metric_name: + trend.append( + {"date": snapshot.created_at.isoformat(), "value": metric.value} + ) + break + return trend + + +def build_viability_report( + root: str | Path, + metrics: dict[str, Any] | None = None, +) -> dict[str, Any]: + infospace = load_infospace(root) + current = ( + metrics + if metrics is not None + else read_metrics_file(infospace.root / METRICS_PATH) + ) + numeric = _numeric_metrics(current) + report = evaluate_viability(numeric, infospace.config.viability) + return { + "passed": report.passed, + "results": { + name: { + "metric": result.metric, + "value": result.value, + "threshold": result.threshold.to_dict(), + "passed": result.passed, + } + for name, result in report.results.items() + }, + } + + +def write_viability_report(root: str | Path, report: dict[str, Any]) -> None: + target = Path(root) / VIABILITY_PATH + target.parent.mkdir(parents=True, exist_ok=True) + target.write_text(yaml.safe_dump(report, sort_keys=False), encoding="utf-8") + + +def _evaluation_metrics(evaluations: list[EntityEvaluation]) -> dict[str, float | int]: + if not evaluations: + return {} + return { + "evaluated_artifact_count": len(evaluations), + "per_artifact_mean": sum(item.overall_score for item in evaluations) + / len(evaluations), + } + + +def _numeric_metrics(metrics: dict[str, Any]) -> dict[str, float]: + return { + str(name): float(value) + for name, value in metrics.items() + if isinstance(value, (int, float)) and not isinstance(value, bool) + } + + +def _normalize_metric_value(value: Any) -> Any: + if isinstance(value, bool): + return value + if isinstance(value, float): + return round(value, 
6) + return value + + +def _concern_for_metric(name: str) -> str: + mapping = { + "redundancy_ratio": "C1", + "coverage_ratio": "C2", + "coherence_components": "C3", + "consistency_cycles": "C4", + "granularity_entropy": "C5", + } + return mapping.get(name, "") diff --git a/tests/test_evaluation_history.py b/tests/test_evaluation_history.py new file mode 100644 index 0000000..d38218a --- /dev/null +++ b/tests/test_evaluation_history.py @@ -0,0 +1,196 @@ +import json +import os +import subprocess +import sys +from datetime import datetime, timezone +from pathlib import Path + +import yaml + +from infospace_bench import add_artifact, create_infospace, load_infospace +from infospace_bench.checks import run_collection_checks +from infospace_bench.evaluation import ( + EntityEvaluation, + EvaluationSnapshot, + MetricValue, + ScoreEntry, +) +from infospace_bench.evaluation_io import ( + append_to_history, + read_entity_evaluation, + read_history, + read_snapshot, + write_entity_evaluation, + write_snapshot, +) +from infospace_bench.history import ( + find_snapshot, + get_history, + metric_trend, + read_metrics_file, + record_check_results, + write_metrics_file, +) + + +NOW = datetime(2026, 5, 14, 10, 30, tzinfo=timezone.utc) + + +def cli_env() -> dict[str, str]: + env = os.environ.copy() + env["PYTHONPATH"] = "src:/home/worsch/markitect-tool/src" + return env + + +def evaluation( + artifact_id: str = "entity/division.md", + value: float = 4.0, +) -> EntityEvaluation: + return EntityEvaluation( + artifact_id=artifact_id, + evaluator="test", + evaluated_at=NOW, + scores=[ + ScoreEntry("definition_precision", value, rationale="Grounded and clear."), + ScoreEntry("source_grounding", 5.0), + ], + notes=["reviewed"], + ) + + +def snapshot(snapshot_id: str, value: float, day: int) -> EvaluationSnapshot: + return EvaluationSnapshot( + snapshot_id=snapshot_id, + created_at=datetime(2026, 5, day, 9, tzinfo=timezone.utc), + schema_name="baseline", + artifact_count=1, + 
artifact_evaluations=[evaluation(value=value)], + collection_metrics=[MetricValue("coverage_ratio", value / 5.0, concern="C2")], + metadata={"source": "test"}, + ) + + +def test_entity_evaluation_file_preserves_frontmatter_and_markdown_body( + tmp_path: Path, +) -> None: + path = tmp_path / "output" / "evaluations" / "division.md" + + write_entity_evaluation(evaluation(), path) + restored = read_entity_evaluation(path) + text = path.read_text(encoding="utf-8") + + assert text.startswith("---\n") + assert "artifact_id: entity/division.md" in text + assert "# Evaluation: entity/division.md" in text + assert "## definition_precision" in text + assert restored == evaluation() + assert restored.scores[0].rationale == "Grounded and clear." + + +def test_snapshot_and_history_round_trip_and_diff_from_named_refs(tmp_path: Path) -> None: + first = snapshot("snap-a", 3.0, 1) + second = snapshot("snap-b", 4.0, 2) + snapshot_path = tmp_path / "output" / "metrics" / "snap-a.yaml" + history_path = tmp_path / "output" / "metrics" / "history.yaml" + + write_snapshot(first, snapshot_path) + append_to_history(first, history_path) + append_to_history(second, history_path) + + history = read_history(history_path) + diff = find_snapshot(history, "snap-a").diff(find_snapshot(history, "2026-05-02")) + + assert read_snapshot(snapshot_path) == first + assert [item.snapshot_id for item in history] == ["snap-a", "snap-b"] + assert get_history(tmp_path) == history + assert metric_trend(history, "coverage_ratio") == [ + {"date": "2026-05-01T09:00:00+00:00", "value": 0.6}, + {"date": "2026-05-02T09:00:00+00:00", "value": 0.8}, + ] + assert diff.score_changes[0].artifact_id == "entity/division.md" + assert diff.metric_changes[0].name == "coverage_ratio" + + +def test_metrics_file_preserves_structured_values_and_recording_merges( + tmp_path: Path, +) -> None: + infospace = create_infospace(tmp_path, "pilot", name="Pilot") + source = tmp_path / "chapter.md" + source.write_text("# Chapter\n\nSource 
text.\n", encoding="utf-8") + add_artifact(infospace.root, source, kind="source", title="Chapter") + + config_path = infospace.root / "infospace.yaml" + config = yaml.safe_load(config_path.read_text(encoding="utf-8")) + config["viability"] = { + "coverage_ratio": {"min": 0.5}, + "redundancy_ratio": {"max": 0.1}, + } + config_path.write_text(yaml.safe_dump(config, sort_keys=False), encoding="utf-8") + + metrics_path = infospace.root / "output" / "metrics" / "metrics.yaml" + write_metrics_file( + {"type_distribution": {"source": 1}, "manual_metric": 7, "rounded": 1.1234567}, + metrics_path, + ) + + report = record_check_results( + infospace.root, + run_collection_checks(load_infospace(infospace.root).artifacts), + artifact_evaluations=[evaluation()], + ) + metrics = read_metrics_file(metrics_path) + + assert metrics["type_distribution"] == {"source": 1} + assert metrics["manual_metric"] == 7 + assert metrics["rounded"] == 1.123457 + assert metrics["per_artifact_mean"] == 4.5 + assert metrics["coverage_ratio"] == 1.0 + assert report.viability is not None + assert report.viability["passed"] is True + + +def test_cli_history_history_diff_and_metrics_json(tmp_path: Path) -> None: + infospace = create_infospace(tmp_path, "pilot", name="Pilot") + source = tmp_path / "chapter.md" + source.write_text("# Chapter\n\nSource text.\n", encoding="utf-8") + add_artifact(infospace.root, source, kind="source", title="Chapter") + history_path = infospace.root / "output" / "metrics" / "history.yaml" + append_to_history(snapshot("snap-a", 3.0, 1), history_path) + append_to_history(snapshot("snap-b", 4.0, 2), history_path) + + history = subprocess.run( + [sys.executable, "-m", "infospace_bench", "history", str(infospace.root)], + check=False, + env=cli_env(), + text=True, + capture_output=True, + ) + diff = subprocess.run( + [ + sys.executable, + "-m", + "infospace_bench", + "history-diff", + str(infospace.root), + "snap-a", + "snap-b", + ], + check=False, + env=cli_env(), + text=True, + 
capture_output=True, + ) + metrics = subprocess.run( + [sys.executable, "-m", "infospace_bench", "metrics", str(infospace.root)], + check=False, + env=cli_env(), + text=True, + capture_output=True, + ) + + assert history.returncode == 0, history.stderr + assert diff.returncode == 0, diff.stderr + assert metrics.returncode == 0, metrics.stderr + assert json.loads(history.stdout)["history"][1]["snapshot_id"] == "snap-b" + assert json.loads(diff.stdout)["diff"]["after_id"] == "snap-b" + assert json.loads(metrics.stdout)["snapshot"]["artifact_count"] == 1 diff --git a/workplans/IB-WP-0008-evaluation-history-metrics-parity.md b/workplans/IB-WP-0008-evaluation-history-metrics-parity.md index 2d36d8d..5f68592 100644 --- a/workplans/IB-WP-0008-evaluation-history-metrics-parity.md +++ b/workplans/IB-WP-0008-evaluation-history-metrics-parity.md @@ -4,7 +4,7 @@ type: workplan title: "Evaluation History And Metrics Parity" domain: markitect repo: infospace-bench -status: planned +status: completed owner: markitect topic_slug: markitect created: "2026-05-14" @@ -26,7 +26,7 @@ infospace evaluation history and metrics behavior. ```task id: IB-WP-0008-T01 -status: todo +status: done priority: high state_hub_task_id: "95b48ad3-c4d1-442c-9bc0-7591d948d23e" ``` @@ -39,7 +39,7 @@ state_hub_task_id: "95b48ad3-c4d1-442c-9bc0-7591d948d23e" ```task id: IB-WP-0008-T02 -status: todo +status: done priority: high state_hub_task_id: "b4800ba8-5b86-44bb-bf47-e893bae36b22" ``` @@ -52,7 +52,7 @@ state_hub_task_id: "b4800ba8-5b86-44bb-bf47-e893bae36b22" ```task id: IB-WP-0008-T03 -status: todo +status: done priority: high state_hub_task_id: "7abcbd63-0147-4ae8-85f0-4af51882f476" ``` @@ -65,7 +65,7 @@ state_hub_task_id: "7abcbd63-0147-4ae8-85f0-4af51882f476" ```task id: IB-WP-0008-T04 -status: todo +status: done priority: medium state_hub_task_id: "675d1d45-39d9-4ddd-9ab7-5d7de8a0f601" ```