generated from coulomb/repo-seed
eval history and metrics
This commit is contained in:
@@ -20,6 +20,7 @@ Start with:
|
||||
- `docs/markitect-main-scope-assessment.md`
|
||||
- `docs/markitect-tool-adapter.md`
|
||||
- `docs/entity-relation-model.md`
|
||||
- `docs/evaluation-history-and-metrics.md`
|
||||
- `docs/orthogonal-successor-roadmap.md`
|
||||
- `docs/legacy-infospace-feature-inventory.md`
|
||||
- `docs/successor-boundary-interface-map.md`
|
||||
|
||||
43
docs/evaluation-history-and-metrics.md
Normal file
43
docs/evaluation-history-and-metrics.md
Normal file
@@ -0,0 +1,43 @@
|
||||
# Evaluation History And Metrics
|
||||
|
||||
`infospace-bench` keeps evaluation history as committed, inspectable files under
|
||||
each infospace root. This replaces the legacy `markitect-project` history
|
||||
workflow while retaining the useful behaviors: Markdown evaluation files,
|
||||
append-only snapshot history, metric merging, and viability checks.
|
||||
|
||||
## Files
|
||||
|
||||
- `output/evaluations/*.md`: per-artifact evaluation files with YAML
|
||||
frontmatter and a human-readable Markdown body.
|
||||
- `output/metrics/metrics.yaml`: latest merged metrics. Collection metrics,
|
||||
evaluation-derived metrics, and structured non-numeric values are preserved.
|
||||
- `output/metrics/history.yaml`: append-only list of evaluation snapshots.
|
||||
- `output/metrics/snapshots/<snapshot-id>.yaml`: named snapshot copies for
|
||||
reproducible diffs.
|
||||
- `output/metrics/viability.yaml`: structured viability report generated from
|
||||
`infospace.yaml` thresholds and the current metrics file.
|
||||
|
||||
## Replacement Mapping
|
||||
|
||||
The old infospace history code used entity-oriented names such as
|
||||
`entity_count`, `entity_evaluations`, and `entity_slug`. The successor model
|
||||
uses artifact-oriented names:
|
||||
|
||||
- `artifact_count` replaces `entity_count`
|
||||
- `artifact_evaluations` replaces `entity_evaluations`
|
||||
- `artifact_id` replaces `entity_slug`
|
||||
|
||||
Readers accept the old snapshot aliases where practical so legacy fixtures can
|
||||
be inspected, but new files should use the artifact-oriented vocabulary.
|
||||
|
||||
## CLI
|
||||
|
||||
```bash
|
||||
python3 -m infospace_bench metrics infospaces/bootstrap-pilot
|
||||
python3 -m infospace_bench history infospaces/bootstrap-pilot
|
||||
python3 -m infospace_bench history infospaces/bootstrap-pilot --metric coverage_ratio
|
||||
python3 -m infospace_bench history-diff infospaces/bootstrap-pilot snap-a snap-b
|
||||
```
|
||||
|
||||
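Snapshot references may be exact snapshot IDs or ISO-like dates such as
|
||||
`2026-05-14`. Exact IDs are tried first. A date reference without a time is read as midnight, a reference without a timezone is read as UTC, and it resolves to the snapshot whose creation time is nearest.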
|
||||
@@ -1,5 +1,22 @@
|
||||
from .errors import InfospaceError
|
||||
from .evaluation import EntityEvaluation, EvaluationSnapshot, MetricValue, ScoreEntry
|
||||
from .evaluation_io import (
|
||||
append_to_history,
|
||||
read_entity_evaluation,
|
||||
read_history,
|
||||
read_snapshot,
|
||||
write_entity_evaluation,
|
||||
write_snapshot,
|
||||
)
|
||||
from .history import (
|
||||
find_snapshot,
|
||||
get_history,
|
||||
get_latest_snapshot,
|
||||
metric_trend,
|
||||
read_metrics_file,
|
||||
record_check_results,
|
||||
write_metrics_file,
|
||||
)
|
||||
from .lifecycle import add_artifact, create_infospace, load_infospace
|
||||
from .models import (
|
||||
DisciplineBinding,
|
||||
@@ -26,8 +43,21 @@ __all__ = [
|
||||
"TopicConfig",
|
||||
"ViabilityThreshold",
|
||||
"add_artifact",
|
||||
"append_to_history",
|
||||
"create_infospace",
|
||||
"find_snapshot",
|
||||
"get_history",
|
||||
"get_latest_snapshot",
|
||||
"list_entities",
|
||||
"list_relations",
|
||||
"load_infospace",
|
||||
"metric_trend",
|
||||
"read_entity_evaluation",
|
||||
"read_history",
|
||||
"read_metrics_file",
|
||||
"read_snapshot",
|
||||
"record_check_results",
|
||||
"write_entity_evaluation",
|
||||
"write_metrics_file",
|
||||
"write_snapshot",
|
||||
]
|
||||
|
||||
@@ -5,7 +5,9 @@ import json
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from .checks import run_collection_checks
|
||||
from .errors import InfospaceError
|
||||
from .history import find_snapshot, get_history, metric_trend, record_check_results
|
||||
from .lifecycle import add_artifact, create_infospace, load_infospace
|
||||
from .markdown_adapter import validate_infospace_artifacts
|
||||
from .semantics import list_entities, list_relations
|
||||
@@ -42,6 +44,24 @@ def build_parser() -> argparse.ArgumentParser:
|
||||
relations = sub.add_parser("relations", help="List parsed relation artifacts")
|
||||
relations.add_argument("root")
|
||||
|
||||
history = sub.add_parser("history", help="List evaluation snapshot history")
|
||||
history.add_argument("root")
|
||||
history.add_argument("--metric", default="")
|
||||
|
||||
history_diff = sub.add_parser(
|
||||
"history-diff",
|
||||
help="Diff two evaluation snapshots by snapshot ID or date",
|
||||
)
|
||||
history_diff.add_argument("root")
|
||||
history_diff.add_argument("before")
|
||||
history_diff.add_argument("after")
|
||||
|
||||
metrics = sub.add_parser(
|
||||
"metrics",
|
||||
help="Run collection checks and persist metrics/history",
|
||||
)
|
||||
metrics.add_argument("root")
|
||||
|
||||
return parser
|
||||
|
||||
|
||||
@@ -96,6 +116,40 @@ def main(argv: list[str] | None = None) -> int:
|
||||
]
|
||||
}
|
||||
)
|
||||
elif args.command == "history":
|
||||
history = get_history(Path(args.root))
|
||||
if args.metric:
|
||||
_write_json(
|
||||
{
|
||||
"metric": args.metric,
|
||||
"trend": metric_trend(history, args.metric),
|
||||
}
|
||||
)
|
||||
else:
|
||||
_write_json({"history": [item.to_dict() for item in history]})
|
||||
elif args.command == "history-diff":
|
||||
history = get_history(Path(args.root))
|
||||
before = find_snapshot(history, args.before)
|
||||
after = find_snapshot(history, args.after)
|
||||
if before is None or after is None:
|
||||
missing = []
|
||||
if before is None:
|
||||
missing.append(args.before)
|
||||
if after is None:
|
||||
missing.append(args.after)
|
||||
raise InfospaceError(
|
||||
"missing_snapshot",
|
||||
"Could not resolve requested snapshot reference",
|
||||
{"missing_refs": missing},
|
||||
)
|
||||
_write_json({"diff": before.diff(after).to_dict()})
|
||||
elif args.command == "metrics":
|
||||
infospace = load_infospace(Path(args.root))
|
||||
result = record_check_results(
|
||||
infospace.root,
|
||||
run_collection_checks(infospace.artifacts),
|
||||
)
|
||||
_write_json(result.to_dict())
|
||||
else:
|
||||
parser.error(f"Unhandled command: {args.command}")
|
||||
except InfospaceError as exc:
|
||||
|
||||
@@ -40,6 +40,11 @@ class EntityEvaluation:
|
||||
evaluated_at: datetime
|
||||
notes: list[str] = field(default_factory=list)
|
||||
|
||||
@property
|
||||
def entity_slug(self) -> str:
|
||||
"""Legacy alias for readers moving from entity-oriented history files."""
|
||||
return self.artifact_id
|
||||
|
||||
@property
|
||||
def overall_score(self) -> float:
|
||||
if not self.scores:
|
||||
@@ -102,6 +107,16 @@ class EvaluationSnapshot:
|
||||
collection_metrics: list[MetricValue] = field(default_factory=list)
|
||||
metadata: dict[str, Any] = field(default_factory=dict)
|
||||
|
||||
@property
|
||||
def entity_count(self) -> int:
|
||||
"""Legacy alias retained for old infospace history readers."""
|
||||
return self.artifact_count
|
||||
|
||||
@property
|
||||
def entity_evaluations(self) -> list[EntityEvaluation]:
|
||||
"""Legacy alias retained for old infospace history readers."""
|
||||
return self.artifact_evaluations
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
return {
|
||||
"snapshot_id": self.snapshot_id,
|
||||
@@ -122,11 +137,14 @@ class EvaluationSnapshot:
|
||||
return cls(
|
||||
snapshot_id=str(data["snapshot_id"]),
|
||||
created_at=datetime.fromisoformat(str(data["created_at"])),
|
||||
schema_name=str(data["schema_name"]),
|
||||
artifact_count=int(data["artifact_count"]),
|
||||
schema_name=str(data.get("schema_name") or "default"),
|
||||
artifact_count=int(data.get("artifact_count", data.get("entity_count", 0))),
|
||||
artifact_evaluations=[
|
||||
EntityEvaluation.from_dict(item)
|
||||
for item in data.get("artifact_evaluations", [])
|
||||
for item in data.get(
|
||||
"artifact_evaluations",
|
||||
data.get("entity_evaluations", []),
|
||||
)
|
||||
],
|
||||
collection_metrics=[
|
||||
MetricValue.from_dict(item) for item in data.get("collection_metrics", [])
|
||||
@@ -134,6 +152,9 @@ class EvaluationSnapshot:
|
||||
metadata=dict(data.get("metadata") or {}),
|
||||
)
|
||||
|
||||
def diff(self, after: "EvaluationSnapshot") -> "SnapshotDiff":
|
||||
return diff_snapshots(self, after)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class ScoreChange:
|
||||
@@ -146,6 +167,20 @@ class ScoreChange:
|
||||
def delta(self) -> float:
|
||||
return self.after - self.before
|
||||
|
||||
@property
|
||||
def entity_slug(self) -> str:
|
||||
"""Legacy alias for old diff consumers."""
|
||||
return self.artifact_id
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
    """Serialize this score change, including the derived ``delta``, to a plain dict."""
    exported = ("artifact_id", "dimension", "before", "after", "delta")
    return {field_name: getattr(self, field_name) for field_name in exported}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class MetricChange:
|
||||
@@ -157,6 +192,14 @@ class MetricChange:
|
||||
def delta(self) -> float:
|
||||
return self.after - self.before
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
    """Serialize this metric change, including the derived ``delta``, to a plain dict."""
    exported = ("name", "before", "after", "delta")
    return {field_name: getattr(self, field_name) for field_name in exported}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class SnapshotDiff:
|
||||
@@ -167,6 +210,51 @@ class SnapshotDiff:
|
||||
score_changes: list[ScoreChange] = field(default_factory=list)
|
||||
metric_changes: list[MetricChange] = field(default_factory=list)
|
||||
|
||||
@property
|
||||
def added_entities(self) -> list[str]:
|
||||
"""Legacy alias for old history diff output."""
|
||||
return self.added_artifacts
|
||||
|
||||
@property
|
||||
def removed_entities(self) -> list[str]:
|
||||
"""Legacy alias for old history diff output."""
|
||||
return self.removed_artifacts
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
    """Serialize the diff to a plain dict, expanding nested changes via their ``to_dict``."""
    payload: dict[str, Any] = {
        "before_id": self.before_id,
        "after_id": self.after_id,
        "added_artifacts": self.added_artifacts,
        "removed_artifacts": self.removed_artifacts,
    }
    payload["score_changes"] = [item.to_dict() for item in self.score_changes]
    payload["metric_changes"] = [item.to_dict() for item in self.metric_changes]
    return payload
|
||||
|
||||
def summary(self) -> str:
    """Render the diff as newline-joined text.

    The first line names both snapshot IDs. When nothing was added, removed,
    or changed, the only other line is ``No changes.``; otherwise one line is
    emitted per added artifact, removed artifact, score change, and metric
    change, in that order.
    """
    header = f"Snapshot diff: {self.before_id} -> {self.after_id}"
    has_changes = bool(
        self.added_artifacts
        or self.removed_artifacts
        or self.score_changes
        or self.metric_changes
    )
    if not has_changes:
        return "\n".join([header, "No changes."])

    report = [header]
    report.extend(f"Added artifact: {added}" for added in self.added_artifacts)
    report.extend(f"Removed artifact: {removed}" for removed in self.removed_artifacts)
    report.extend(
        f"Score {item.artifact_id} {item.dimension}: "
        f"{item.before} -> {item.after} ({item.delta:+.4f})"
        for item in self.score_changes
    )
    report.extend(
        f"Metric {item.name}: "
        f"{item.before} -> {item.after} ({item.delta:+.4f})"
        for item in self.metric_changes
    )
    return "\n".join(report)
|
||||
|
||||
|
||||
def diff_snapshots(
|
||||
before: EvaluationSnapshot,
|
||||
@@ -174,22 +262,29 @@ def diff_snapshots(
|
||||
) -> SnapshotDiff:
|
||||
before_scores = _score_index(before)
|
||||
after_scores = _score_index(after)
|
||||
before_artifacts = {artifact_id for artifact_id, _ in before_scores}
|
||||
after_artifacts = {artifact_id for artifact_id, _ in after_scores}
|
||||
before_artifacts = {
|
||||
evaluation.artifact_id for evaluation in before.artifact_evaluations
|
||||
}
|
||||
after_artifacts = {evaluation.artifact_id for evaluation in after.artifact_evaluations}
|
||||
|
||||
score_changes = [
|
||||
ScoreChange(artifact_id, dimension, before_scores[key], after_scores[key])
|
||||
for key in sorted(before_scores.keys() & after_scores.keys())
|
||||
ScoreChange(
|
||||
artifact_id,
|
||||
dimension,
|
||||
before_scores.get(key, 0.0),
|
||||
after_scores.get(key, 0.0),
|
||||
)
|
||||
for key in sorted(before_scores.keys() | after_scores.keys())
|
||||
for artifact_id, dimension in [key]
|
||||
if before_scores[key] != after_scores[key]
|
||||
if before_scores.get(key) != after_scores.get(key)
|
||||
]
|
||||
|
||||
before_metrics = {metric.name: metric.value for metric in before.collection_metrics}
|
||||
after_metrics = {metric.name: metric.value for metric in after.collection_metrics}
|
||||
metric_changes = [
|
||||
MetricChange(name, before_metrics[name], after_metrics[name])
|
||||
for name in sorted(before_metrics.keys() & after_metrics.keys())
|
||||
if before_metrics[name] != after_metrics[name]
|
||||
MetricChange(name, before_metrics.get(name, 0.0), after_metrics.get(name, 0.0))
|
||||
for name in sorted(before_metrics.keys() | after_metrics.keys())
|
||||
if before_metrics.get(name) != after_metrics.get(name)
|
||||
]
|
||||
|
||||
return SnapshotDiff(
|
||||
|
||||
181
src/infospace_bench/evaluation_io.py
Normal file
181
src/infospace_bench/evaluation_io.py
Normal file
@@ -0,0 +1,181 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import yaml
|
||||
|
||||
from .errors import InfospaceError
|
||||
from .evaluation import (
|
||||
EntityEvaluation,
|
||||
EvaluationSnapshot,
|
||||
ScoreEntry,
|
||||
diff_snapshots,
|
||||
)
|
||||
|
||||
FRONTMATTER_MARKER = "---"
|
||||
|
||||
|
||||
def write_entity_evaluation(
    evaluation: EntityEvaluation,
    path: str | Path,
) -> None:
    """Write *evaluation* to *path* as Markdown with YAML frontmatter.

    The frontmatter carries the artifact ID, evaluator, ISO timestamp, the
    overall score rounded to four places, the serialized scores, and the
    notes when there are any. The body has one ``## <name> - <value> /
    <max_value>`` heading per score, followed by its rationale when present.
    Parent directories are created as needed.
    """
    destination = Path(path)

    # Key order is preserved in the output because sort_keys=False below.
    header: dict[str, Any] = {
        "artifact_id": evaluation.artifact_id,
        "evaluator": evaluation.evaluator,
        "evaluated_at": evaluation.evaluated_at.isoformat(),
        "overall_score": round(evaluation.overall_score, 4),
        "scores": [entry.to_dict() for entry in evaluation.scores],
    }
    if evaluation.notes:
        header["notes"] = evaluation.notes

    sections: list[str] = []
    for entry in evaluation.scores:
        sections += [f"## {entry.name} - {entry.value} / {entry.max_value}", ""]
        if entry.rationale:
            sections += [entry.rationale, ""]

    document = "\n".join(
        [
            FRONTMATTER_MARKER,
            yaml.safe_dump(header, sort_keys=False).rstrip(),
            FRONTMATTER_MARKER,
            "",
            f"# Evaluation: {evaluation.artifact_id}",
            "",
            *sections,
        ]
    )

    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_text(document, encoding="utf-8")
|
||||
|
||||
|
||||
def read_entity_evaluation(path: str | Path) -> EntityEvaluation:
    """Load an evaluation from a Markdown file with YAML frontmatter.

    Scores come from the frontmatter; a rationale found under the matching
    ``##`` heading in the body replaces the frontmatter rationale. The legacy
    ``entity_slug`` key is accepted when ``artifact_id`` is absent.

    Raises:
        InfospaceError: when neither key yields a non-empty artifact ID.
    """
    source = Path(path)
    frontmatter, body = _read_frontmatter_markdown(source)
    body_rationales = _parse_rationales(body)

    scores = []
    for raw_score in frontmatter.get("scores", []):
        parsed = ScoreEntry.from_dict(raw_score)
        scores.append(_score_with_body_rationale(parsed, body_rationales))

    if "artifact_id" in frontmatter:
        artifact_id = frontmatter["artifact_id"]
    else:
        artifact_id = frontmatter.get("entity_slug")
    if not artifact_id:
        raise InfospaceError(
            "invalid_evaluation_file",
            f"Missing artifact_id in evaluation file: {source}",
            {"path": str(source)},
        )

    evaluator = str(frontmatter["evaluator"])
    evaluated_at = datetime.fromisoformat(str(frontmatter["evaluated_at"]))
    return EntityEvaluation(
        artifact_id=str(artifact_id),
        evaluator=evaluator,
        scores=scores,
        evaluated_at=evaluated_at,
        notes=list(frontmatter.get("notes") or []),
    )
|
||||
|
||||
|
||||
def write_snapshot(snapshot: EvaluationSnapshot, path: str | Path) -> None:
    """Serialize *snapshot* as YAML to *path*, creating parent directories as needed."""
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    # sort_keys=False keeps the key order produced by snapshot.to_dict().
    serialized = yaml.safe_dump(snapshot.to_dict(), sort_keys=False)
    destination.write_text(serialized, encoding="utf-8")
|
||||
|
||||
|
||||
def read_snapshot(path: str | Path) -> EvaluationSnapshot:
    """Load a single snapshot from a YAML file.

    Raises:
        InfospaceError: when the file's top-level YAML value is not a mapping.
    """
    source = Path(path)
    loaded = yaml.safe_load(source.read_text(encoding="utf-8"))
    if isinstance(loaded, dict):
        return EvaluationSnapshot.from_dict(loaded)
    raise InfospaceError(
        "invalid_snapshot_file",
        f"Expected mapping in snapshot file: {source}",
        {"path": str(source)},
    )
|
||||
|
||||
|
||||
def append_to_history(snapshot: EvaluationSnapshot, history_path: str | Path) -> None:
    """Append *snapshot* to the YAML history list at *history_path*.

    Existing entries are re-read through ``read_history`` and re-serialized,
    then the whole list is written back with the new snapshot last. Parent
    directories are created as needed.
    """
    history_file = Path(history_path)

    entries: list[dict[str, Any]] = []
    if history_file.exists():
        entries.extend(item.to_dict() for item in read_history(history_file))
    entries.append(snapshot.to_dict())

    history_file.parent.mkdir(parents=True, exist_ok=True)
    serialized = yaml.safe_dump(entries, sort_keys=False)
    history_file.write_text(serialized, encoding="utf-8")
|
||||
|
||||
|
||||
def read_history(history_path: str | Path) -> list[EvaluationSnapshot]:
    """Load every snapshot from the YAML history file.

    Returns an empty list when the path is not a regular file or the file
    contains no YAML document.

    Raises:
        InfospaceError: when the top-level YAML value is not a list, or when
            any entry in that list is not a mapping.
    """
    source = Path(history_path)
    if not source.is_file():
        return []
    data = yaml.safe_load(source.read_text(encoding="utf-8"))
    if data is None:
        return []
    if not isinstance(data, list):
        raise InfospaceError(
            "invalid_history_file",
            f"Expected list in history file: {source}",
            {"path": str(source)},
        )

    snapshots: list[EvaluationSnapshot] = []
    for index, item in enumerate(data):
        # Reject malformed entries with the package error rather than letting
        # from_dict fail with an unrelated TypeError/AttributeError.
        if not isinstance(item, dict):
            raise InfospaceError(
                "invalid_history_file",
                f"Expected mapping for entry {index} in history file: {source}",
                {"path": str(source), "index": index},
            )
        snapshots.append(EvaluationSnapshot.from_dict(item))
    return snapshots
|
||||
|
||||
|
||||
def _read_frontmatter_markdown(path: Path) -> tuple[dict[str, Any], str]:
    """Split a Markdown file into its YAML frontmatter mapping and its body.

    The file must start with a ``---`` line. The frontmatter ends at the next
    line consisting of ``---``, whether that line is followed by a newline or
    is the last line of the file.

    Returns:
        ``(frontmatter, body)`` where *body* is everything after the closing
        marker line (empty when the marker ends the file).

    Raises:
        InfospaceError: when the opening marker is missing, the frontmatter is
            never closed, or the frontmatter is not a YAML mapping.
    """
    text = path.read_text(encoding="utf-8")
    if not text.startswith(f"{FRONTMATTER_MARKER}\n"):
        raise InfospaceError(
            "invalid_evaluation_file",
            f"Missing YAML frontmatter in evaluation file: {path}",
            {"path": str(path)},
        )

    closer = f"\n{FRONTMATTER_MARKER}"
    # Search from the newline that ends the opening marker so that an empty
    # frontmatter block ("---\n---\n") still finds its closing marker.
    end = text.find(f"{closer}\n", len(FRONTMATTER_MARKER))
    if end == -1 and text.endswith(closer):
        # Closing marker is the last line and has no trailing newline.
        end = len(text) - len(closer)
    if end == -1:
        raise InfospaceError(
            "invalid_evaluation_file",
            f"Unclosed YAML frontmatter in evaluation file: {path}",
            {"path": str(path)},
        )

    raw = text[len(FRONTMATTER_MARKER) + 1 : end]
    # Skip the closing marker plus the newline after it (if any).
    body = text[end + len(closer) + 1 :]
    data = yaml.safe_load(raw)
    if not isinstance(data, dict):
        raise InfospaceError(
            "invalid_evaluation_file",
            f"Expected mapping frontmatter in evaluation file: {path}",
            {"path": str(path)},
        )
    return data, body
|
||||
|
||||
|
||||
def _parse_rationales(body: str) -> dict[str, str]:
    """Collect the text under each ``## `` heading of an evaluation body.

    Headings are expected in the form ``## <name> - <value> / <max_value>``;
    the key is the heading text before the *last* `` - `` separator, so score
    names that themselves contain `` - `` are kept intact. A heading without
    the separator is used whole. Lines before the first heading are ignored.

    Returns:
        Mapping of score name to its stripped rationale text (possibly empty).
    """
    rationales: dict[str, str] = {}
    current_name: str | None = None
    current_lines: list[str] = []
    for line in body.splitlines():
        if line.startswith("## "):
            # A new heading closes the section collected so far.
            if current_name is not None:
                rationales[current_name] = "\n".join(current_lines).strip()
            heading = line[3:].strip()
            # rsplit: only the trailing " - <value> / <max>" part is dropped.
            current_name = heading.rsplit(" - ", maxsplit=1)[0].strip()
            current_lines = []
        elif current_name is not None:
            current_lines.append(line)
    if current_name is not None:
        rationales[current_name] = "\n".join(current_lines).strip()
    return rationales
|
||||
|
||||
|
||||
def _score_with_body_rationale(
    score: ScoreEntry,
    rationales: dict[str, str],
) -> ScoreEntry:
    """Return a copy of *score* whose rationale is taken from *rationales*.

    When *rationales* has an entry for the score's name (even an empty one),
    that entry wins; otherwise the score's own rationale is kept.
    """
    chosen = score.rationale
    if score.name in rationales:
        chosen = rationales[score.name]
    return ScoreEntry(
        name=score.name,
        value=score.value,
        max_value=score.max_value,
        rationale=chosen,
    )
|
||||
254
src/infospace_bench/history.py
Normal file
254
src/infospace_bench/history.py
Normal file
@@ -0,0 +1,254 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import yaml
|
||||
|
||||
from .checks import CollectionCheckReport
|
||||
from .evaluation import EntityEvaluation, EvaluationSnapshot, MetricValue
|
||||
from .evaluation_io import append_to_history, read_history, write_snapshot
|
||||
from .lifecycle import load_infospace
|
||||
from .viability import evaluate_viability
|
||||
|
||||
METRICS_PATH = Path("output/metrics/metrics.yaml")
|
||||
HISTORY_PATH = Path("output/metrics/history.yaml")
|
||||
VIABILITY_PATH = Path("output/metrics/viability.yaml")
|
||||
SNAPSHOT_DIR = Path("output/metrics/snapshots")
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class RecordedCheckResult:
    """Immutable bundle of a snapshot, a metrics mapping, and an optional viability report."""

    # Evaluation snapshot; must provide to_dict() for serialization.
    snapshot: EvaluationSnapshot
    # Metrics mapping, emitted as-is by to_dict().
    metrics: dict[str, Any]
    # Viability report mapping; defaults to None.
    viability: dict[str, Any] | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return a plain dict with the snapshot expanded via its own ``to_dict``."""
        return dict(
            snapshot=self.snapshot.to_dict(),
            metrics=self.metrics,
            viability=self.viability,
        )
|
||||
|
||||
|
||||
def snapshot_from_checks(
    check_report: CollectionCheckReport,
    artifact_count: int,
    *,
    schema_name: str = "default",
    metadata: dict[str, Any] | None = None,
    artifact_evaluations: list[EntityEvaluation] | None = None,
) -> EvaluationSnapshot:
    """Build a new snapshot from a collection check report.

    Numeric report metrics become collection metrics (sorted by name, tagged
    with their concern code), followed by evaluation-derived metrics tagged
    ``"evaluation"``. The snapshot ID is the first eight characters of a
    random UUID and ``created_at`` is the current UTC time.
    """
    evaluations = artifact_evaluations or []

    numeric = _numeric_metrics(check_report.metrics)
    collection_metrics = []
    for metric_name, metric_value in sorted(numeric.items()):
        collection_metrics.append(
            MetricValue(
                name=metric_name,
                value=metric_value,
                concern=_concern_for_metric(metric_name),
            )
        )
    for metric_name, metric_value in sorted(_evaluation_metrics(evaluations).items()):
        collection_metrics.append(
            MetricValue(name=metric_name, value=metric_value, concern="evaluation")
        )

    return EvaluationSnapshot(
        snapshot_id=str(uuid.uuid4())[:8],
        created_at=datetime.now(timezone.utc),
        schema_name=schema_name,
        artifact_count=artifact_count,
        artifact_evaluations=evaluations,
        collection_metrics=collection_metrics,
        metadata=metadata or {},
    )
|
||||
|
||||
|
||||
def write_metrics_file(metrics: dict[str, Any], path: str | Path) -> None:
    """Write *metrics* to *path* as YAML with sorted keys.

    Each value is passed through ``_normalize_metric_value`` first. Parent
    directories are created as needed.
    """
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    normalized: dict[Any, Any] = {}
    for key, value in sorted(metrics.items()):
        normalized[key] = _normalize_metric_value(value)

    serialized = yaml.safe_dump(normalized, sort_keys=True)
    destination.write_text(serialized, encoding="utf-8")
|
||||
|
||||
|
||||
def read_metrics_file(path: str | Path) -> dict[str, Any]:
    """Load the metrics mapping from a YAML file.

    Returns an empty dict when *path* is not a regular file or when the
    file's top-level YAML value is not a mapping.
    """
    source = Path(path)
    if not source.is_file():
        return {}
    loaded = yaml.safe_load(source.read_text(encoding="utf-8"))
    if isinstance(loaded, dict):
        return loaded
    return {}
|
||||
|
||||
|
||||
def record_check_results(
    root: str | Path,
    check_report: CollectionCheckReport,
    *,
    artifact_evaluations: list[EntityEvaluation] | None = None,
    schema_name: str = "default",
    metadata: dict[str, Any] | None = None,
) -> RecordedCheckResult:
    """Persist one check run under the infospace at *root* and return what was recorded.

    In order: builds a snapshot from the report, merges the report and
    evaluation metrics over the existing metrics file and rewrites it,
    appends the snapshot to the history file, writes a named snapshot copy,
    and writes the viability report computed from the merged metrics.
    """
    infospace = load_infospace(root)
    base = infospace.root

    # The report's own artifact_count wins; the loaded artifact list is the fallback.
    fallback_count = len(infospace.artifacts)
    artifact_count = int(check_report.details.get("artifact_count", fallback_count))

    snapshot_metadata = {"source": "collection-checks", **(metadata or {})}
    snapshot = snapshot_from_checks(
        check_report,
        artifact_count,
        schema_name=schema_name,
        metadata=snapshot_metadata,
        artifact_evaluations=artifact_evaluations,
    )

    metrics_file = base / METRICS_PATH
    # Later sources override earlier ones: existing < report < evaluation metrics.
    merged = dict(read_metrics_file(metrics_file))
    merged.update(check_report.metrics)
    merged.update(_evaluation_metrics(artifact_evaluations or []))
    write_metrics_file(merged, metrics_file)

    append_to_history(snapshot, base / HISTORY_PATH)
    snapshot_file = base / SNAPSHOT_DIR / f"{snapshot.snapshot_id}.yaml"
    write_snapshot(snapshot, snapshot_file)

    viability = build_viability_report(base, merged)
    write_viability_report(base, viability)
    return RecordedCheckResult(snapshot=snapshot, metrics=merged, viability=viability)
|
||||
|
||||
|
||||
def get_history(root: str | Path) -> list[EvaluationSnapshot]:
    """Read the snapshot history stored at ``HISTORY_PATH`` under *root*."""
    history_file = Path(root) / HISTORY_PATH
    return read_history(history_file)
|
||||
|
||||
|
||||
def get_latest_snapshot(root: str | Path) -> EvaluationSnapshot | None:
    """Return the last snapshot in the history under *root*, or None when there is none."""
    snapshots = get_history(root)
    if not snapshots:
        return None
    return snapshots[-1]
|
||||
|
||||
|
||||
def find_snapshot(
    history: list[EvaluationSnapshot],
    ref: str,
) -> EvaluationSnapshot | None:
    """Resolve *ref* against *history*.

    The first snapshot whose ``snapshot_id`` equals *ref* wins; when no ID
    matches, *ref* is handed to ``find_snapshot_by_date``.
    """
    exact = next(
        (candidate for candidate in history if candidate.snapshot_id == ref),
        None,
    )
    if exact is not None:
        return exact
    return find_snapshot_by_date(history, ref)
|
||||
|
||||
|
||||
def find_snapshot_by_date(
    history: list[EvaluationSnapshot],
    date_ref: str,
) -> EvaluationSnapshot | None:
    """Return the snapshot created closest in time to *date_ref*.

    *date_ref* is an ISO-format date or datetime. A bare date is read as
    midnight; a full datetime may use either ``T`` or another separator
    accepted by ``datetime.fromisoformat`` (e.g. a space). Naive values, on
    both the reference and the snapshots, are treated as UTC.

    Returns:
        The nearest snapshot (the earliest in *history* on a tie), or None
        when *history* is empty or *date_ref* cannot be parsed.
    """
    if not history:
        return None

    # Without a "T", first try the reference as a bare date; fall back to the
    # raw text so space-separated datetimes still resolve.
    if "T" in date_ref:
        candidates = [date_ref]
    else:
        candidates = [f"{date_ref}T00:00:00", date_ref]

    target = None
    for candidate in candidates:
        try:
            target = datetime.fromisoformat(candidate)
        except ValueError:
            continue
        break
    if target is None:
        return None
    if target.tzinfo is None:
        target = target.replace(tzinfo=timezone.utc)

    def distance(snapshot: EvaluationSnapshot) -> float:
        """Absolute gap in seconds between the snapshot and the target."""
        created_at = snapshot.created_at
        if created_at.tzinfo is None:
            created_at = created_at.replace(tzinfo=timezone.utc)
        return abs((created_at - target).total_seconds())

    return min(history, key=distance)
|
||||
|
||||
|
||||
def metric_trend(
    history: list[EvaluationSnapshot],
    metric_name: str,
) -> list[dict[str, Any]]:
    """List the value of *metric_name* for each snapshot that records it.

    Snapshots are visited in the order given. Each result item holds the
    snapshot's ISO creation timestamp under ``"date"`` and the metric value
    under ``"value"``; only the first matching metric per snapshot is used.
    """
    points: list[dict[str, Any]] = []
    for entry in history:
        match = next(
            (item for item in entry.collection_metrics if item.name == metric_name),
            None,
        )
        if match is None:
            continue
        points.append({"date": entry.created_at.isoformat(), "value": match.value})
    return points
|
||||
|
||||
|
||||
def build_viability_report(
    root: str | Path,
    metrics: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Evaluate the infospace's viability thresholds and return a plain-dict report.

    When *metrics* is None the metrics file under the infospace root is read
    instead. Only numeric metrics are passed to ``evaluate_viability``. The
    result has an overall ``"passed"`` flag and a ``"results"`` mapping with
    the metric, value, threshold, and pass flag of each check.
    """
    infospace = load_infospace(root)
    if metrics is None:
        current = read_metrics_file(infospace.root / METRICS_PATH)
    else:
        current = metrics

    report = evaluate_viability(_numeric_metrics(current), infospace.config.viability)

    passed = report.passed
    results: dict[str, Any] = {}
    for name, outcome in report.results.items():
        results[name] = {
            "metric": outcome.metric,
            "value": outcome.value,
            "threshold": outcome.threshold.to_dict(),
            "passed": outcome.passed,
        }
    return {"passed": passed, "results": results}
|
||||
|
||||
|
||||
def write_viability_report(root: str | Path, report: dict[str, Any]) -> None:
    """Write *report* as YAML to ``VIABILITY_PATH`` under *root*, creating directories as needed."""
    destination = Path(root) / VIABILITY_PATH
    destination.parent.mkdir(parents=True, exist_ok=True)
    serialized = yaml.safe_dump(report, sort_keys=False)
    destination.write_text(serialized, encoding="utf-8")
|
||||
|
||||
|
||||
def _evaluation_metrics(evaluations: list[EntityEvaluation]) -> dict[str, float | int]:
    """Summarize per-artifact evaluations as metrics.

    Returns an empty dict for no evaluations; otherwise the number of
    evaluations and the mean of their ``overall_score`` values.
    """
    if not evaluations:
        return {}
    count = len(evaluations)
    total = sum(item.overall_score for item in evaluations)
    return {
        "evaluated_artifact_count": count,
        "per_artifact_mean": total / count,
    }
|
||||
|
||||
|
||||
def _numeric_metrics(metrics: dict[str, Any]) -> dict[str, float]:
    """Keep only int/float metrics, with keys as strings and values as floats.

    Booleans are excluded even though ``bool`` is a subclass of ``int``.
    """
    numeric: dict[str, float] = {}
    for name, value in metrics.items():
        if isinstance(value, bool):
            continue
        if isinstance(value, (int, float)):
            numeric[str(name)] = float(value)
    return numeric
|
||||
|
||||
|
||||
def _normalize_metric_value(value: Any) -> Any:
    """Round float values to six decimal places; return every other value unchanged."""
    is_plain_float = isinstance(value, float) and not isinstance(value, bool)
    return round(value, 6) if is_plain_float else value
|
||||
|
||||
|
||||
def _concern_for_metric(name: str) -> str:
    """Return the concern code (``C1``-``C5``) for a known metric name, else an empty string."""
    concerns = dict(
        redundancy_ratio="C1",
        coverage_ratio="C2",
        coherence_components="C3",
        consistency_cycles="C4",
        granularity_entropy="C5",
    )
    try:
        return concerns[name]
    except KeyError:
        return ""
|
||||
196
tests/test_evaluation_history.py
Normal file
196
tests/test_evaluation_history.py
Normal file
@@ -0,0 +1,196 @@
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
||||
from infospace_bench import add_artifact, create_infospace, load_infospace
|
||||
from infospace_bench.checks import run_collection_checks
|
||||
from infospace_bench.evaluation import (
|
||||
EntityEvaluation,
|
||||
EvaluationSnapshot,
|
||||
MetricValue,
|
||||
ScoreEntry,
|
||||
)
|
||||
from infospace_bench.evaluation_io import (
|
||||
append_to_history,
|
||||
read_entity_evaluation,
|
||||
read_history,
|
||||
read_snapshot,
|
||||
write_entity_evaluation,
|
||||
write_snapshot,
|
||||
)
|
||||
from infospace_bench.history import (
|
||||
find_snapshot,
|
||||
get_history,
|
||||
metric_trend,
|
||||
read_metrics_file,
|
||||
record_check_results,
|
||||
write_metrics_file,
|
||||
)
|
||||
|
||||
|
||||
NOW = datetime(2026, 5, 14, 10, 30, tzinfo=timezone.utc)
|
||||
|
||||
|
||||
def cli_env() -> dict[str, str]:
    """Return a copy of the environment for running the CLI in a subprocess.

    ``PYTHONPATH`` starts with the local ``src`` directory and the
    markitect-tool checkout, joined with the platform path separator. Any
    ``PYTHONPATH`` inherited from the caller is kept after them rather than
    discarded.
    """
    env = os.environ.copy()
    # NOTE(review): the markitect-tool path is machine-specific — confirm how
    # other machines/CI are expected to provide it.
    entries = ["src", "/home/worsch/markitect-tool/src"]
    inherited = env.get("PYTHONPATH")
    if inherited:
        entries.append(inherited)
    env["PYTHONPATH"] = os.pathsep.join(entries)
    return env
|
||||
|
||||
|
||||
def evaluation(
    artifact_id: str = "entity/division.md",
    value: float = 4.0,
) -> EntityEvaluation:
    """Build a two-score evaluation fixture stamped at ``NOW``.

    *value* is the ``definition_precision`` score; ``source_grounding`` is
    always 5.0.
    """
    precision = ScoreEntry("definition_precision", value, rationale="Grounded and clear.")
    grounding = ScoreEntry("source_grounding", 5.0)
    return EntityEvaluation(
        artifact_id=artifact_id,
        evaluator="test",
        evaluated_at=NOW,
        scores=[precision, grounding],
        notes=["reviewed"],
    )
|
||||
|
||||
|
||||
def snapshot(snapshot_id: str, value: float, day: int) -> EvaluationSnapshot:
    """Build a one-artifact snapshot fixture created at 09:00 UTC on *day* of May 2026.

    *value* feeds both the evaluation score and the ``coverage_ratio`` metric
    (as ``value / 5.0``).
    """
    created = datetime(2026, 5, day, 9, tzinfo=timezone.utc)
    evaluations = [evaluation(value=value)]
    coverage = MetricValue("coverage_ratio", value / 5.0, concern="C2")
    return EvaluationSnapshot(
        snapshot_id=snapshot_id,
        created_at=created,
        schema_name="baseline",
        artifact_count=1,
        artifact_evaluations=evaluations,
        collection_metrics=[coverage],
        metadata={"source": "test"},
    )
|
||||
|
||||
|
||||
def test_entity_evaluation_file_preserves_frontmatter_and_markdown_body(
    tmp_path: Path,
) -> None:
    """An evaluation written to Markdown reads back equal and keeps its body rationale."""
    target = tmp_path / "output" / "evaluations" / "division.md"

    write_entity_evaluation(evaluation(), target)
    restored = read_entity_evaluation(target)
    written = target.read_text(encoding="utf-8")

    assert written.startswith("---\n")
    expected_fragments = (
        "artifact_id: entity/division.md",
        "# Evaluation: entity/division.md",
        "## definition_precision",
    )
    for fragment in expected_fragments:
        assert fragment in written
    assert restored == evaluation()
    assert restored.scores[0].rationale == "Grounded and clear."
|
||||
|
||||
|
||||
def test_snapshot_and_history_round_trip_and_diff_from_named_refs(tmp_path: Path) -> None:
    """Snapshots survive file round-trips and can be diffed by ID and by date reference."""
    earlier = snapshot("snap-a", 3.0, 1)
    later = snapshot("snap-b", 4.0, 2)
    metrics_dir = tmp_path / "output" / "metrics"
    snapshot_path = metrics_dir / "snap-a.yaml"
    history_path = metrics_dir / "history.yaml"

    write_snapshot(earlier, snapshot_path)
    for item in (earlier, later):
        append_to_history(item, history_path)

    history = read_history(history_path)
    before = find_snapshot(history, "snap-a")
    after = find_snapshot(history, "2026-05-02")
    diff = before.diff(after)

    assert read_snapshot(snapshot_path) == earlier
    assert [item.snapshot_id for item in history] == ["snap-a", "snap-b"]
    assert get_history(tmp_path) == history
    expected_trend = [
        {"date": "2026-05-01T09:00:00+00:00", "value": 0.6},
        {"date": "2026-05-02T09:00:00+00:00", "value": 0.8},
    ]
    assert metric_trend(history, "coverage_ratio") == expected_trend
    assert diff.score_changes[0].artifact_id == "entity/division.md"
    assert diff.metric_changes[0].name == "coverage_ratio"
|
||||
|
||||
|
||||
def test_metrics_file_preserves_structured_values_and_recording_merges(
    tmp_path: Path,
) -> None:
    """Recording check results must merge into an existing metrics file.

    A metrics file is seeded by hand, check results are then recorded on top
    of it, and the reloaded file is expected to hold both the seeded values
    and the newly recorded ones, alongside a passing viability report.
    """
    infospace = create_infospace(tmp_path, "pilot", name="Pilot")
    root = infospace.root

    chapter = tmp_path / "chapter.md"
    chapter.write_text("# Chapter\n\nSource text.\n", encoding="utf-8")
    add_artifact(root, chapter, kind="source", title="Chapter")

    # Add viability thresholds to the infospace config before recording.
    config_path = root / "infospace.yaml"
    config = yaml.safe_load(config_path.read_text(encoding="utf-8"))
    config["viability"] = {
        "coverage_ratio": {"min": 0.5},
        "redundancy_ratio": {"max": 0.1},
    }
    config_path.write_text(yaml.safe_dump(config, sort_keys=False), encoding="utf-8")

    # Seed the metrics file with a nested mapping, an int and a long float.
    metrics_path = root / "output" / "metrics" / "metrics.yaml"
    seeded_metrics = {
        "type_distribution": {"source": 1},
        "manual_metric": 7,
        "rounded": 1.1234567,
    }
    write_metrics_file(seeded_metrics, metrics_path)

    check_results = run_collection_checks(load_infospace(root).artifacts)
    report = record_check_results(
        root,
        check_results,
        artifact_evaluations=[evaluation()],
    )
    metrics = read_metrics_file(metrics_path)

    # Seeded entries survive (the float comes back at six decimal places) and
    # the recorded metrics sit next to them.
    expected_metrics = {
        "type_distribution": {"source": 1},
        "manual_metric": 7,
        "rounded": 1.123457,
        "per_artifact_mean": 4.5,
        "coverage_ratio": 1.0,
    }
    for metric_name, expected_value in expected_metrics.items():
        assert metrics[metric_name] == expected_value

    assert report.viability is not None
    assert report.viability["passed"] is True
|
||||
|
||||
|
||||
def test_cli_history_history_diff_and_metrics_json(tmp_path: Path) -> None:
    """Run the history, history-diff and metrics CLI commands and check their JSON."""
    infospace = create_infospace(tmp_path, "pilot", name="Pilot")
    chapter = tmp_path / "chapter.md"
    chapter.write_text("# Chapter\n\nSource text.\n", encoding="utf-8")
    add_artifact(infospace.root, chapter, kind="source", title="Chapter")

    history_path = infospace.root / "output" / "metrics" / "history.yaml"
    for snapshot_id, value, day in (("snap-a", 3.0, 1), ("snap-b", 4.0, 2)):
        append_to_history(snapshot(snapshot_id, value, day), history_path)

    def run_cli(command: str, *extra: str) -> "subprocess.CompletedProcess[str]":
        # check=False so a failing command surfaces through the returncode
        # assertions below, with stderr attached, instead of raising here.
        return subprocess.run(
            [sys.executable, "-m", "infospace_bench", command, str(infospace.root), *extra],
            check=False,
            env=cli_env(),
            text=True,
            capture_output=True,
        )

    history = run_cli("history")
    diff = run_cli("history-diff", "snap-a", "snap-b")
    metrics = run_cli("metrics")

    for completed in (history, diff, metrics):
        assert completed.returncode == 0, completed.stderr

    history_payload = json.loads(history.stdout)
    assert history_payload["history"][1]["snapshot_id"] == "snap-b"
    diff_payload = json.loads(diff.stdout)
    assert diff_payload["diff"]["after_id"] == "snap-b"
    metrics_payload = json.loads(metrics.stdout)
    assert metrics_payload["snapshot"]["artifact_count"] == 1
|
||||
@@ -4,7 +4,7 @@ type: workplan
|
||||
title: "Evaluation History And Metrics Parity"
|
||||
domain: markitect
|
||||
repo: infospace-bench
|
||||
status: planned
|
||||
status: completed
|
||||
owner: markitect
|
||||
topic_slug: markitect
|
||||
created: "2026-05-14"
|
||||
@@ -26,7 +26,7 @@ infospace evaluation history and metrics behavior.
|
||||
|
||||
```task
|
||||
id: IB-WP-0008-T01
|
||||
status: todo
|
||||
status: done
|
||||
priority: high
|
||||
state_hub_task_id: "95b48ad3-c4d1-442c-9bc0-7591d948d23e"
|
||||
```
|
||||
@@ -39,7 +39,7 @@ state_hub_task_id: "95b48ad3-c4d1-442c-9bc0-7591d948d23e"
|
||||
|
||||
```task
|
||||
id: IB-WP-0008-T02
|
||||
status: todo
|
||||
status: done
|
||||
priority: high
|
||||
state_hub_task_id: "b4800ba8-5b86-44bb-bf47-e893bae36b22"
|
||||
```
|
||||
@@ -52,7 +52,7 @@ state_hub_task_id: "b4800ba8-5b86-44bb-bf47-e893bae36b22"
|
||||
|
||||
```task
|
||||
id: IB-WP-0008-T03
|
||||
status: todo
|
||||
status: done
|
||||
priority: high
|
||||
state_hub_task_id: "7abcbd63-0147-4ae8-85f0-4af51882f476"
|
||||
```
|
||||
@@ -65,7 +65,7 @@ state_hub_task_id: "7abcbd63-0147-4ae8-85f0-4af51882f476"
|
||||
|
||||
```task
|
||||
id: IB-WP-0008-T04
|
||||
status: todo
|
||||
status: done
|
||||
priority: medium
|
||||
state_hub_task_id: "675d1d45-39d9-4ddd-9ab7-5d7de8a0f601"
|
||||
```
|
||||
|
||||
Reference in New Issue
Block a user