Implement MetricsStore for project-scoped agent metrics.
Add ADR-004 storage layer with append-only executions, summary regeneration, idempotency keys, and retention pruning. Wire memory init to scaffold .kaizen/metrics/ by default and add unit tests.
This commit is contained in:
@@ -16,6 +16,7 @@ from .core import Agent, AgentConfig
|
|||||||
from .optimization import OptimizationLoop, PerformanceMetrics
|
from .optimization import OptimizationLoop, PerformanceMetrics
|
||||||
from .registry import AgentRegistry, AgentDefinition, AgentCategory
|
from .registry import AgentRegistry, AgentDefinition, AgentCategory
|
||||||
from .installer import AgentInstaller, ProjectInitializer, InstallationConfig
|
from .installer import AgentInstaller, ProjectInitializer, InstallationConfig
|
||||||
|
from .metrics import MetricsStore
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"Agent",
|
"Agent",
|
||||||
@@ -28,4 +29,5 @@ __all__ = [
|
|||||||
"AgentInstaller",
|
"AgentInstaller",
|
||||||
"ProjectInitializer",
|
"ProjectInitializer",
|
||||||
"InstallationConfig",
|
"InstallationConfig",
|
||||||
|
"MetricsStore",
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -10,6 +10,7 @@ from typing import List, Optional
|
|||||||
|
|
||||||
from .registry import AgentRegistry, AgentCategory
|
from .registry import AgentRegistry, AgentCategory
|
||||||
from .installer import AgentInstaller, ProjectInitializer, InstallationConfig
|
from .installer import AgentInstaller, ProjectInitializer, InstallationConfig
|
||||||
|
from .metrics import MetricsStore
|
||||||
|
|
||||||
|
|
||||||
def safe_cli_wrapper():
|
def safe_cli_wrapper():
|
||||||
@@ -781,7 +782,12 @@ def memory_show(agent_name: str, target: str):
|
|||||||
@memory.command("init")
|
@memory.command("init")
|
||||||
@click.argument("agent_name")
|
@click.argument("agent_name")
|
||||||
@click.option("--target", "-t", default=".", help="Project root (default: current)")
|
@click.option("--target", "-t", default=".", help="Project root (default: current)")
|
||||||
def memory_init(agent_name: str, target: str):
|
@click.option(
|
||||||
|
"--no-metrics",
|
||||||
|
is_flag=True,
|
||||||
|
help="Skip scaffolding .kaizen/metrics/<agent>/ (default: create metrics dir)",
|
||||||
|
)
|
||||||
|
def memory_init(agent_name: str, target: str, no_metrics: bool):
|
||||||
"""Scaffold an empty memory file for an agent."""
|
"""Scaffold an empty memory file for an agent."""
|
||||||
memory_path = _memory_path(target, agent_name)
|
memory_path = _memory_path(target, agent_name)
|
||||||
|
|
||||||
@@ -820,6 +826,10 @@ session_count: 0
|
|||||||
memory_path.write_text(content)
|
memory_path.write_text(content)
|
||||||
click.echo(f"Initialized memory for '{agent_name}': {memory_path}")
|
click.echo(f"Initialized memory for '{agent_name}': {memory_path}")
|
||||||
|
|
||||||
|
if not no_metrics:
|
||||||
|
metrics_dir = MetricsStore(Path(target), agent_name).scaffold()
|
||||||
|
click.echo(f"Initialized metrics for '{agent_name}': {metrics_dir}")
|
||||||
|
|
||||||
# For agents with protocols, note the protocol location
|
# For agents with protocols, note the protocol location
|
||||||
registry = _get_registry()
|
registry = _get_registry()
|
||||||
protocols_dir = registry.agents_dir / "protocols" / agent_name
|
protocols_dir = registry.agents_dir / "protocols" / agent_name
|
||||||
|
|||||||
208
src/kaizen_agentic/metrics.py
Normal file
208
src/kaizen_agentic/metrics.py
Normal file
@@ -0,0 +1,208 @@
|
|||||||
|
"""Project-scoped agent metrics storage (ADR-004)."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from datetime import datetime, timedelta, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Dict, List, Optional
|
||||||
|
|
||||||
|
|
||||||
|
DEFAULT_RETENTION_DAYS = 180
|
||||||
|
|
||||||
|
|
||||||
|
def _utc_now_iso() -> str:
|
||||||
|
return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_timestamp(value: str) -> datetime:
|
||||||
|
normalized = value.replace("Z", "+00:00")
|
||||||
|
return datetime.fromisoformat(normalized)
|
||||||
|
|
||||||
|
|
||||||
|
def _trend_direction(recent: List[float], prior: List[float]) -> str:
|
||||||
|
if not recent:
|
||||||
|
return "unknown"
|
||||||
|
if not prior:
|
||||||
|
return "stable"
|
||||||
|
recent_avg = sum(recent) / len(recent)
|
||||||
|
prior_avg = sum(prior) / len(prior)
|
||||||
|
delta = recent_avg - prior_avg
|
||||||
|
if abs(delta) < 0.05:
|
||||||
|
return "stable"
|
||||||
|
return "up" if delta > 0 else "down"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class MetricsStore:
|
||||||
|
"""Append-only per-agent execution metrics under .kaizen/metrics/."""
|
||||||
|
|
||||||
|
project_root: Path
|
||||||
|
agent_name: str
|
||||||
|
retention_days: int = DEFAULT_RETENTION_DAYS
|
||||||
|
|
||||||
|
def __post_init__(self) -> None:
|
||||||
|
self.project_root = Path(self.project_root).resolve()
|
||||||
|
self.agent_dir = self.project_root / ".kaizen" / "metrics" / self.agent_name
|
||||||
|
self.executions_path = self.agent_dir / "executions.jsonl"
|
||||||
|
self.summary_path = self.agent_dir / "summary.json"
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def list_agents(cls, project_root: Path) -> List[str]:
|
||||||
|
metrics_root = Path(project_root).resolve() / ".kaizen" / "metrics"
|
||||||
|
if not metrics_root.exists():
|
||||||
|
return []
|
||||||
|
agents = []
|
||||||
|
for child in sorted(metrics_root.iterdir()):
|
||||||
|
if child.is_dir() and (child / "executions.jsonl").exists():
|
||||||
|
agents.append(child.name)
|
||||||
|
return agents
|
||||||
|
|
||||||
|
def scaffold(self) -> Path:
|
||||||
|
"""Create metrics directory for this agent."""
|
||||||
|
self.agent_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
if not self.executions_path.exists():
|
||||||
|
self.executions_path.write_text("", encoding="utf-8")
|
||||||
|
return self.agent_dir
|
||||||
|
|
||||||
|
def append(
|
||||||
|
self,
|
||||||
|
record: Dict[str, Any],
|
||||||
|
*,
|
||||||
|
idempotency_key: Optional[str] = None,
|
||||||
|
) -> bool:
|
||||||
|
"""Append an execution record. Returns False if idempotency_key duplicates."""
|
||||||
|
self.scaffold()
|
||||||
|
|
||||||
|
payload = dict(record)
|
||||||
|
payload.setdefault("agent", self.agent_name)
|
||||||
|
payload.setdefault("timestamp", _utc_now_iso())
|
||||||
|
|
||||||
|
if idempotency_key is not None:
|
||||||
|
if self._has_idempotency_key(idempotency_key):
|
||||||
|
return False
|
||||||
|
payload["idempotency_key"] = idempotency_key
|
||||||
|
|
||||||
|
if "success" not in payload:
|
||||||
|
raise ValueError("execution record requires 'success' field")
|
||||||
|
|
||||||
|
with self.executions_path.open("a", encoding="utf-8") as handle:
|
||||||
|
handle.write(json.dumps(payload, sort_keys=True))
|
||||||
|
handle.write("\n")
|
||||||
|
|
||||||
|
self.prune()
|
||||||
|
self.write_summary()
|
||||||
|
return True
|
||||||
|
|
||||||
|
def read_executions(self) -> List[Dict[str, Any]]:
|
||||||
|
if not self.executions_path.exists():
|
||||||
|
return []
|
||||||
|
records: List[Dict[str, Any]] = []
|
||||||
|
with self.executions_path.open(encoding="utf-8") as handle:
|
||||||
|
for line in handle:
|
||||||
|
line = line.strip()
|
||||||
|
if line:
|
||||||
|
records.append(json.loads(line))
|
||||||
|
return records
|
||||||
|
|
||||||
|
def summarise(self) -> Dict[str, Any]:
|
||||||
|
records = self.read_executions()
|
||||||
|
if not records:
|
||||||
|
return {
|
||||||
|
"agent": self.agent_name,
|
||||||
|
"execution_count": 0,
|
||||||
|
"success_rate": 0.0,
|
||||||
|
"avg_quality_score": 0.0,
|
||||||
|
"avg_execution_time_s": 0.0,
|
||||||
|
"last_execution": None,
|
||||||
|
"trend": {
|
||||||
|
"success_rate": "unknown",
|
||||||
|
"quality_score": "unknown",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
successes = [bool(r["success"]) for r in records]
|
||||||
|
success_rate = sum(successes) / len(successes)
|
||||||
|
|
||||||
|
quality_scores = [
|
||||||
|
float(r["quality_score"])
|
||||||
|
for r in records
|
||||||
|
if r.get("quality_score") is not None
|
||||||
|
]
|
||||||
|
execution_times = [
|
||||||
|
float(r["execution_time_s"])
|
||||||
|
for r in records
|
||||||
|
if r.get("execution_time_s") is not None
|
||||||
|
]
|
||||||
|
|
||||||
|
window = 5
|
||||||
|
recent_success = [1.0 if s else 0.0 for s in successes[-window:]]
|
||||||
|
prior_success = [1.0 if s else 0.0 for s in successes[:-window][-window:]]
|
||||||
|
recent_quality = quality_scores[-window:]
|
||||||
|
prior_quality = quality_scores[:-window][-window:] if len(quality_scores) > window else []
|
||||||
|
|
||||||
|
return {
|
||||||
|
"agent": self.agent_name,
|
||||||
|
"execution_count": len(records),
|
||||||
|
"success_rate": round(success_rate, 3),
|
||||||
|
"avg_quality_score": round(
|
||||||
|
sum(quality_scores) / len(quality_scores) if quality_scores else 0.0,
|
||||||
|
3,
|
||||||
|
),
|
||||||
|
"avg_execution_time_s": round(
|
||||||
|
sum(execution_times) / len(execution_times) if execution_times else 0.0,
|
||||||
|
3,
|
||||||
|
),
|
||||||
|
"last_execution": records[-1]["timestamp"],
|
||||||
|
"trend": {
|
||||||
|
"success_rate": _trend_direction(recent_success, prior_success),
|
||||||
|
"quality_score": _trend_direction(recent_quality, prior_quality),
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
def write_summary(self) -> Dict[str, Any]:
|
||||||
|
summary = self.summarise()
|
||||||
|
self.agent_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
self.summary_path.write_text(
|
||||||
|
json.dumps(summary, indent=2, sort_keys=True) + "\n",
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
return summary
|
||||||
|
|
||||||
|
def read_summary(self) -> Optional[Dict[str, Any]]:
|
||||||
|
if not self.summary_path.exists():
|
||||||
|
return None
|
||||||
|
return json.loads(self.summary_path.read_text(encoding="utf-8"))
|
||||||
|
|
||||||
|
def prune(self) -> int:
|
||||||
|
"""Drop execution records older than retention_days. Returns removed count."""
|
||||||
|
if not self.executions_path.exists():
|
||||||
|
return 0
|
||||||
|
|
||||||
|
cutoff = datetime.now(timezone.utc) - timedelta(days=self.retention_days)
|
||||||
|
kept: List[Dict[str, Any]] = []
|
||||||
|
removed = 0
|
||||||
|
|
||||||
|
for record in self.read_executions():
|
||||||
|
try:
|
||||||
|
ts = _parse_timestamp(record["timestamp"])
|
||||||
|
except (KeyError, ValueError):
|
||||||
|
kept.append(record)
|
||||||
|
continue
|
||||||
|
if ts >= cutoff:
|
||||||
|
kept.append(record)
|
||||||
|
else:
|
||||||
|
removed += 1
|
||||||
|
|
||||||
|
if removed:
|
||||||
|
with self.executions_path.open("w", encoding="utf-8") as handle:
|
||||||
|
for record in kept:
|
||||||
|
handle.write(json.dumps(record, sort_keys=True))
|
||||||
|
handle.write("\n")
|
||||||
|
self.write_summary()
|
||||||
|
|
||||||
|
return removed
|
||||||
|
|
||||||
|
def _has_idempotency_key(self, key: str) -> bool:
|
||||||
|
return any(r.get("idempotency_key") == key for r in self.read_executions())
|
||||||
107
tests/test_metrics.py
Normal file
107
tests/test_metrics.py
Normal file
@@ -0,0 +1,107 @@
|
|||||||
|
"""Tests for project-scoped metrics storage (ADR-004)."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
from datetime import datetime, timedelta, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from kaizen_agentic.metrics import MetricsStore, DEFAULT_RETENTION_DAYS
|
||||||
|
|
||||||
|
|
||||||
|
def _old_timestamp(days: int) -> str:
|
||||||
|
dt = datetime.now(timezone.utc) - timedelta(days=days)
|
||||||
|
return dt.strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def project_dir(tmp_path: Path) -> Path:
|
||||||
|
root = tmp_path / "demo-project"
|
||||||
|
root.mkdir()
|
||||||
|
return root
|
||||||
|
|
||||||
|
|
||||||
|
class TestMetricsStore:
|
||||||
|
def test_scaffold_creates_directory_and_empty_executions(self, project_dir: Path):
|
||||||
|
store = MetricsStore(project_dir, "tdd-workflow")
|
||||||
|
path = store.scaffold()
|
||||||
|
|
||||||
|
assert path == project_dir / ".kaizen" / "metrics" / "tdd-workflow"
|
||||||
|
assert store.executions_path.exists()
|
||||||
|
assert store.executions_path.read_text() == ""
|
||||||
|
|
||||||
|
def test_append_and_read_executions(self, project_dir: Path):
|
||||||
|
store = MetricsStore(project_dir, "tdd-workflow")
|
||||||
|
|
||||||
|
assert store.append({"success": True, "quality_score": 0.9}) is True
|
||||||
|
assert store.append({"success": False, "execution_time_s": 12.5}) is True
|
||||||
|
|
||||||
|
records = store.read_executions()
|
||||||
|
assert len(records) == 2
|
||||||
|
assert records[0]["agent"] == "tdd-workflow"
|
||||||
|
assert records[0]["success"] is True
|
||||||
|
assert "timestamp" in records[0]
|
||||||
|
|
||||||
|
def test_idempotency_key_rejects_duplicate(self, project_dir: Path):
|
||||||
|
store = MetricsStore(project_dir, "coach")
|
||||||
|
|
||||||
|
assert store.append({"success": True}, idempotency_key="sess-1") is True
|
||||||
|
assert store.append({"success": True}, idempotency_key="sess-1") is False
|
||||||
|
assert len(store.read_executions()) == 1
|
||||||
|
|
||||||
|
def test_write_summary_regenerates_summary_json(self, project_dir: Path):
|
||||||
|
store = MetricsStore(project_dir, "tdd-workflow")
|
||||||
|
store.append({"success": True, "quality_score": 0.8, "execution_time_s": 10})
|
||||||
|
store.append({"success": True, "quality_score": 1.0, "execution_time_s": 20})
|
||||||
|
|
||||||
|
summary = store.write_summary()
|
||||||
|
|
||||||
|
assert summary["execution_count"] == 2
|
||||||
|
assert summary["success_rate"] == 1.0
|
||||||
|
assert summary["avg_quality_score"] == 0.9
|
||||||
|
assert summary["avg_execution_time_s"] == 15.0
|
||||||
|
assert store.summary_path.exists()
|
||||||
|
on_disk = json.loads(store.summary_path.read_text())
|
||||||
|
assert on_disk["execution_count"] == 2
|
||||||
|
|
||||||
|
def test_prune_removes_expired_records(self, project_dir: Path):
|
||||||
|
store = MetricsStore(project_dir, "tdd-workflow", retention_days=30)
|
||||||
|
store.scaffold()
|
||||||
|
|
||||||
|
old = {
|
||||||
|
"timestamp": _old_timestamp(45),
|
||||||
|
"agent": "tdd-workflow",
|
||||||
|
"success": False,
|
||||||
|
}
|
||||||
|
recent = {
|
||||||
|
"timestamp": _old_timestamp(1),
|
||||||
|
"agent": "tdd-workflow",
|
||||||
|
"success": True,
|
||||||
|
"quality_score": 0.7,
|
||||||
|
}
|
||||||
|
with store.executions_path.open("w", encoding="utf-8") as handle:
|
||||||
|
handle.write(json.dumps(old) + "\n")
|
||||||
|
handle.write(json.dumps(recent) + "\n")
|
||||||
|
|
||||||
|
removed = store.prune()
|
||||||
|
|
||||||
|
assert removed == 1
|
||||||
|
records = store.read_executions()
|
||||||
|
assert len(records) == 1
|
||||||
|
assert records[0]["success"] is True
|
||||||
|
summary = store.read_summary()
|
||||||
|
assert summary is not None
|
||||||
|
assert summary["execution_count"] == 1
|
||||||
|
|
||||||
|
def test_list_agents_with_metrics(self, project_dir: Path):
|
||||||
|
MetricsStore(project_dir, "tdd-workflow").scaffold()
|
||||||
|
MetricsStore(project_dir, "coach").append({"success": True})
|
||||||
|
|
||||||
|
agents = MetricsStore.list_agents(project_dir)
|
||||||
|
|
||||||
|
assert agents == ["coach", "tdd-workflow"]
|
||||||
|
|
||||||
|
def test_default_retention_matches_adr(self):
|
||||||
|
assert DEFAULT_RETENTION_DAYS == 180
|
||||||
@@ -85,9 +85,9 @@ Optimizer-specific aggregates (per `wiki/AgentKaizenOptimizer.md`):
|
|||||||
### Tasks
|
### Tasks
|
||||||
|
|
||||||
- [x] T01 — Write ADR-004: project metrics convention (location, schema, lifecycle, retention, Helix Forge correlation)
|
- [x] T01 — Write ADR-004: project metrics convention (location, schema, lifecycle, retention, Helix Forge correlation)
|
||||||
- [ ] T02 — Implement `MetricsStore` in `src/kaizen_agentic/metrics.py` (append, read, summarise, prune by retention)
|
- [x] T02 — Implement `MetricsStore` in `src/kaizen_agentic/metrics.py` (append, read, summarise, prune by retention)
|
||||||
- [ ] T03 — Add `memory init` hook to scaffold `.kaizen/metrics/<agent>/` alongside memory (optional flag `--no-metrics`)
|
- [x] T03 — Add `memory init` hook to scaffold `.kaizen/metrics/<agent>/` alongside memory (optional flag `--no-metrics`)
|
||||||
- [ ] T04 — Unit tests for `MetricsStore` (append idempotency key, summary regeneration, retention prune)
|
- [x] T04 — Unit tests for `MetricsStore` (append idempotency key, summary regeneration, retention prune)
|
||||||
|
|
||||||
### Definition of done
|
### Definition of done
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user