Add metrics CLI for project-scoped agent performance records.
Implement record, show, list, and export commands; document session-close protocol template; extend cheat sheet and agency-framework docs; add CLI tests.
This commit is contained in:
@@ -48,6 +48,27 @@ kaizen-agentic status # Show current project status
|
|||||||
kaizen-agentic validate # Validate agent installation
|
kaizen-agentic validate # Validate agent installation
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Project Metrics (ADR-004)
|
||||||
|
```bash
|
||||||
|
# Record outcome at session close
|
||||||
|
kaizen-agentic metrics record tdd-workflow --success --time 120 --quality 0.9
|
||||||
|
kaizen-agentic metrics record tdd-workflow --failure --time 45
|
||||||
|
|
||||||
|
# Full JSON record from stdin
|
||||||
|
echo '{"success": true, "quality_score": 1.0}' | kaizen-agentic metrics record tdd-workflow --json
|
||||||
|
|
||||||
|
# Inspect metrics
|
||||||
|
kaizen-agentic metrics show tdd-workflow
|
||||||
|
kaizen-agentic metrics list
|
||||||
|
kaizen-agentic metrics export tdd-workflow
|
||||||
|
|
||||||
|
# Scaffold memory + metrics together
|
||||||
|
kaizen-agentic memory init tdd-workflow
|
||||||
|
kaizen-agentic memory init tdd-workflow --no-metrics # memory only
|
||||||
|
```
|
||||||
|
|
||||||
|
Session-close template: `docs/templates/session-close-protocol.md`
|
||||||
|
|
||||||
### Information
|
### Information
|
||||||
```bash
|
```bash
|
||||||
# List templates
|
# List templates
|
||||||
|
|||||||
@@ -265,6 +265,10 @@ kaizen-agentic metrics optimize [agent] # Run optimizer on project metrics
|
|||||||
`memory brief` includes a `## Performance Summary` when metrics exist (WP-0003
|
`memory brief` includes a `## Performance Summary` when metrics exist (WP-0003
|
||||||
Part 4).
|
Part 4).
|
||||||
|
|
||||||
|
`memory init` scaffolds `.kaizen/metrics/<agent>/` by default (`--no-metrics` to
|
||||||
|
skip). Record outcomes at session close per the
|
||||||
|
[session-close protocol template](templates/session-close-protocol.md).
|
||||||
|
|
||||||
### Fleet correlation
|
### Fleet correlation
|
||||||
|
|
||||||
Project metrics correlate with **Helix Forge** fleet session metrics in
|
Project metrics correlate with **Helix Forge** fleet session metrics in
|
||||||
|
|||||||
33
docs/templates/session-close-protocol.md
vendored
Normal file
33
docs/templates/session-close-protocol.md
vendored
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
# Session-Close Protocol Template
|
||||||
|
|
||||||
|
Reference template for memory-enabled agents. Copy the **Session Close** block
|
||||||
|
into `agents/agent-<name>.md` and adapt the metrics line to the agent.
|
||||||
|
|
||||||
|
## Session Close
|
||||||
|
|
||||||
|
1. Update `## Accumulated Findings`, `## What Worked`, and `## Watch Points` as needed.
|
||||||
|
2. Append one line to `## Session Log`: `YYYY-MM-DD · <summary> · <outcome>`.
|
||||||
|
3. Bump `last_updated` to today and increment `session_count` in memory frontmatter.
|
||||||
|
4. Record session metrics (adjust flags to match outcome):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
kaizen-agentic metrics record <agent-name> --success --time <seconds> --quality <0.0-1.0>
|
||||||
|
# or on failure:
|
||||||
|
kaizen-agentic metrics record <agent-name> --failure --time <seconds>
|
||||||
|
```
|
||||||
|
|
||||||
|
Optional: pass a full JSON record (ADR-004 schema) via stdin:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
echo '{"success": true, "quality_score": 0.9, "primary_metric": {"name": "...", "value": 1.0, "target": 1.0}}' \
|
||||||
|
| kaizen-agentic metrics record <agent-name> --json
|
||||||
|
```
|
||||||
|
|
||||||
|
Use `--idempotency-key <session-id>` to avoid duplicate records if the close
|
||||||
|
protocol runs more than once for the same session.
|
||||||
|
|
||||||
|
## Pilot agents
|
||||||
|
|
||||||
|
`tdd-workflow` is the reference implementation (WP-0003 Part 5). Other
|
||||||
|
memory-enabled agents should adopt this block as the metrics CLI becomes available
|
||||||
|
in their workflows.
|
||||||
@@ -1,5 +1,6 @@
|
|||||||
"""Command-line interface for Kaizen Agentic agent management."""
|
"""Command-line interface for Kaizen Agentic agent management."""
|
||||||
|
|
||||||
|
import json
|
||||||
import sys
|
import sys
|
||||||
import subprocess
|
import subprocess
|
||||||
import contextlib
|
import contextlib
|
||||||
@@ -938,6 +939,118 @@ def memory_clear(agent_name: str, target: str):
|
|||||||
memory_path.parent.rmdir()
|
memory_path.parent.rmdir()
|
||||||
|
|
||||||
|
|
||||||
|
# Parent group for the `metrics record|show|list|export` subcommands.
# The docstring doubles as the group's --help text, so it is kept verbatim.
@cli.group()
def metrics():
    """Manage project-scoped agent metrics (.kaizen/metrics/<agent>/)."""
|
||||||
|
|
||||||
|
|
||||||
|
@metrics.command("record")
@click.argument("agent_name")
@click.option("--target", "-t", default=".", help="Project root (default: current)")
@click.option("--success", "outcome_success", is_flag=True, help="Record successful execution")
@click.option("--failure", "outcome_failure", is_flag=True, help="Record failed execution")
@click.option("--time", "execution_time", type=float, help="Execution time in seconds")
@click.option("--quality", type=float, help="Quality score 0.0–1.0")
@click.option("--session-id", help="Optional session identifier")
@click.option("--idempotency-key", help="Skip append if this key was already recorded")
@click.option("--json", "json_input", is_flag=True, help="Read full record JSON from stdin")
def metrics_record(
    agent_name: str,
    target: str,
    outcome_success: bool,
    outcome_failure: bool,
    execution_time: Optional[float],
    quality: Optional[float],
    session_id: Optional[str],
    idempotency_key: Optional[str],
    json_input: bool,
):
    """Append one execution record for an agent."""
    # Two input modes:
    #   * --json: the record is read from stdin and must be a JSON object.
    #   * flags:  exactly one of --success / --failure is required; --time,
    #             --quality and --session-id are added only when supplied.
    # Any input error is reported on stderr and exits with status 1.
    store = MetricsStore(_project_root(target), agent_name)

    if json_input:
        try:
            payload = json.load(sys.stdin)
        except (json.JSONDecodeError, UnicodeDecodeError) as exc:
            # Empty or malformed stdin is a user error; report it the same way
            # as the other validation failures instead of dumping a traceback.
            click.echo(f"Error: invalid JSON on stdin: {exc}", err=True)
            sys.exit(1)
        if not isinstance(payload, dict):
            click.echo("Error: JSON input must be an object", err=True)
            sys.exit(1)
    else:
        if outcome_success and outcome_failure:
            click.echo("Error: use only one of --success or --failure", err=True)
            sys.exit(1)
        if not outcome_success and not outcome_failure:
            click.echo("Error: specify --success or --failure (or use --json)", err=True)
            sys.exit(1)
        # Exactly one outcome flag is set here, so outcome_success alone
        # encodes the result (False means --failure was given).
        payload = {"success": outcome_success}
        # NOTE(review): the flag-derived fields below are assumed to apply only
        # in flag mode (not merged into a --json payload) — confirm.
        if execution_time is not None:
            payload["execution_time_s"] = execution_time
        if quality is not None:
            payload["quality_score"] = quality
        if session_id:
            payload["session_id"] = session_id

    # A falsy result from append() is reported as a duplicate skipped via the
    # idempotency key.
    if store.append(payload, idempotency_key=idempotency_key):
        click.echo(f"Recorded metrics for '{agent_name}'")
    else:
        click.echo(f"Skipped duplicate record for '{agent_name}' (idempotency key exists)")
|
||||||
|
|
||||||
|
|
||||||
|
@metrics.command("show")
@click.argument("agent_name")
@click.option("--target", "-t", default=".", help="Project root (default: current)")
@click.option("--limit", "-n", default=5, show_default=True, help="Recent executions to show")
def metrics_show(agent_name: str, target: str, limit: int):
    """Print metrics summary and recent executions for an agent."""
    store = MetricsStore(_project_root(target), agent_name)

    # No executions file yet: explain where it was expected and how to create it.
    if not store.executions_path.exists():
        click.echo(f"No metrics found for agent '{agent_name}'.")
        click.echo(f" Expected: {store.agent_dir}")
        click.echo(f" Run: kaizen-agentic memory init {agent_name}")
        return

    # Use the stored summary when there is one, otherwise have the store write it.
    summary = store.read_summary() or store.write_summary()
    click.echo(f"Metrics for '{agent_name}':")
    click.echo("=" * 40)
    click.echo(json.dumps(summary, indent=2))

    # A bare records[-limit:] is wrong at the edges: with limit == 0 the slice
    # is records[0:] (everything), and a negative limit drops leading records.
    # Treat any non-positive limit as "show no recent executions".
    recent = store.read_executions()[-limit:] if limit > 0 else []
    if recent:
        click.echo("\nRecent executions:")
        for record in recent:
            click.echo(json.dumps(record, sort_keys=True))
|
||||||
|
|
||||||
|
|
||||||
|
@metrics.command("list")
@click.option("--target", "-t", default=".", help="Project root (default: current)")
def metrics_list(target: str):
    """List agents with metrics in the current project."""
    # Resolve the project root once; it does not change between agents.
    root = _project_root(target)
    agents = MetricsStore.list_agents(root)
    if not agents:
        click.echo("No agent metrics found in this project.")
        click.echo(" Run: kaizen-agentic memory init <agent>")
        return

    click.echo("Agents with metrics:")
    for name in agents:
        store = MetricsStore(root, name)
        summary = store.read_summary()
        # Prefer the summarized count. If there is no summary, or it lacks
        # "execution_count", count the raw execution records instead of
        # failing with a KeyError.
        count = summary.get("execution_count") if summary else None
        if count is None:
            count = len(store.read_executions())
        click.echo(f" • {name} ({count} executions)")
|
||||||
|
|
||||||
|
|
||||||
|
@metrics.command("export")
@click.argument("agent_name")
@click.option("--target", "-t", default=".", help="Project root (default: current)")
def metrics_export(agent_name: str, target: str):
    """Dump executions.jsonl for an agent to stdout."""
    executions_file = MetricsStore(_project_root(target), agent_name).executions_path

    if executions_file.exists():
        # Emit the file exactly as stored; nl=False avoids adding a newline.
        click.echo(executions_file.read_text(encoding="utf-8"), nl=False)
        return

    # Nothing recorded for this agent: report on stderr and exit non-zero.
    click.echo(f"No metrics found for agent '{agent_name}'.", err=True)
    sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
@cli.group()
|
@cli.group()
|
||||||
def protocols():
|
def protocols():
|
||||||
"""Browse agent protocol runbooks (agents/protocols/<agent>/<slug>.md)."""
|
"""Browse agent protocol runbooks (agents/protocols/<agent>/<slug>.md)."""
|
||||||
@@ -1011,8 +1124,12 @@ def protocols_show(agent_name: str, slug: str):
|
|||||||
click.echo(protocol_path.read_text())
|
click.echo(protocol_path.read_text())
|
||||||
|
|
||||||
|
|
||||||
|
def _project_root(target: str) -> Path:
|
||||||
|
return Path(target).resolve()
|
||||||
|
|
||||||
|
|
||||||
def _memory_path(target: str, agent_name: str) -> Path:
    """Return the path of an agent's memory file under the project root."""
    root = _project_root(target)
    return root.joinpath(".kaizen", "agents", agent_name, "memory.md")
|
||||||
|
|
||||||
|
|
||||||
def _today() -> str:
|
def _today() -> str:
|
||||||
|
|||||||
123
tests/test_metrics_cli.py
Normal file
123
tests/test_metrics_cli.py
Normal file
@@ -0,0 +1,123 @@
|
|||||||
|
"""CLI tests for project-scoped metrics commands."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from click.testing import CliRunner
|
||||||
|
|
||||||
|
from kaizen_agentic.cli import cli
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def runner() -> CliRunner:
    """Provide a fresh click test runner for each test."""
    cli_runner = CliRunner()
    return cli_runner
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def project_dir(tmp_path: Path) -> Path:
    """Create an empty ``demo-project`` directory under pytest's tmp_path."""
    demo_root = tmp_path.joinpath("demo-project")
    demo_root.mkdir()
    return demo_root
|
||||||
|
|
||||||
|
|
||||||
|
class TestMetricsCli:
    """CLI-level tests for the ``metrics`` commands and ``memory init`` scaffolding."""

    @staticmethod
    def _run(runner: CliRunner, *argv: str, **kwargs):
        """Invoke the top-level ``cli`` with *argv* and return the click result."""
        return runner.invoke(cli, list(argv), **kwargs)

    def test_record_show_list_export_flow(self, runner: CliRunner, project_dir: Path):
        # One recorded execution must be visible through show, list and export.
        target = str(project_dir)
        scoped = ("--target", target)

        record = self._run(
            runner,
            "metrics",
            "record",
            "tdd-workflow",
            *scoped,
            "--success",
            "--time",
            "42",
            "--quality",
            "0.85",
        )
        assert record.exit_code == 0
        assert "Recorded metrics" in record.output

        show = self._run(runner, "metrics", "show", "tdd-workflow", *scoped)
        assert show.exit_code == 0
        assert '"execution_count": 1' in show.output
        assert '"success": true' in show.output

        listed = self._run(runner, "metrics", "list", *scoped)
        assert listed.exit_code == 0
        assert "tdd-workflow" in listed.output

        export = self._run(runner, "metrics", "export", "tdd-workflow", *scoped)
        assert export.exit_code == 0
        lines = [text for text in export.output.splitlines() if text.strip()]
        assert len(lines) == 1
        assert json.loads(lines[0])["quality_score"] == 0.85

    def test_record_json_from_stdin(self, runner: CliRunner, project_dir: Path):
        # A full record piped on stdin is stored and shown back.
        target = str(project_dir)
        payload = json.dumps({"success": False, "execution_time_s": 9.5})

        result = self._run(
            runner, "metrics", "record", "coach", "--target", target, "--json", input=payload
        )
        assert result.exit_code == 0

        show = self._run(runner, "metrics", "show", "coach", "--target", target)
        assert '"success": false' in show.output

    def test_record_idempotency_key_skips_duplicate(
        self, runner: CliRunner, project_dir: Path
    ):
        # Recording twice with the same key must leave a single stored line.
        target = str(project_dir)
        argv = (
            "metrics",
            "record",
            "coach",
            "--target",
            target,
            "--success",
            "--idempotency-key",
            "sess-abc",
        )
        first = self._run(runner, *argv)
        second = self._run(runner, *argv)
        assert first.exit_code == 0
        assert second.exit_code == 0
        assert "Skipped duplicate" in second.output

        export = self._run(runner, "metrics", "export", "coach", "--target", target)
        assert len(export.output.strip().splitlines()) == 1

    def test_record_requires_outcome_without_json(self, runner: CliRunner, project_dir: Path):
        # Without --json, omitting both outcome flags is a usage error.
        result = self._run(
            runner, "metrics", "record", "tdd-workflow", "--target", str(project_dir)
        )
        assert result.exit_code != 0
        assert "--success or --failure" in result.output

    def test_memory_init_scaffolds_metrics(self, runner: CliRunner, project_dir: Path):
        # `memory init` creates the metrics directory and executions file by default.
        result = self._run(
            runner, "memory", "init", "tdd-workflow", "--target", str(project_dir)
        )
        assert result.exit_code == 0
        metrics_dir = project_dir.joinpath(".kaizen", "metrics", "tdd-workflow")
        assert metrics_dir.exists()
        assert metrics_dir.joinpath("executions.jsonl").exists()

    def test_memory_init_no_metrics_flag(self, runner: CliRunner, project_dir: Path):
        # --no-metrics must leave the metrics directory uncreated.
        result = self._run(
            runner, "memory", "init", "coach", "--target", str(project_dir), "--no-metrics"
        )
        assert result.exit_code == 0
        assert not project_dir.joinpath(".kaizen", "metrics", "coach").exists()
|
||||||
@@ -121,10 +121,10 @@ kaizen-agentic metrics export <agent> # Dump executions.jsonl to stdout
|
|||||||
|
|
||||||
### Tasks
|
### Tasks
|
||||||
|
|
||||||
- [ ] T05 — Implement `metrics` CLI command group (record, show, list, export)
|
- [x] T05 — Implement `metrics` CLI command group (record, show, list, export)
|
||||||
- [ ] T06 — Integrate `metrics record` into session-close protocol template for pilot agents
|
- [x] T06 — Integrate `metrics record` into session-close protocol template for pilot agents
|
||||||
- [ ] T07 — CLI tests for metrics commands (click.testing, temp project dir)
|
- [x] T07 — CLI tests for metrics commands (click.testing, temp project dir)
|
||||||
- [ ] T08 — Update `docs/CLI_CHEAT_SHEET.md` and `docs/agency-framework.md` with metrics section
|
- [x] T08 — Update `docs/CLI_CHEAT_SHEET.md` and `docs/agency-framework.md` with metrics section
|
||||||
|
|
||||||
### Definition of done
|
### Definition of done
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user