diff --git a/docs/CLI_CHEAT_SHEET.md b/docs/CLI_CHEAT_SHEET.md index cf0586f..da00986 100644 --- a/docs/CLI_CHEAT_SHEET.md +++ b/docs/CLI_CHEAT_SHEET.md @@ -48,6 +48,27 @@ kaizen-agentic status # Show current project status kaizen-agentic validate # Validate agent installation ``` +### Project Metrics (ADR-004) +```bash +# Record outcome at session close +kaizen-agentic metrics record tdd-workflow --success --time 120 --quality 0.9 +kaizen-agentic metrics record tdd-workflow --failure --time 45 + +# Full JSON record from stdin +echo '{"success": true, "quality_score": 1.0}' | kaizen-agentic metrics record tdd-workflow --json + +# Inspect metrics +kaizen-agentic metrics show tdd-workflow +kaizen-agentic metrics list +kaizen-agentic metrics export tdd-workflow + +# Scaffold memory + metrics together +kaizen-agentic memory init tdd-workflow +kaizen-agentic memory init tdd-workflow --no-metrics # memory only +``` + +Session-close template: `docs/templates/session-close-protocol.md` + ### Information ```bash # List templates diff --git a/docs/agency-framework.md b/docs/agency-framework.md index 72a684f..a44dae5 100644 --- a/docs/agency-framework.md +++ b/docs/agency-framework.md @@ -265,6 +265,10 @@ kaizen-agentic metrics optimize [agent] # Run optimizer on project metrics `memory brief` includes a `## Performance Summary` when metrics exist (WP-0003 Part 4). +`memory init` scaffolds `.kaizen/metrics/<agent>/` by default (`--no-metrics` to +skip). Record outcomes at session close per +[session-close protocol template](templates/session-close-protocol.md). + ### Fleet correlation Project metrics correlate with **Helix Forge** fleet session metrics in diff --git a/docs/templates/session-close-protocol.md b/docs/templates/session-close-protocol.md new file mode 100644 index 0000000..4e005ba --- /dev/null +++ b/docs/templates/session-close-protocol.md @@ -0,0 +1,33 @@ +# Session-Close Protocol Template + +Reference template for memory-enabled agents. 
Copy the **Session Close** block +into `agents/agent-<name>.md` and adapt the metrics line to the agent. + +## Session Close + +1. Update `## Accumulated Findings`, `## What Worked`, and `## Watch Points` as needed. +2. Append one line to `## Session Log`: `YYYY-MM-DD · <task> · <outcome>`. +3. Bump `last_updated` to today and increment `session_count` in memory frontmatter. +4. Record session metrics (adjust flags to match outcome): + +```bash +kaizen-agentic metrics record <agent> --success --time <seconds> --quality <0.0-1.0> +# or on failure: +kaizen-agentic metrics record <agent> --failure --time <seconds> +``` + +Optional: pass a full JSON record (ADR-004 schema) via stdin: + +```bash +echo '{"success": true, "quality_score": 0.9, "primary_metric": {"name": "...", "value": 1.0, "target": 1.0}}' \ + | kaizen-agentic metrics record <agent> --json +``` + +Use `--idempotency-key <key>` to avoid duplicate records if the close +protocol runs more than once for the same session. + +## Pilot agents + +`tdd-workflow` is the reference implementation (WP-0003 Part 5). Other +memory-enabled agents should adopt this block as the metrics CLI becomes available +in their workflows. 
\ No newline at end of file
diff --git a/src/kaizen_agentic/cli.py b/src/kaizen_agentic/cli.py
index ef80a65..776597f 100644
--- a/src/kaizen_agentic/cli.py
+++ b/src/kaizen_agentic/cli.py
@@ -1,5 +1,6 @@
 """Command-line interface for Kaizen Agentic agent management."""
 
+import json
 import sys
 import subprocess
 import contextlib
@@ -938,6 +939,133 @@ def memory_clear(agent_name: str, target: str):
         memory_path.parent.rmdir()
 
 
+@cli.group()
+def metrics():
+    """Manage project-scoped agent metrics (.kaizen/metrics/<agent>/)."""
+    pass
+
+
+@metrics.command("record")
+@click.argument("agent_name")
+@click.option("--target", "-t", default=".", help="Project root (default: current)")
+@click.option("--success", "outcome_success", is_flag=True, help="Record successful execution")
+@click.option("--failure", "outcome_failure", is_flag=True, help="Record failed execution")
+@click.option("--time", "execution_time", type=float, help="Execution time in seconds")
+@click.option("--quality", type=click.FloatRange(0.0, 1.0), help="Quality score 0.0–1.0")
+@click.option("--session-id", help="Optional session identifier")
+@click.option("--idempotency-key", help="Skip append if this key was already recorded")
+@click.option("--json", "json_input", is_flag=True, help="Read full record JSON from stdin")
+def metrics_record(
+    agent_name: str,
+    target: str,
+    outcome_success: bool,
+    outcome_failure: bool,
+    execution_time: Optional[float],
+    quality: Optional[float],
+    session_id: Optional[str],
+    idempotency_key: Optional[str],
+    json_input: bool,
+):
+    """Append one execution record for an agent."""
+    store = MetricsStore(_project_root(target), agent_name)
+
+    if json_input:
+        # --json: the record is read from stdin and must be a JSON object.
+        try:
+            payload = json.load(sys.stdin)
+        except json.JSONDecodeError as exc:
+            # Report malformed or empty input as a CLI error, not a traceback.
+            click.echo(f"Error: invalid JSON on stdin: {exc}", err=True)
+            sys.exit(1)
+        if not isinstance(payload, dict):
+            click.echo("Error: JSON input must be an object", err=True)
+            sys.exit(1)
+    else:
+        # Flag mode: exactly one of --success / --failure is required.
+        if outcome_success and outcome_failure:
+            click.echo("Error: use only one of --success or --failure", err=True)
+            sys.exit(1)
+        if not outcome_success and not outcome_failure:
+            click.echo("Error: specify --success or --failure (or use --json)", err=True)
+            sys.exit(1)
+        payload = {"success": outcome_success}
+        if execution_time is not None:
+            payload["execution_time_s"] = execution_time
+        if quality is not None:
+            payload["quality_score"] = quality
+        if session_id:
+            payload["session_id"] = session_id
+
+    # A falsy result is reported to the user as a skipped duplicate.
+    if store.append(payload, idempotency_key=idempotency_key):
+        click.echo(f"Recorded metrics for '{agent_name}'")
+    else:
+        click.echo(f"Skipped duplicate record for '{agent_name}' (idempotency key exists)")
+
+
+@metrics.command("show")
+@click.argument("agent_name")
+@click.option("--target", "-t", default=".", help="Project root (default: current)")
+@click.option("--limit", "-n", default=5, show_default=True, help="Recent executions to show")
+def metrics_show(agent_name: str, target: str, limit: int):
+    """Print metrics summary and recent executions for an agent."""
+    store = MetricsStore(_project_root(target), agent_name)
+
+    if not store.executions_path.exists():
+        click.echo(f"No metrics found for agent '{agent_name}'.")
+        click.echo(f" Expected: {store.agent_dir}")
+        click.echo(f" Run: kaizen-agentic memory init {agent_name}")
+        return
+
+    # Use the stored summary, or write_summary()'s result when there is none.
+    summary = store.read_summary() or store.write_summary()
+    click.echo(f"Metrics for '{agent_name}':")
+    click.echo("=" * 40)
+    click.echo(json.dumps(summary, indent=2))
+
+    records = store.read_executions()
+    # records[-limit:] alone would print every record for --limit 0
+    # (-0 == 0, so the slice is records[0:]); treat limit <= 0 as "none".
+    recent = records[-limit:] if limit > 0 else []
+    if recent:
+        click.echo("\nRecent executions:")
+        for record in recent:
+            click.echo(json.dumps(record, sort_keys=True))
+
+
+@metrics.command("list")
+@click.option("--target", "-t", default=".", help="Project root (default: current)")
+def metrics_list(target: str):
+    """List agents with metrics in the current project."""
+    root = _project_root(target)
+    agents = MetricsStore.list_agents(root)
+    if not agents:
+        click.echo("No agent metrics found in this project.")
+        click.echo(" Run: kaizen-agentic memory init <agent>")
+        return
+
+    click.echo("Agents with metrics:")
+    for name in agents:
+        store = MetricsStore(root, name)
+        summary = store.read_summary()
+        # Prefer the summary's count; otherwise count the raw records.
+        count = summary["execution_count"] if summary else len(store.read_executions())
+        click.echo(f" • {name} ({count} executions)")
+
+
+@metrics.command("export")
+@click.argument("agent_name")
+@click.option("--target", "-t", default=".", help="Project root (default: current)")
+def metrics_export(agent_name: str, target: str):
+    """Dump executions.jsonl for an agent to stdout."""
+    store = MetricsStore(_project_root(target), agent_name)
+    if not store.executions_path.exists():
+        click.echo(f"No metrics found for agent '{agent_name}'.", err=True)
+        sys.exit(1)
+    # nl=False: emit the file text as-is, without an extra trailing newline.
+    click.echo(store.executions_path.read_text(encoding="utf-8"), nl=False)
+
+
 @cli.group()
 def protocols():
     """Browse agent protocol runbooks (agents/protocols//.md)."""
@@ -1011,8 +1139,13 @@ def protocols_show(agent_name: str, slug: str):
     click.echo(protocol_path.read_text())
 
 
+def _project_root(target: str) -> Path:
+    """Return *target* as an absolute, resolved path."""
+    return Path(target).resolve()
+
+
 def _memory_path(target: str, agent_name: str) -> Path:
-    return Path(target).resolve() / ".kaizen" / "agents" / agent_name / "memory.md"
+    return _project_root(target) / ".kaizen" / "agents" / agent_name / "memory.md"
 
 
 def _today() -> str:
diff --git a/tests/test_metrics_cli.py b/tests/test_metrics_cli.py
new file mode 100644
index 0000000..f811b54
--- /dev/null
+++ b/tests/test_metrics_cli.py
@@ -0,0 +1,145 @@
+"""CLI tests for project-scoped metrics commands."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+import pytest
+from click.testing import CliRunner
+
+from kaizen_agentic.cli import cli
+
+
+@pytest.fixture
+def runner() -> CliRunner:
+    return CliRunner()
+
+
+@pytest.fixture
+def project_dir(tmp_path: Path) -> Path:
+    root = tmp_path / "demo-project"
+    root.mkdir()
+    return root
+
+
+class TestMetricsCli:
+    def test_record_show_list_export_flow(self, runner: CliRunner, project_dir: Path):
+        target = str(project_dir)
+
+        record = runner.invoke(
+            cli,
+            [
+                "metrics",
+                "record",
+                "tdd-workflow",
+                "--target",
+                target,
+                "--success",
+                "--time",
+                "42",
+                "--quality",
+                "0.85",
+            ],
+        )
+        assert record.exit_code == 0
+        assert "Recorded metrics" in record.output
+
+        show = runner.invoke(cli, ["metrics", "show", "tdd-workflow", "--target", target])
+        assert show.exit_code == 0
+        assert '"execution_count": 1' in show.output
+        assert '"success": true' in show.output
+
+        listed = runner.invoke(cli, ["metrics", "list", "--target", target])
+        assert listed.exit_code == 0
+        assert "tdd-workflow" in listed.output
+
+        export = runner.invoke(cli, ["metrics", "export", "tdd-workflow", "--target", target])
+        assert export.exit_code == 0
+        lines = [line for line in export.output.splitlines() if line.strip()]
+        assert len(lines) == 1
+        assert json.loads(lines[0])["quality_score"] == 0.85
+
+    def test_record_json_from_stdin(self, runner: CliRunner, project_dir: Path):
+        payload = json.dumps({"success": False, "execution_time_s": 9.5})
+        result = runner.invoke(
+            cli,
+            ["metrics", "record", "coach", "--target", str(project_dir), "--json"],
+            input=payload,
+        )
+        assert result.exit_code == 0
+
+        show = runner.invoke(cli, ["metrics", "show", "coach", "--target", str(project_dir)])
+        assert show.exit_code == 0
+        assert '"success": false' in show.output
+
+    def test_record_json_rejects_malformed_input(self, runner: CliRunner, project_dir: Path):
+        """Malformed stdin must produce a clean error, not a traceback."""
+        result = runner.invoke(
+            cli,
+            ["metrics", "record", "coach", "--target", str(project_dir), "--json"],
+            input="{not json",
+        )
+        assert result.exit_code == 1
+        assert "invalid JSON" in result.output
+
+    def test_record_idempotency_key_skips_duplicate(
+        self, runner: CliRunner, project_dir: Path
+    ):
+        args = [
+            "metrics",
+            "record",
+            "coach",
+            "--target",
+            str(project_dir),
+            "--success",
+            "--idempotency-key",
+            "sess-abc",
+        ]
+        first = runner.invoke(cli, args)
+        second = runner.invoke(cli, args)
+        assert first.exit_code == 0
+        assert second.exit_code == 0
+        assert "Skipped duplicate" in second.output
+
+        export = runner.invoke(
+            cli, ["metrics", "export", "coach", "--target", str(project_dir)]
+        )
+        assert len(export.output.strip().splitlines()) == 1
+
+    def test_record_requires_outcome_without_json(self, runner: CliRunner, project_dir: Path):
+        result = runner.invoke(
+            cli,
+            ["metrics", "record", "tdd-workflow", "--target", str(project_dir)],
+        )
+        assert result.exit_code != 0
+        assert "--success or --failure" in result.output
+
+    def test_show_limit_zero_prints_no_executions(self, runner: CliRunner, project_dir: Path):
+        """--limit 0 must not fall through to printing every record."""
+        target = str(project_dir)
+        runner.invoke(cli, ["metrics", "record", "coach", "--target", target, "--success"])
+
+        show = runner.invoke(
+            cli, ["metrics", "show", "coach", "--target", target, "--limit", "0"]
+        )
+        assert show.exit_code == 0
+        assert "Recent executions" not in show.output
+
+    def test_memory_init_scaffolds_metrics(self, runner: CliRunner, project_dir: Path):
+        result = runner.invoke(
+            cli,
+            ["memory", "init", "tdd-workflow", "--target", str(project_dir)],
+        )
+        assert result.exit_code == 0
+        metrics_dir = project_dir / ".kaizen" / "metrics" / "tdd-workflow"
+        assert metrics_dir.exists()
+        assert (metrics_dir / "executions.jsonl").exists()
+
+    def test_memory_init_no_metrics_flag(self, runner: CliRunner, project_dir: Path):
+        result = runner.invoke(
+            cli,
+            ["memory", "init", "coach", "--target", str(project_dir), "--no-metrics"],
+        )
+        assert result.exit_code == 0
+        assert not (project_dir / ".kaizen" / "metrics" / "coach").exists()
diff --git a/workplans/kaizen-agentic-WP-0003-measurement-loop.md b/workplans/kaizen-agentic-WP-0003-measurement-loop.md
index 9fd5341..67e68f5 100644
--- a/workplans/kaizen-agentic-WP-0003-measurement-loop.md
+++ b/workplans/kaizen-agentic-WP-0003-measurement-loop.md
@@ -121,10 +121,10 @@ kaizen-agentic metrics export # Dump executions.jsonl to stdout
 
 ### Tasks
 
-- [ ] T05 — Implement `metrics` CLI command group (record, show, list, export)
-- [ ] T06 — Integrate `metrics record` into session-close protocol template for pilot agents
-- [ ] T07 — CLI tests for metrics commands (click.testing, temp project dir)
-- [ ] T08 — Update `docs/CLI_CHEAT_SHEET.md` and `docs/agency-framework.md` with metrics section
+- [x] T05 — Implement `metrics` CLI command group (record, show, list, export)
+- [x] T06 — Integrate `metrics record` into session-close protocol template for pilot agents
+- [x] T07 — CLI tests for metrics commands (click.testing, temp project dir)
+- [x] T08 — Update `docs/CLI_CHEAT_SHEET.md` and `docs/agency-framework.md` with metrics section
 
 ### Definition of done
 