From 04fdc249f547416693ea0a344374f7abf0fe410b Mon Sep 17 00:00:00 2001 From: tegwick Date: Tue, 16 Jun 2026 01:46:51 +0200 Subject: [PATCH] Bridge Coach memory brief with project metrics summaries. Add Performance Summary block to memory brief, document metrics synthesis in agent-coach, and add e2e and CLI tests for qualitative plus quantitative briefs. --- agents/agent-coach.md | 21 +++++++++ docs/agency-framework.md | 4 +- src/kaizen_agentic/cli.py | 13 +++++- src/kaizen_agentic/metrics.py | 30 +++++++++++++ tests/test_e2e_agency_framework.py | 43 +++++++++++++++++++ tests/test_metrics_cli.py | 24 +++++++++++ ...kaizen-agentic-WP-0003-measurement-loop.md | 6 +-- 7 files changed, 134 insertions(+), 7 deletions(-) diff --git a/agents/agent-coach.md b/agents/agent-coach.md index db95a4c..7e4a93b 100644 --- a/agents/agent-coach.md +++ b/agents/agent-coach.md @@ -83,6 +83,24 @@ root. Each follows ADR-002 structure: When synthesising, weight `## Watch Points` and `## Open Threads` most heavily — these are the signals most likely to be actionable for another agent. +### Project metrics (ADR-004) + +Quantitative performance data lives at `.kaizen/metrics//summary.json`. +`kaizen-agentic memory brief ` includes a `## Performance Summary` block +when metrics exist. + +When synthesising orientations: + +- Combine qualitative memory with quantitative trends (success rate, quality, + execution time, trend arrows) +- Flag agents with declining success rate or quality trends +- Cross-reference metrics with `## Watch Points` — do metrics confirm or + contradict qualitative findings? +- Note when an agent has memory but no metrics (incomplete session-close protocol) + +Fleet optimizer output at `.kaizen/metrics/optimizer/analysis.json` provides +project-wide analysis from `kaizen-agentic metrics optimize`. + --- ## Output Format @@ -115,6 +133,9 @@ Project: Generated: Sources: +### Performance Summary +/ when available — success rate, quality, trends> + ### What to Know First <3–5 most important facts for this agent> diff --git a/docs/agency-framework.md b/docs/agency-framework.md index 8da4abf..d63bf78 100644 --- a/docs/agency-framework.md +++ b/docs/agency-framework.md @@ -262,8 +262,8 @@ kaizen-agentic metrics export # Dump executions.jsonl kaizen-agentic metrics optimize [agent] # Run optimizer on project metrics (≥10 records) ``` -`memory brief` includes a `## Performance Summary` when metrics exist (WP-0003 -Part 4). +`memory brief` includes a `## Performance Summary` when metrics exist (success +rate, avg quality, execution time, trend arrows). `memory init` scaffolds `.kaizen/metrics//` by default (`--no-metrics` to skip). Record outcomes at session close per diff --git a/src/kaizen_agentic/cli.py b/src/kaizen_agentic/cli.py index a7dae4b..11a2620 100644 --- a/src/kaizen_agentic/cli.py +++ b/src/kaizen_agentic/cli.py @@ -11,7 +11,7 @@ from typing import List, Optional from .registry import AgentRegistry, AgentCategory from .installer import AgentInstaller, ProjectInitializer, InstallationConfig -from .metrics import MetricsStore, OptimizerStore +from .metrics import MetricsStore, OptimizerStore, performance_summary_markdown from .optimization import OptimizationLoop, MIN_SAMPLES_FOR_RECOMMENDATIONS @@ -892,12 +892,21 @@ def memory_brief(agent_name: str, target: str, raw: bool): click.echo(f"Sources: {', '.join(sources) if sources else 'none'}") click.echo() - if not sources: + metrics_store = MetricsStore(project_root, agent_name) + metrics_summary = metrics_store.read_summary() + if metrics_summary is None and metrics_store.executions_path.exists(): + metrics_summary = metrics_store.write_summary() + + if not sources and not metrics_summary: click.echo("No agent memory files found in this project.") click.echo(f" Run: kaizen-agentic memory init {agent_name}") click.echo(" Then load the coach agent (agents/agent-coach.md) for synthesis.") return + performance_block = performance_summary_markdown(metrics_summary or {}) + if performance_block: + click.echo(performance_block) + # Own memory section if own_memory: click.echo("### Your Memory") diff --git a/src/kaizen_agentic/metrics.py b/src/kaizen_agentic/metrics.py index ea37a7a..157a4aa 100644 --- a/src/kaizen_agentic/metrics.py +++ b/src/kaizen_agentic/metrics.py @@ -21,6 +21,36 @@ def _parse_timestamp(value: str) -> datetime: return datetime.fromisoformat(normalized) +_TREND_ARROWS = {"up": "↑", "down": "↓", "stable": "→", "unknown": "?"} + + +def performance_summary_markdown(summary: Dict[str, Any]) -> str: + """Format ADR-004 summary.json as a Coach brief markdown section.""" + if not summary or summary.get("execution_count", 0) == 0: + return "" + + trend = summary.get("trend", {}) + success_trend = trend.get("success_rate", "unknown") + quality_trend = trend.get("quality_score", "unknown") + + lines = [ + "## Performance Summary", + "", + f"- Executions: {summary['execution_count']}", + ( + f"- Success rate: {summary['success_rate']:.1%} " + f"({_TREND_ARROWS.get(success_trend, '?')} {success_trend})" + ), + f"- Avg quality: {summary['avg_quality_score']:.2f} " + f"({_TREND_ARROWS.get(quality_trend, '?')} {quality_trend})", + f"- Avg execution time: {summary['avg_execution_time_s']:.1f}s", + ] + if summary.get("last_execution"): + lines.append(f"- Last execution: {summary['last_execution']}") + lines.append("") + return "\n".join(lines) + + def _trend_direction(recent: List[float], prior: List[float]) -> str: if not recent: return "unknown" diff --git a/tests/test_e2e_agency_framework.py b/tests/test_e2e_agency_framework.py index ab478f7..06553c0 100644 --- a/tests/test_e2e_agency_framework.py +++ b/tests/test_e2e_agency_framework.py @@ -209,6 +209,49 @@ class TestMemoryBrief: # Raw mode should not include the orientation header assert "Orientation Brief for:" not in result.output + def test_brief_includes_performance_summary_with_memory_and_metrics(self, project): + self._populate(project) + runner = CliRunner() + runner.invoke( + cli, + [ + "metrics", + "record", + "sys-medic", + "--target", + str(project), + "--success", + "--time", + "30", + "--quality", + "0.88", + ], + ) + runner.invoke( + cli, + [ + "metrics", + "record", + "project-management", + "--target", + str(project), + "--success", + "--time", + "15", + "--quality", + "0.95", + ], + ) + + result = runner.invoke(cli, ["memory", "brief", "sys-medic", "--target", str(project)]) + + assert result.exit_code == 0 + assert "## Performance Summary" in result.output + assert "Success rate:" in result.output + assert "tegpi-01" in result.output + assert "Context From Other Agents" in result.output + assert "project-management" in result.output + class TestMemoryClear: def test_clear_removes_file(self, project): diff --git a/tests/test_metrics_cli.py b/tests/test_metrics_cli.py index f811b54..c739fdb 100644 --- a/tests/test_metrics_cli.py +++ b/tests/test_metrics_cli.py @@ -114,6 +114,30 @@ class TestMetricsCli: assert metrics_dir.exists() assert (metrics_dir / "executions.jsonl").exists() + def test_memory_brief_includes_performance_summary( + self, runner: CliRunner, project_dir: Path + ): + target = str(project_dir) + runner.invoke(cli, ["memory", "init", "tdd-workflow", "--target", target]) + runner.invoke( + cli, + [ + "metrics", + "record", + "tdd-workflow", + "--target", + target, + "--success", + "--quality", + "0.9", + ], + ) + + result = runner.invoke(cli, ["memory", "brief", "tdd-workflow", "--target", target]) + assert result.exit_code == 0 + assert "## Performance Summary" in result.output + assert "Success rate: 100.0%" in result.output + def test_memory_init_no_metrics_flag(self, runner: CliRunner, project_dir: Path): result = runner.invoke( cli, diff --git a/workplans/kaizen-agentic-WP-0003-measurement-loop.md b/workplans/kaizen-agentic-WP-0003-measurement-loop.md index 9824cfd..ec02597 100644 --- a/workplans/kaizen-agentic-WP-0003-measurement-loop.md +++ b/workplans/kaizen-agentic-WP-0003-measurement-loop.md @@ -160,9 +160,9 @@ Unify qualitative memory and quantitative metrics in the orientation path. ### Tasks -- [ ] T14 — Extend `memory brief` to include metrics summary for target agent (recent success rate, avg quality, trend arrow) -- [ ] T15 — Extend `agent-coach.md` to reference metrics context in synthesis instructions -- [ ] T16 — E2e test: populate memory + metrics for two agents → `memory brief` includes both qualitative and quantitative sections +- [x] T14 — Extend `memory brief` to include metrics summary for target agent (recent success rate, avg quality, trend arrow) +- [x] T15 — Extend `agent-coach.md` to reference metrics context in synthesis instructions +- [x] T16 — E2e test: populate memory + metrics for two agents → `memory brief` includes both qualitative and quantitative sections ### Definition of done