From fd2edfbe6c18effd08686b4ad66bae4f275cc648 Mon Sep 17 00:00:00 2001 From: tegwick Date: Tue, 16 Jun 2026 01:48:43 +0200 Subject: [PATCH] WP-0003 Part 5: tdd-workflow metrics pilot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add metrics frontmatter and session-close recording to tdd-workflow, document the reference implementation in wiki/AboutKaizenAgents.md, and add an e2e test covering record → show → optimize → brief. --- agents/agent-tdd-workflow.md | 32 +++++ tests/test_e2e_agency_framework.py | 130 ++++++++++++++++++ wiki/AboutKaizenAgents.md | 76 ++++++++-- ...kaizen-agentic-WP-0003-measurement-loop.md | 10 +- 4 files changed, 231 insertions(+), 17 deletions(-) diff --git a/agents/agent-tdd-workflow.md b/agents/agent-tdd-workflow.md index e7591e3..3267249 100644 --- a/agents/agent-tdd-workflow.md +++ b/agents/agent-tdd-workflow.md @@ -2,6 +2,21 @@ name: tdd-workflow description: Expert guidance for the TDD8 workflow methodology, specializing in the comprehensive ISSUE-TEST-RED-GREEN-REFACTOR-DOCUMENT-REFINE-PUBLISH cycle with sophisticated sidequest management and proper test organization. category: development-process +memory: enabled +metrics: + primary: + name: test_pass_rate + description: Share of acceptance-criteria tests passing at PUBLISH + measurement: passing_tests / total_tests for the active issue workspace + target: 1.0 + secondary: + - name: cycle_time_s + description: Wall-clock time from ISSUE start to PUBLISH + measurement: Session duration in seconds (execution_time_s in ADR-004) + collection: + frequency: per_execution + storage: .kaizen/metrics/tdd-workflow/ + retention: 180d --- # TDDAi Assistant Agent @@ -372,3 +387,20 @@ The comprehensive 8-step development methodology that transforms requirements in 2. Update `## What Worked` and `## Watch Points` as needed. 3. Append one line to `## Session Log`: `YYYY-MM-DD · · `. 4. Bump `last_updated` to today and increment `session_count`. +5. 
Record session metrics (ADR-004; adjust values to match outcome): + +```bash +# Successful PUBLISH — all acceptance tests green: +echo '{"success": true, "execution_time_s": <seconds>, "quality_score": 0.9, "primary_metric": {"name": "test_pass_rate", "value": 1.0, "target": 1.0}, "metadata": {"issue": "<issue-number>", "phase": "PUBLISH"}}' \ + | kaizen-agentic metrics record tdd-workflow --json --idempotency-key <session-id> + +# Incomplete or failed cycle: +echo '{"success": false, "execution_time_s": <seconds>, "quality_score": 0.4, "primary_metric": {"name": "test_pass_rate", "value": <pass-rate>, "target": 1.0}, "metadata": {"issue": "<issue-number>", "phase": "<last-phase>"}}' \ + | kaizen-agentic metrics record tdd-workflow --json --idempotency-key <session-id> +``` + +Shorthand when only outcome and duration matter: + +```bash +kaizen-agentic metrics record tdd-workflow --success --time <seconds> --quality <0.0-1.0> +``` diff --git a/tests/test_e2e_agency_framework.py b/tests/test_e2e_agency_framework.py index 06553c0..670c25f 100644 --- a/tests/test_e2e_agency_framework.py +++ b/tests/test_e2e_agency_framework.py @@ -8,8 +8,10 @@ Tests the full workflow: 4. memory brief — verify orientation brief includes own memory and cross-agent context 5. protocols list / show — verify protocol discovery works 6. memory clear — verify wipe works + 7. 
tdd-workflow pilot — record → show → optimize → brief (WP-0003 Part 5) """ +import json import textwrap from pathlib import Path @@ -17,6 +19,8 @@ import pytest from click.testing import CliRunner from kaizen_agentic.cli import cli +from kaizen_agentic.metrics import MetricsStore, OptimizerStore +from kaizen_agentic.optimization import MIN_SAMPLES_FOR_RECOMMENDATIONS # --------------------------------------------------------------------------- @@ -67,6 +71,34 @@ def _sys_medic_memory() -> str: """) +def _tdd_workflow_memory() -> str: + """Realistic tdd-workflow memory after two issue cycles.""" + return textwrap.dedent("""\ + --- + agent: tdd-workflow + project: demo-app + last_updated: 2026-06-16 + session_count: 2 + --- + + ## Project Context + Python service using TDD8 with Gitea issues and pytest. + + ## Accumulated Findings + - Sidequests from REFINE often block PUBLISH when lint debt accumulates + + ## What Worked + - `make tdd-start NUM=X` before writing tests keeps RED phase focused + + ## Watch Points + - Flaky integration tests under parallel pytest (-n auto) + + ## Session Log + 2026-06-10 · issue 12 metrics store · PUBLISH complete · success + 2026-06-16 · issue 15 CLI flags · stalled at REFINE · partial + """) + + def _project_management_memory() -> str: """Minimal project-management agent memory.""" return textwrap.dedent("""\ @@ -275,6 +307,104 @@ class TestMemoryClear: assert "nothing to clear" in result.output +class TestTddWorkflowMetricsPilot: + """Full measure → analyse → orient loop for the tdd-workflow pilot agent.""" + + def _populate_memory(self, project: Path) -> None: + memory_dir = project / ".kaizen" / "agents" / "tdd-workflow" + memory_dir.mkdir(parents=True, exist_ok=True) + (memory_dir / "memory.md").write_text(_tdd_workflow_memory()) + + def test_full_metrics_loop_record_show_optimize_brief(self, project): + runner = CliRunner() + self._populate_memory(project) + + sessions = [ + { + "success": True, + "execution_time_s": 4200.0, + 
"quality_score": 0.92, + "primary_metric": { + "name": "test_pass_rate", + "value": 1.0, + "target": 1.0, + }, + "metadata": {"issue": "12", "phase": "PUBLISH"}, + }, + { + "success": False, + "execution_time_s": 5400.0, + "quality_score": 0.45, + "primary_metric": { + "name": "test_pass_rate", + "value": 0.78, + "target": 1.0, + }, + "metadata": {"issue": "15", "phase": "REFINE"}, + }, + ] + + for index, payload in enumerate(sessions, start=1): + result = runner.invoke( + cli, + [ + "metrics", + "record", + "tdd-workflow", + "--target", + str(project), + "--json", + "--idempotency-key", + f"session-{index}", + ], + input=json.dumps(payload), + ) + assert result.exit_code == 0, result.output + assert "Recorded metrics" in result.output + + show_result = runner.invoke( + cli, + ["metrics", "show", "tdd-workflow", "--target", str(project)], + ) + assert show_result.exit_code == 0 + assert "test_pass_rate" in show_result.output or "2 execution" in show_result.output.lower() + + store = MetricsStore(project, "tdd-workflow") + for i in range(MIN_SAMPLES_FOR_RECOMMENDATIONS - len(sessions)): + store.append( + { + "success": False, + "execution_time_s": 90.0 + i, + "quality_score": 0.35, + "primary_metric": { + "name": "test_pass_rate", + "value": 0.6, + "target": 1.0, + }, + }, + idempotency_key=f"seed-{i}", + ) + + optimize_result = runner.invoke( + cli, + ["metrics", "optimize", "tdd-workflow", "--target", str(project)], + ) + assert optimize_result.exit_code == 0, optimize_result.output + optimizer = OptimizerStore(project) + assert optimizer.analysis_path.exists() + assert optimizer.recommendations_path.exists() + + brief_result = runner.invoke( + cli, + ["memory", "brief", "tdd-workflow", "--target", str(project)], + ) + assert brief_result.exit_code == 0 + assert "## Performance Summary" in brief_result.output + assert "Success rate:" in brief_result.output + assert "issue 12" in brief_result.output or "TDD8" in brief_result.output + assert "Your Memory" in 
brief_result.output + + class TestProtocolsCommand: def test_protocols_list_finds_sys_medic(self): """Protocols list against the real agents dir should include sys-medic k3s protocol.""" diff --git a/wiki/AboutKaizenAgents.md b/wiki/AboutKaizenAgents.md index 96eb431..6159825 100644 --- a/wiki/AboutKaizenAgents.md +++ b/wiki/AboutKaizenAgents.md @@ -1,24 +1,76 @@ -AboutKaizenAgents +# About Kaizen Agents -*Basic concepts of Kaizen Agents* +Basic concepts of Kaizen Agents. -All Kaizen Agents follow the KaizenAgentTemplateDefinition +All Kaizen Agents follow the [KaizenAgentTemplate](KaizenAgentTemplate.md) definition. +That template provides a comprehensive structure for defining Kaizen Agent subagents. -This template provides a comprehensive structure for defining KaizenAgent subagents. +Key sections: -The key sections are: +- **Specification** — declarative outcomes rather than implementation steps +- **Idempotency design** — detect and handle already-completed work +- **Metrics** — measurable success criteria from day one +- **Testing** — scenarios that feed the optimization loop +- **Evolution tracking** — improvement history and performance trends -Specification: Focuses on declarative outcomes rather than implementation steps, making agents more maintainable and testable. +The template enforces separation of concerns, testability, and measurability while +keeping agent definitions consistent across the fleet. -Idempotency Design: Forces you to think upfront about how the agent will detect and handle already-completed work. +--- -Metrics: Ensures every agent has measurable success criteria from day one. +## Metrics-enabled pilot: `tdd-workflow` -Testing: Built-in test scenarios that can be automated as part of the optimization loop. +`tdd-workflow` is the reference implementation for project-scoped metrics (WP-0003). +Use it as a template when adding metrics to other agents. 
-Evolution Tracking: Maintains a history of improvements and provides hooks for the KaizenAgent to analyze performance trends. +### What is measured -The template enforces our design principles - separation of concerns, testability, and measurability - while providing enough structure to ensure consistency across different coding subagents. +| Metric | Role | How | +|--------|------|-----| +| `test_pass_rate` | Primary | Passing tests ÷ total tests at PUBLISH (target: 1.0) | +| `cycle_time_s` | Secondary | Session duration (`execution_time_s` in ADR-004) | +Definitions live in the agent frontmatter (`agents/agent-tdd-workflow.md`). -xxx +### Where data lives + +``` +<project>/.kaizen/metrics/tdd-workflow/ + executions.jsonl # append-only per-session records + summary.json # rolling aggregates (auto-generated) +``` + +Scaffolded by `kaizen-agentic memory init tdd-workflow` alongside +`.kaizen/agents/tdd-workflow/memory.md`. + +### Session-close loop + +At the end of each TDD8 session: + +1. Update qualitative memory (`## Session Log`, findings, watch points). +2. Record quantitative outcome: + +```bash +kaizen-agentic metrics record tdd-workflow --success --time <seconds> --quality <0.0-1.0> +``` + +Or pass a full ADR-004 record with `primary_metric` via `--json` (see agent spec). + +### Analysis and orientation + +| Command | Purpose | +|---------|---------| +| `kaizen-agentic metrics show tdd-workflow` | Summary + recent executions | +| `kaizen-agentic metrics optimize tdd-workflow` | Evidence-based recommendations (≥10 records) | +| `kaizen-agentic memory brief tdd-workflow` | Qualitative memory + `## Performance Summary` | + +Fleet-level session analytics remain in **agentic-resources** (Helix Forge); project +metrics stay in `.kaizen/metrics/` per [ADR-004](../docs/adr/ADR-004-project-metrics-convention.md) +and [EcosystemIntegration](EcosystemIntegration.md). + +### Adopting metrics on another agent + +1. Add a `metrics:` block to frontmatter (primary + secondary + collection). 
+2. Copy the session-close `metrics record` step from `agent-tdd-workflow.md`. +3. Run `kaizen-agentic memory init <agent-name>` to scaffold storage. +4. Verify with `metrics show` after one session. \ No newline at end of file diff --git a/workplans/kaizen-agentic-WP-0003-measurement-loop.md b/workplans/kaizen-agentic-WP-0003-measurement-loop.md index ec02597..3b42eac 100644 --- a/workplans/kaizen-agentic-WP-0003-measurement-loop.md +++ b/workplans/kaizen-agentic-WP-0003-measurement-loop.md @@ -9,7 +9,7 @@ owner: kaizen-agentic topic_slug: custodian state_hub_workstream_id: 36252a45-f360-4496-bf77-17b5dfb02767 created: "2026-06-16" -updated: "2026-06-17" +updated: "2026-06-18" --- # KAIZEN-WP-0003 — Measurement Loop: Metrics Convention, Collection, and Optimizer Integration @@ -179,10 +179,10 @@ Prove the loop end-to-end on one agent before fleet-wide rollout. ### Tasks -- [ ] T17 — Add `metrics` section to `agent-tdd-workflow.md` frontmatter (primary: test-pass rate; secondary: cycle time) -- [ ] T18 — Add session-close step: invoke `kaizen-agentic metrics record tdd-workflow` with session outcome -- [ ] T19 — Document pilot in `wiki/AboutKaizenAgents.md` as reference implementation -- [ ] T20 — E2e test: two simulated tdd-workflow sessions → metrics accumulate → optimize produces recommendation +- [x] T17 — Add `metrics` section to `agent-tdd-workflow.md` frontmatter (primary: test-pass rate; secondary: cycle time) +- [x] T18 — Add session-close step: invoke `kaizen-agentic metrics record tdd-workflow` with session outcome +- [x] T19 — Document pilot in `wiki/AboutKaizenAgents.md` as reference implementation +- [x] T20 — E2e test: two simulated tdd-workflow sessions → metrics accumulate → optimize produces recommendation ### Definition of done