WP-0003 Part 5: tdd-workflow metrics pilot

Add metrics frontmatter and session-close recording to tdd-workflow, document the reference implementation in wiki/AboutKaizenAgents.md, and add an e2e test covering record → show → optimize → brief.
2026-06-16 01:48:43 +02:00
parent 04fdc249f5
commit fd2edfbe6c
4 changed files with 231 additions and 17 deletions
--- a/tests/test_e2e_agency_framework.py
+++ b/tests/test_e2e_agency_framework.py
@@ -8,8 +8,10 @@ Tests the full workflow:
  4. memory brief — verify orientation brief includes own memory and cross-agent context
  5. protocols list / show — verify protocol discovery works
  6. memory clear — verify wipe works
+  7. tdd-workflow pilot — record → show → optimize → brief (WP-0003 Part 5)
 """

+import json
 import textwrap
 from pathlib import Path

@@ -17,6 +19,8 @@ import pytest
 from click.testing import CliRunner

 from kaizen_agentic.cli import cli
+from kaizen_agentic.metrics import MetricsStore, OptimizerStore
+from kaizen_agentic.optimization import MIN_SAMPLES_FOR_RECOMMENDATIONS


 # ---------------------------------------------------------------------------
@@ -67,6 +71,34 @@ def _sys_medic_memory() -> str:
    """)


+def _tdd_workflow_memory() -> str:
+    """Sample tdd-workflow memory file: front matter plus notes from two logged sessions."""
+    return textwrap.dedent("""\
+        ---
+        agent: tdd-workflow
+        project: demo-app
+        last_updated: 2026-06-16
+        session_count: 2
+        ---
+
+        ## Project Context
+        Python service using TDD8 with Gitea issues and pytest.
+
+        ## Accumulated Findings
+        - Sidequests from REFINE often block PUBLISH when lint debt accumulates
+
+        ## What Worked
+        - `make tdd-start NUM=X` before writing tests keeps RED phase focused
+
+        ## Watch Points
+        - Flaky integration tests under parallel pytest (-n auto)
+
+        ## Session Log
+        2026-06-10 · issue 12 metrics store · PUBLISH complete · success
+        2026-06-16 · issue 15 CLI flags · stalled at REFINE · partial
+    """)
+
+
 def _project_management_memory() -> str:
    """Minimal project-management agent memory."""
    return textwrap.dedent("""\
@@ -275,6 +307,104 @@ class TestMemoryClear:
        assert "nothing to clear" in result.output


+class TestTddWorkflowMetricsPilot:
+    """Full measure → analyse → orient loop for the tdd-workflow pilot agent."""
+
+    def _populate_memory(self, project: Path) -> None:
+        memory_dir = project / ".kaizen" / "agents" / "tdd-workflow"
+        memory_dir.mkdir(parents=True, exist_ok=True)
+        (memory_dir / "memory.md").write_text(_tdd_workflow_memory(), encoding="utf-8")

+    def test_full_metrics_loop_record_show_optimize_brief(self, project):
+        runner = CliRunner()
+        self._populate_memory(project)
+
+        sessions = [
+            {
+                "success": True,
+                "execution_time_s": 4200.0,
+                "quality_score": 0.92,
+                "primary_metric": {
+                    "name": "test_pass_rate",
+                    "value": 1.0,
+                    "target": 1.0,
+                },
+                "metadata": {"issue": "12", "phase": "PUBLISH"},
+            },
+            {
+                "success": False,
+                "execution_time_s": 5400.0,
+                "quality_score": 0.45,
+                "primary_metric": {
+                    "name": "test_pass_rate",
+                    "value": 0.78,
+                    "target": 1.0,
+                },
+                "metadata": {"issue": "15", "phase": "REFINE"},
+            },
+        ]
+
+        for index, payload in enumerate(sessions, start=1):
+            result = runner.invoke(
+                cli,
+                [
+                    "metrics",
+                    "record",
+                    "tdd-workflow",
+                    "--target",
+                    str(project),
+                    "--json",
+                    "--idempotency-key",
+                    f"session-{index}",
+                ],
+                input=json.dumps(payload),
+            )
+            assert result.exit_code == 0, result.output
+            assert "Recorded metrics" in result.output
+
+        show_result = runner.invoke(
+            cli,
+            ["metrics", "show", "tdd-workflow", "--target", str(project)],
+        )
+        assert show_result.exit_code == 0, show_result.output
+        assert "test_pass_rate" in show_result.output or "2 execution" in show_result.output.lower()
+
+        store = MetricsStore(project, "tdd-workflow")
+        for i in range(MIN_SAMPLES_FOR_RECOMMENDATIONS - len(sessions)):
+            store.append(
+                {
+                    "success": False,
+                    "execution_time_s": 90.0 + i,
+                    "quality_score": 0.35,
+                    "primary_metric": {
+                        "name": "test_pass_rate",
+                        "value": 0.6,
+                        "target": 1.0,
+                    },
+                },
+                idempotency_key=f"seed-{i}",
+            )
+
+        optimize_result = runner.invoke(
+            cli,
+            ["metrics", "optimize", "tdd-workflow", "--target", str(project)],
+        )
+        assert optimize_result.exit_code == 0, optimize_result.output
+        optimizer = OptimizerStore(project)
+        assert optimizer.analysis_path.exists()
+        assert optimizer.recommendations_path.exists()
+
+        brief_result = runner.invoke(
+            cli,
+            ["memory", "brief", "tdd-workflow", "--target", str(project)],
+        )
+        assert brief_result.exit_code == 0, brief_result.output
+        assert "## Performance Summary" in brief_result.output
+        assert "Success rate:" in brief_result.output
+        assert "issue 12" in brief_result.output or "TDD8" in brief_result.output
+        assert "Your Memory" in brief_result.output
+
+
 class TestProtocolsCommand:
    def test_protocols_list_finds_sys_medic(self):
        """Protocols list against the real agents dir should include sys-medic k3s protocol."""