WP-0003 Part 5: tdd-workflow metrics pilot
Add metrics frontmatter and session-close recording to tdd-workflow, document the reference implementation in wiki/AboutKaizenAgents.md, and add an e2e test covering record → show → optimize → brief.
This commit is contained in:
@@ -8,8 +8,10 @@ Tests the full workflow:
|
||||
4. memory brief — verify orientation brief includes own memory and cross-agent context
|
||||
5. protocols list / show — verify protocol discovery works
|
||||
6. memory clear — verify wipe works
|
||||
7. tdd-workflow pilot — record → show → optimize → brief (WP-0003 Part 5)
|
||||
"""
|
||||
|
||||
import json
|
||||
import textwrap
|
||||
from pathlib import Path
|
||||
|
||||
@@ -17,6 +19,8 @@ import pytest
|
||||
from click.testing import CliRunner
|
||||
|
||||
from kaizen_agentic.cli import cli
|
||||
from kaizen_agentic.metrics import MetricsStore, OptimizerStore
|
||||
from kaizen_agentic.optimization import MIN_SAMPLES_FOR_RECOMMENDATIONS
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -67,6 +71,34 @@ def _sys_medic_memory() -> str:
|
||||
""")
|
||||
|
||||
|
||||
def _tdd_workflow_memory() -> str:
|
||||
"""Realistic tdd-workflow memory after two issue cycles."""
|
||||
return textwrap.dedent("""\
|
||||
---
|
||||
agent: tdd-workflow
|
||||
project: demo-app
|
||||
last_updated: 2026-06-16
|
||||
session_count: 2
|
||||
---
|
||||
|
||||
## Project Context
|
||||
Python service using TDD8 with Gitea issues and pytest.
|
||||
|
||||
## Accumulated Findings
|
||||
- Sidequests from REFINE often block PUBLISH when lint debt accumulates
|
||||
|
||||
## What Worked
|
||||
- `make tdd-start NUM=X` before writing tests keeps RED phase focused
|
||||
|
||||
## Watch Points
|
||||
- Flaky integration tests under parallel pytest (-n auto)
|
||||
|
||||
## Session Log
|
||||
2026-06-10 · issue 12 metrics store · PUBLISH complete · success
|
||||
2026-06-16 · issue 15 CLI flags · stalled at REFINE · partial
|
||||
""")
|
||||
|
||||
|
||||
def _project_management_memory() -> str:
|
||||
"""Minimal project-management agent memory."""
|
||||
return textwrap.dedent("""\
|
||||
@@ -275,6 +307,104 @@ class TestMemoryClear:
|
||||
assert "nothing to clear" in result.output
|
||||
|
||||
|
||||
class TestTddWorkflowMetricsPilot:
    """Full measure → analyse → orient loop for the tdd-workflow pilot agent."""

    def _populate_memory(self, project: Path) -> None:
        """Write the sample tdd-workflow memory file under ``project``.

        The file is created at ``.kaizen/agents/tdd-workflow/memory.md``;
        missing parent directories are created as needed.
        """
        memory_dir = project / ".kaizen" / "agents" / "tdd-workflow"
        memory_dir.mkdir(parents=True, exist_ok=True)
        # The sample memory contains non-ASCII characters, so pin the encoding
        # instead of depending on the platform's locale default.
        (memory_dir / "memory.md").write_text(
            _tdd_workflow_memory(), encoding="utf-8"
        )

    def test_full_metrics_loop_record_show_optimize_brief(self, project):
        """Drive record → show → optimize → brief through the CLI.

        ``project`` is a fixture defined outside this class; it is used here
        as the target directory passed to every command via ``--target``.
        """
        runner = CliRunner()
        self._populate_memory(project)

        # One passing and one failing session, mirroring the two entries in
        # the sample memory's session log (issues 12 and 15).
        sessions = [
            {
                "success": True,
                "execution_time_s": 4200.0,
                "quality_score": 0.92,
                "primary_metric": {
                    "name": "test_pass_rate",
                    "value": 1.0,
                    "target": 1.0,
                },
                "metadata": {"issue": "12", "phase": "PUBLISH"},
            },
            {
                "success": False,
                "execution_time_s": 5400.0,
                "quality_score": 0.45,
                "primary_metric": {
                    "name": "test_pass_rate",
                    "value": 0.78,
                    "target": 1.0,
                },
                "metadata": {"issue": "15", "phase": "REFINE"},
            },
        ]

        # Record: each payload is fed to the CLI as JSON on stdin with its own
        # idempotency key.
        for index, payload in enumerate(sessions, start=1):
            result = runner.invoke(
                cli,
                [
                    "metrics",
                    "record",
                    "tdd-workflow",
                    "--target",
                    str(project),
                    "--json",
                    "--idempotency-key",
                    f"session-{index}",
                ],
                input=json.dumps(payload),
            )
            assert result.exit_code == 0, result.output
            assert "Recorded metrics" in result.output

        # Show: the summary must mention the primary metric or the run count.
        show_result = runner.invoke(
            cli,
            ["metrics", "show", "tdd-workflow", "--target", str(project)],
        )
        assert show_result.exit_code == 0, show_result.output
        assert (
            "test_pass_rate" in show_result.output
            or "2 execution" in show_result.output.lower()
        )

        # Seed failing records directly through the store to top the count up
        # by the gap to MIN_SAMPLES_FOR_RECOMMENDATIONS — presumably the
        # minimum the optimizer needs before it emits recommendations; confirm
        # against kaizen_agentic.optimization.
        store = MetricsStore(project, "tdd-workflow")
        for i in range(MIN_SAMPLES_FOR_RECOMMENDATIONS - len(sessions)):
            store.append(
                {
                    "success": False,
                    "execution_time_s": 90.0 + i,
                    "quality_score": 0.35,
                    "primary_metric": {
                        "name": "test_pass_rate",
                        "value": 0.6,
                        "target": 1.0,
                    },
                },
                idempotency_key=f"seed-{i}",
            )

        # Optimize: the command must succeed and leave both optimizer files
        # on disk.
        optimize_result = runner.invoke(
            cli,
            ["metrics", "optimize", "tdd-workflow", "--target", str(project)],
        )
        assert optimize_result.exit_code == 0, optimize_result.output
        optimizer = OptimizerStore(project)
        assert optimizer.analysis_path.exists()
        assert optimizer.recommendations_path.exists()

        # Brief: the orientation brief must contain the performance summary
        # and content taken from the agent's own memory file.
        brief_result = runner.invoke(
            cli,
            ["memory", "brief", "tdd-workflow", "--target", str(project)],
        )
        assert brief_result.exit_code == 0, brief_result.output
        assert "## Performance Summary" in brief_result.output
        assert "Success rate:" in brief_result.output
        assert "issue 12" in brief_result.output or "TDD8" in brief_result.output
        assert "Your Memory" in brief_result.output
|
||||
|
||||
|
||||
class TestProtocolsCommand:
|
||||
def test_protocols_list_finds_sys_medic(self):
|
||||
"""Protocols list against the real agents dir should include sys-medic k3s protocol."""
|
||||
|
||||
Reference in New Issue
Block a user