"""
End-to-end tests for the agency framework: memory lifecycle and coach orientation.

Tests the full workflow:
  1. memory init — scaffold a memory file in a test project
  2. Populate memory with realistic content (simulating sessions)
  3. memory show — verify content is readable
  4. memory brief — verify orientation brief includes own memory and cross-agent context
  5. protocols list / show — verify protocol discovery works
  6. memory clear — verify wipe works
  7. tdd-workflow pilot — record → show → optimize → brief (WP-0003 Part 5)
"""

import json
import textwrap
from pathlib import Path

import pytest
from click.testing import CliRunner

from kaizen_agentic.cli import cli
from kaizen_agentic.metrics import MetricsStore, OptimizerStore
from kaizen_agentic.optimization import MIN_SAMPLES_FOR_RECOMMENDATIONS


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def _sys_medic_memory() -> str:
    """Realistic sys-medic memory after two simulated sessions."""
    return textwrap.dedent("""\
        ---
        agent: sys-medic
        project: test-cluster
        last_updated: 2026-03-18
        session_count: 2
        ---

        ## Project Context
        k3s single-node cluster on an ARM64 host (tegpi-01). No external load balancer.
        Traefik ingress. Longhorn storage.

        ## Accumulated Findings
        - kubelet log rotation was disabled; logs grew to 2.1 GB
        - containerd image GC threshold was set too high (98%)

        ## What Worked
        - `journalctl --vacuum-size=500M` recovered ~1.8 GB without restart
        - Lowering GC threshold to 80% in containerd config resolved disk pressure

        ## Watch Points
        - inotify watch limit hits ceiling under heavy Longhorn load
        - node has only 4 GB RAM; memory pressure risk during backup windows

        ## Open Threads
        - Check whether kube-system namespace daemonsets have resource limits set

        ## Node Profiles
        tegpi-01 | load avg ~0.6 at idle | inotify-limited under load | 2026-03-18

        ## Recurring Findings
        - kubelet log growth · first seen 2026-03-10 · 2 occurrences

        ## Cleared Issues
        - containerd GC disk pressure · adjusted config 2026-03-18 · resolved

        ## Session Log
        2026-03-10 · tegpi-01 initial assessment · found log bloat + GC issue · recommendations documented
        2026-03-18 · tegpi-01 follow-up · verified GC fix; inotify limit noted · watch
    """)


def _tdd_workflow_memory() -> str:
    """Realistic tdd-workflow memory after two issue cycles."""
    return textwrap.dedent("""\
        ---
        agent: tdd-workflow
        project: demo-app
        last_updated: 2026-06-16
        session_count: 2
        ---

        ## Project Context
        Python service using TDD8 with Gitea issues and pytest.

        ## Accumulated Findings
        - Sidequests from REFINE often block PUBLISH when lint debt accumulates

        ## What Worked
        - `make tdd-start NUM=X` before writing tests keeps RED phase focused

        ## Watch Points
        - Flaky integration tests under parallel pytest (-n auto)

        ## Session Log
        2026-06-10 · issue 12 metrics store · PUBLISH complete · success
        2026-06-16 · issue 15 CLI flags · stalled at REFINE · partial
    """)


def _project_management_memory() -> str:
    """Minimal project-management agent memory."""
    return textwrap.dedent("""\
        ---
        agent: project-management
        project: test-cluster
        last_updated: 2026-03-15
        session_count: 1
        ---

        ## Project Context
        Operational runbook project for the k3s home cluster.

        ## Accumulated Findings
        - Infra tasks are better tracked in Gitea issues than in TODO files

        ## Session Log
        2026-03-15 · initial planning session · task structure agreed
    """)


def _memory_path(project: Path, agent: str) -> Path:
    """Return the memory file location these tests use for *agent* under *project*."""
    return project / ".kaizen" / "agents" / agent / "memory.md"


def _write_memory(project: Path, agent: str, content: str) -> Path:
    """Write *content* as *agent*'s memory file, creating parent directories.

    Returns the path that was written.
    """
    path = _memory_path(project, agent)
    path.parent.mkdir(parents=True, exist_ok=True)
    # The fixtures contain non-ASCII characters ("·"), so pin the encoding
    # instead of inheriting the host locale.
    # NOTE(review): assumes the CLI reads memory files as UTF-8 — confirm.
    path.write_text(content, encoding="utf-8")
    return path


def _invoke(*args: str, stdin=None):
    """Run the CLI with *args* and require a zero exit code.

    *stdin* is forwarded as the ``input`` of ``CliRunner.invoke``. The CLI
    output is used as the assertion message so a failing command is visible
    in the test report. Returns the click ``Result``.
    """
    result = CliRunner().invoke(cli, list(args), input=stdin)
    assert result.exit_code == 0, result.output
    return result


# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------


@pytest.fixture
def project(tmp_path):
    """A temporary 'project' directory with a name."""
    p = tmp_path / "test-cluster"
    p.mkdir()
    return p


# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------


class TestMemoryInit:
    """`memory init` scaffolds a memory file for an agent."""

    def test_init_creates_file(self, project):
        result = _invoke("memory", "init", "sys-medic", "--target", str(project))

        assert "Initialized memory" in result.output
        assert _memory_path(project, "sys-medic").exists()

    def test_init_file_content_has_required_sections(self, project):
        # Checked so a failed init is reported as such, not as a missing file.
        _invoke("memory", "init", "sys-medic", "--target", str(project))

        # NOTE(review): assumes the CLI writes memory files as UTF-8 — confirm.
        content = _memory_path(project, "sys-medic").read_text(encoding="utf-8")

        assert "agent: sys-medic" in content
        assert "project: test-cluster" in content
        assert "session_count: 0" in content
        assert "## Project Context" in content
        assert "## Accumulated Findings" in content
        assert "## What Worked" in content
        assert "## Watch Points" in content
        assert "## Open Threads" in content
        assert "## Session Log" in content

    def test_init_idempotent(self, project):
        _invoke("memory", "init", "sys-medic", "--target", str(project))

        result = _invoke("memory", "init", "sys-medic", "--target", str(project))

        assert "already exists" in result.output


class TestMemoryShow:
    """`memory show` prints an agent's memory, or guidance when none exists."""

    def test_show_returns_content(self, project):
        _write_memory(project, "sys-medic", _sys_medic_memory())

        result = _invoke("memory", "show", "sys-medic", "--target", str(project))

        assert "Node Profiles" in result.output
        assert "tegpi-01" in result.output

    def test_show_missing_prints_guidance(self, project):
        result = _invoke("memory", "show", "sys-medic", "--target", str(project))

        assert "No memory found" in result.output
        assert "memory init" in result.output


class TestMemoryBrief:
    """`memory brief` builds an orientation brief from agent memories."""

    def _populate(self, project):
        """Write both agent memories into the project."""
        _write_memory(project, "sys-medic", _sys_medic_memory())
        _write_memory(project, "project-management", _project_management_memory())

    def test_brief_includes_own_memory(self, project):
        self._populate(project)

        result = _invoke("memory", "brief", "sys-medic", "--target", str(project))

        assert "Orientation Brief for: sys-medic" in result.output
        assert "Your Memory" in result.output
        assert "tegpi-01" in result.output  # content from sys-medic memory

    def test_brief_includes_cross_agent_context(self, project):
        self._populate(project)

        result = _invoke("memory", "brief", "sys-medic", "--target", str(project))

        assert "Context From Other Agents" in result.output
        assert "project-management" in result.output

    def test_brief_coach_tip_present(self, project):
        self._populate(project)

        result = _invoke("memory", "brief", "sys-medic", "--target", str(project))

        assert "agent-coach" in result.output

    def test_brief_no_memory_gives_guidance(self, project):
        result = _invoke("memory", "brief", "sys-medic", "--target", str(project))

        assert "No agent memory files found" in result.output

    def test_brief_raw_flag_skips_header(self, project):
        self._populate(project)

        result = _invoke(
            "memory", "brief", "sys-medic", "--target", str(project), "--raw"
        )

        assert "=== sys-medic ===" in result.output
        # Raw mode should not include the orientation header
        assert "Orientation Brief for:" not in result.output

    def test_brief_includes_performance_summary_with_memory_and_metrics(self, project):
        self._populate(project)

        # Record one successful run per agent; each record must succeed,
        # otherwise the brief assertions below would fail for the wrong reason.
        for agent, seconds, quality in (
            ("sys-medic", "30", "0.88"),
            ("project-management", "15", "0.95"),
        ):
            _invoke(
                "metrics",
                "record",
                agent,
                "--target",
                str(project),
                "--success",
                "--time",
                seconds,
                "--quality",
                quality,
            )

        result = _invoke("memory", "brief", "sys-medic", "--target", str(project))

        assert "## Performance Summary" in result.output
        assert "Success rate:" in result.output
        assert "tegpi-01" in result.output
        assert "Context From Other Agents" in result.output
        assert "project-management" in result.output


class TestMemoryClear:
    """`memory clear` removes an agent's memory after confirmation."""

    def test_clear_removes_file(self, project):
        memory_file = _write_memory(project, "sys-medic", _sys_medic_memory())

        # "y\n" answers the confirmation prompt.
        _invoke("memory", "clear", "sys-medic", "--target", str(project), stdin="y\n")

        assert not memory_file.exists()

    def test_clear_missing_is_graceful(self, project):
        result = _invoke(
            "memory", "clear", "sys-medic", "--target", str(project), stdin="y\n"
        )

        assert "nothing to clear" in result.output


class TestTddWorkflowMetricsPilot:
    """Full measure → analyse → orient loop for the tdd-workflow pilot agent."""

    def _populate_memory(self, project: Path) -> None:
        """Write the tdd-workflow memory fixture into the project."""
        _write_memory(project, "tdd-workflow", _tdd_workflow_memory())

    def test_full_metrics_loop_record_show_optimize_brief(self, project):
        self._populate_memory(project)

        sessions = [
            {
                "success": True,
                "execution_time_s": 4200.0,
                "quality_score": 0.92,
                "primary_metric": {
                    "name": "test_pass_rate",
                    "value": 1.0,
                    "target": 1.0,
                },
                "metadata": {"issue": "12", "phase": "PUBLISH"},
            },
            {
                "success": False,
                "execution_time_s": 5400.0,
                "quality_score": 0.45,
                "primary_metric": {
                    "name": "test_pass_rate",
                    "value": 0.78,
                    "target": 1.0,
                },
                "metadata": {"issue": "15", "phase": "REFINE"},
            },
        ]

        # Record: each session is fed to the CLI as a JSON payload on stdin.
        for index, payload in enumerate(sessions, start=1):
            result = _invoke(
                "metrics",
                "record",
                "tdd-workflow",
                "--target",
                str(project),
                "--json",
                "--idempotency-key",
                f"session-{index}",
                stdin=json.dumps(payload),
            )
            assert "Recorded metrics" in result.output

        # Show
        show_result = _invoke(
            "metrics", "show", "tdd-workflow", "--target", str(project)
        )
        assert (
            "test_pass_rate" in show_result.output
            or "2 execution" in show_result.output.lower()
        )

        # Top the store up to MIN_SAMPLES_FOR_RECOMMENDATIONS entries by
        # appending directly, before running the optimizer.
        store = MetricsStore(project, "tdd-workflow")
        for i in range(MIN_SAMPLES_FOR_RECOMMENDATIONS - len(sessions)):
            store.append(
                {
                    "success": False,
                    "execution_time_s": 90.0 + i,
                    "quality_score": 0.35,
                    "primary_metric": {
                        "name": "test_pass_rate",
                        "value": 0.6,
                        "target": 1.0,
                    },
                },
                idempotency_key=f"seed-{i}",
            )

        # Optimize
        _invoke("metrics", "optimize", "tdd-workflow", "--target", str(project))

        optimizer = OptimizerStore(project)
        assert optimizer.analysis_path.exists()
        assert optimizer.recommendations_path.exists()

        # Brief
        brief_result = _invoke(
            "memory", "brief", "tdd-workflow", "--target", str(project)
        )
        assert "## Performance Summary" in brief_result.output
        assert "Success rate:" in brief_result.output
        assert "issue 12" in brief_result.output or "TDD8" in brief_result.output
        assert "Your Memory" in brief_result.output


class TestProtocolsCommand:
    """`protocols list` / `protocols show` discover and print agent protocols."""

    def test_protocols_list_finds_sys_medic(self):
        """Protocols list against the real agents dir should include sys-medic k3s protocol."""
        result = _invoke("protocols", "list")

        assert "sys-medic" in result.output
        assert "k3s-node-health-assessment" in result.output

    def test_protocols_list_filtered_by_agent(self):
        result = _invoke("protocols", "list", "sys-medic")

        assert "k3s" in result.output.lower()

    def test_protocols_show_outputs_content(self):
        result = _invoke("protocols", "show", "sys-medic", "k3s-node-health-assessment")

        # Protocol should contain key structural sections
        assert "k3s" in result.output.lower()
        assert "Prerequisites" in result.output or "Scope" in result.output

    def test_protocols_list_unknown_agent_no_crash(self):
        result = _invoke("protocols", "list", "nonexistent-agent")

        assert "No protocols found" in result.output