diff --git a/agents/agent-sys-medic.md b/agents/agent-sys-medic.md index c840f9f..56e76f7 100644 --- a/agents/agent-sys-medic.md +++ b/agents/agent-sys-medic.md @@ -2,9 +2,31 @@ name: sys-medic description: Linux/Kubernetes node health assessment agent — diagnoses process, memory, CPU, disk, network, and kubelet issues with safe, prioritized, evidence-driven guidance category: infrastructure +memory: enabled source: sys-medic (~/sys-medic/agent-sys-medic.md) --- +# Session Start Protocol + +1. Check for `.kaizen/agents/sys-medic/memory.md` in the project root. +2. If present, read it — pay particular attention to `## Node Profiles` (known baselines + per host) and `## Recurring Findings` (issues seen before on this infrastructure). +3. Acknowledge memory in your opening brief: note any relevant node profiles or prior findings. +4. If a structured assessment is requested, check for + `agents/protocols/sys-medic/k3s-node-health-assessment.md` and use it as your procedure. + +# Session Close Protocol + +1. Update `## Node Profiles` — add or revise the entry for any host assessed this session + (hostname | typical load | known quirks | last assessment date). +2. Update `## Recurring Findings` — if an issue was seen previously, increment its frequency + and note the date. +3. Update `## Accumulated Findings`, `## What Worked`, `## Watch Points` as appropriate. +4. Append one line to `## Session Log`: `YYYY-MM-DD · · · `. +5. Bump `last_updated` and `session_count`. + +--- + You are SysMedic, a careful coding and systems operations agent for Linux-based Kubernetes environments. Your role is to assess operational health, identify signs of instability, and provide safe, practical guidance to improve system condition. You are not a blind automation bot. You are an evidence-driven operational analyst and remediation advisor. 
@@ -306,4 +328,30 @@ When invoked, begin by determining the current operational picture and producing - signs of instability - safe guidance for stabilization +If a structured assessment is requested, use the k3s-node-health-assessment protocol +(`agents/protocols/sys-medic/k3s-node-health-assessment.md`) if available. The protocol +provides a step-by-step procedure covering OS baseline, process hygiene, memory, CPU, +disk, network, Kubernetes node state, and k3s runtime health. + If insufficient evidence is available, state exactly which safe inspection commands should be run next. + +--- + +# Memory Template Extensions + +sys-medic's memory file (`.kaizen/agents/sys-medic/memory.md`) extends the base template +(ADR-002) with three additional sections: + +```markdown +## Node Profiles + + + +## Recurring Findings + + +## Cleared Issues + +``` + +These sections are maintained by the session-close protocol above. diff --git a/agents/protocols/README.md b/agents/protocols/README.md new file mode 100644 index 0000000..7f94054 --- /dev/null +++ b/agents/protocols/README.md @@ -0,0 +1,40 @@ +# Agent Protocols + +This directory contains **protocol runbooks** — structured, human-readable procedural documents that kaizen-agentic agents reference during structured assessments or remediation work. + +Protocols are distinct from agent prompts: +- **Agent prompts** (`agents/agent-*.md`) shape AI behaviour +- **Protocols** (`agents/protocols/<agent-name>/<slug>.md`) are procedural checklists for humans and agents to execute + +See [ADR-003](../../docs/adr/ADR-003-protocols-artifact-convention.md) for the full convention. 
+ +## Structure + +``` +agents/protocols/ + <agent-name>/ + <slug>.md ← one file per protocol +``` + +## Available Protocols + +| Agent | Protocol | Description | +|-------|----------|-------------| +| sys-medic | [k3s-node-health-assessment](sys-medic/k3s-node-health-assessment.md) | Structured k3s node health check covering kubelet, pods, resources, networking, and storage | + +## Usage + +**From the CLI:** + +```bash +kaizen-agentic protocols list # List all protocols +kaizen-agentic protocols list sys-medic # List sys-medic protocols +kaizen-agentic protocols show sys-medic k3s-node-health-assessment +``` + +**From an agent session:** + +When an agent references a protocol, it will say something like: +> *"Use the k3s-node-health-assessment protocol at `agents/protocols/sys-medic/k3s-node-health-assessment.md` for this assessment."* + +Protocols can also be read and executed directly without an AI agent. diff --git a/agents/protocols/sys-medic/k3s-node-health-assessment.md b/agents/protocols/sys-medic/k3s-node-health-assessment.md new file mode 100644 index 0000000..339ab97 --- /dev/null +++ b/agents/protocols/sys-medic/k3s-node-health-assessment.md @@ -0,0 +1,306 @@ +--- +agent: sys-medic +slug: k3s-node-health-assessment +title: k3s Node Health Assessment +version: 1.0.0 +last_updated: "2026-03-18" +--- + +# k3s Node Health Assessment + +## Purpose + +Structured health assessment for a Linux host running k3s (lightweight Kubernetes). Covers OS baseline, process hygiene, memory, CPU, disk, network, Kubernetes node state, and runtime services. Produces a prioritized findings report with safe next actions. 
+ +## Scope + +- Linux host (any distribution) running k3s +- k3s worker nodes and single-node clusters +- Hosts where `kubectl` and/or `k3s kubectl` are available +- Applies whether the host is healthy, degraded, or in an unknown state + +## Prerequisites + +- Shell access to the target host (SSH or console) +- Ideally: sudo or root access (some checks require it) +- Available tools: `ps`, `top`, `free`, `vmstat`, `iostat`, `ss`, `journalctl`, `systemctl`, `dmesg`, `df`, `du`, `lsof`, `kubectl` or `k3s kubectl` +- Note which tools are absent — record what could not be checked + +--- + +## Procedure + +### Step 1 — OS and Node Baseline + +Establish context before diagnosing anything. + +```bash +hostname +uptime +uname -r +nproc +free -h +swapon --show +df -h +date +``` + +Record: +- Hostname and uptime +- Kernel version +- CPU core count +- Total/used/free memory and swap +- Overall disk usage per mount +- Current time (for correlating log timestamps) + +--- + +### Step 2 — Process Hygiene + +```bash +# Zombie and D-state processes +ps aux | awk '$8 ~ /^[ZD]/ {print}' + +# Top memory consumers +ps aux --sort=-%mem | head -20 + +# Top CPU consumers +ps aux --sort=-%cpu | head -20 + +# Processes with high FD counts (requires lsof) +sudo lsof 2>/dev/null | awk '{print $2}' | sort | uniq -c | sort -rn | head -20 + +# Long-running suspicious processes (> 7 days) +ps -eo pid,user,etime,comm --sort=-etime | head -30 +``` + +Look for: +- Zombie count > 0 +- D-state (uninterruptible sleep) tasks +- Unexpected high-memory or high-CPU processes +- Stale maintenance scripts, port-forwards, debug sessions, rsync, or backup jobs +- Orphaned shells or user sessions + +--- + +### Step 3 — Memory Health + +```bash +# Overall memory picture +free -h +cat /proc/meminfo | grep -E 'MemAvailable|SwapFree|Dirty|Slab|KReclaimable' + +# OOM kill history +sudo dmesg | grep -i 'oom\|killed process' | tail -20 +sudo journalctl -k --since "24 hours ago" | grep -i 'oom\|out of memory' | tail 
-20 + +# Slab usage +sudo slabtop -o | head -30 + +# cgroup memory pressure (if cgroups v2) +find /sys/fs/cgroup -name "memory.pressure" 2>/dev/null | xargs grep -l "some" 2>/dev/null | head -10 +``` + +Look for: +- Available memory < 10% of total +- Swap being actively used (churn is worse than swap in use) +- Recent OOM kills +- High slab growth +- cgroup memory pressure events + +--- + +### Step 4 — CPU and Scheduler Health + +```bash +# Load average vs core count +uptime +nproc + +# CPU idle and steal +top -bn1 | grep '%Cpu' +vmstat 1 5 + +# Run queue pressure +vmstat 1 5 | awk '{print $1, $2}' # r=running, b=blocked +``` + +Look for: +- Load average persistently > core count +- CPU idle < 10% +- High CPU steal (virtualised hosts) +- Run queue (r) > core count sustained +- Blocked processes (b) > 0 sustained + +--- + +### Step 5 — Disk and Filesystem Health + +```bash +# Disk usage +df -h +df -i # inode usage + +# Large log files +sudo du -sh /var/log/* 2>/dev/null | sort -rh | head -20 +sudo journalctl --disk-usage + +# k3s data directory +sudo du -sh /var/lib/rancher/k3s/ 2>/dev/null +sudo du -sh /var/lib/rancher/k3s/agent/containerd/ 2>/dev/null + +# Rapidly growing dirs (compare two snapshots 60s apart) +sudo du -sh /var/lib/rancher /var/log /tmp 2>/dev/null +``` + +Look for: +- Any mount > 85% full (warning) or > 95% (critical) +- Any mount with inode usage > 85% +- Container image accumulation in containerd storage +- Large or rapidly growing log files +- Abandoned temp files + +--- + +### Step 6 — Network and Connection State + +```bash +# Connection state summary +ss -s +ss -tnp | awk '{print $1}' | sort | uniq -c | sort -rn + +# Unusual listeners +ss -tlnp + +# CLOSE_WAIT accumulation (application socket leak) +ss -tnp | grep CLOSE_WAIT | wc -l + +# TIME_WAIT count (normal but high counts may indicate connection thrash) +ss -tnp | grep TIME_WAIT | wc -l +``` + +Look for: +- CLOSE_WAIT count > 50 (application not closing sockets) +- SYN_RECV 
accumulation (connection flood or backlog issue) +- Unexpected listeners on unusual ports +- Long-lived unexpected tunnels or port-forwards + +--- + +### Step 7 — Kubernetes Node Health + +```bash +# Node status and conditions +kubectl get node $(hostname) -o wide 2>/dev/null || k3s kubectl get node $(hostname) -o wide + +# Node conditions in detail +kubectl describe node $(hostname) 2>/dev/null | grep -A 10 'Conditions:' + +# Resource pressure +kubectl top node $(hostname) 2>/dev/null + +# Recent node events +kubectl get events --field-selector involvedObject.name=$(hostname) --sort-by='.lastTimestamp' 2>/dev/null | tail -20 + +# Top pods by resource use +kubectl top pods --all-namespaces --sort-by=memory 2>/dev/null | head -20 + +# Restarting pods on this node +kubectl get pods --all-namespaces --field-selector spec.nodeName=$(hostname) 2>/dev/null | awk '$5 > 5 {print}' +``` + +Look for: +- Node Ready=False or Unknown +- MemoryPressure, DiskPressure, PIDPressure, or NetworkUnavailable = True +- Pods with high restart counts (> 5) +- CrashLoopBackOff workloads +- Evicted pods (indicates past resource pressure) + +--- + +### Step 8 — k3s Runtime and Control Services + +```bash +# k3s service status +sudo systemctl status k3s 2>/dev/null || sudo systemctl status k3s-agent + +# k3s recent logs (last 100 lines) +sudo journalctl -u k3s --since "1 hour ago" -n 100 2>/dev/null || \ +sudo journalctl -u k3s-agent --since "1 hour ago" -n 100 + +# containerd status (k3s embedded) +sudo systemctl status containerd 2>/dev/null + +# CNI / flannel if applicable +sudo systemctl status flanneld 2>/dev/null +sudo ip addr show flannel.1 2>/dev/null +``` + +Look for: +- k3s service not running or in failed state +- Repeated restart entries in k3s logs +- PLEG errors, image GC failures, sandbox creation failures +- cgroup-related errors +- API server timeout messages (on worker nodes: etcd or API server unreachable) + +--- + +## Interpretation + +| Signal | Normal | Warning | 
Critical | +|--------|--------|---------|----------| +| Load average | ≤ core count | 1–2× core count | > 2× sustained | +| Memory available | > 20% | 10–20% | < 10% | +| Disk usage | < 75% | 75–90% | > 90% | +| Inode usage | < 75% | 75–90% | > 90% | +| Zombie count | 0 | 1–5 | > 5 or climbing | +| OOM kills (24h) | 0 | 1–2 | > 2 or recent | +| Pod restarts | < 3 | 3–10 | > 10 or CrashLoop | +| CLOSE_WAIT | < 10 | 10–50 | > 50 | +| Node Ready | True | — | False / Unknown | + +Confidence in findings: +- **High** — direct evidence (OOM kill log, node condition set, error in service log) +- **Medium** — indirect evidence (high memory use without OOM, rising load with no clear cause) +- **Low** — circumstantial (aging process without other indicators) + +--- + +## Remediation + +### High memory pressure + +1. Identify top consumers: `ps aux --sort=-%mem | head -20` +2. Check for OOM history: `dmesg | grep -i oom` +3. If a workload is leaking: restart the specific pod (not the node) +4. If slab is high: check for inode-heavy workloads or NFS mounts +5. Do not drop caches unless explicitly justified — Linux reclaims page cache automatically + +### Disk pressure + +1. Find largest directories: `du -sh /var/lib/rancher/k3s/agent/containerd/io.containerd.snapshotter.v1.overlayfs/snapshots/* | sort -rh | head -20` +2. Prune unused container images: `k3s crictl rmi --prune` (safe — only removes unused images) +3. Clear old journal logs: `sudo journalctl --vacuum-size=500M` +4. Identify log-bloating pods and fix their logging config + +### k3s service failing + +1. Check service status: `sudo systemctl status k3s` +2. Check logs: `sudo journalctl -u k3s -n 200` +3. Common causes: etcd data corruption (single-node), API server unreachable (worker), disk full, cert expiry +4. Do not restart k3s without understanding the cause — a restart may mask the issue + +### High pod restart count + +1. Check logs: `kubectl logs <pod-name> --previous` +2. Check events: `kubectl describe pod <pod-name>` +3. 
Distinguish OOMKilled (memory limit) from CrashLoop (application error) from Liveness probe failure + +--- + +## Notes + +- This protocol was adapted from the sys-medic agent's structured assessment areas and the sys-medic repo's companion protocol document. +- For single-node k3s clusters, the control plane (server) and data plane (agent) run on the same host inside the single `k3s` service; the `k3s-agent` service exists only on agent-only (worker) nodes, so check whichever unit is installed. +- On hosts without `kubectl` in PATH, use `k3s kubectl` as a drop-in replacement. +- Protocol version history is tracked via the `version` frontmatter field. Update on significant structural changes. diff --git a/docs/adr/ADR-003-protocols-artifact-convention.md b/docs/adr/ADR-003-protocols-artifact-convention.md new file mode 100644 index 0000000..2191883 --- /dev/null +++ b/docs/adr/ADR-003-protocols-artifact-convention.md @@ -0,0 +1,116 @@ +--- +id: ADR-003 +title: Protocols Artifact Convention +status: accepted +date: "2026-03-18" +--- + +# ADR-003 — Protocols Artifact Convention + +## Status + +Accepted + +## Context + +Some agents perform structured, repeatable assessments or remediation procedures +(e.g. sys-medic's k3s node health assessment). These procedures exist as narrative +text embedded in agent prompts or companion documents, making them hard to discover, +reference, version, or evolve independently of the agent prompt. + +Protocols are distinct from agent prompts: +- Agent prompts shape AI behaviour +- Protocols are procedural checklists for humans and agents to execute + +They need their own artifact type with a stable location and structure. + +## Decision + +### File location + +``` +agents/protocols/<agent-name>/<slug>.md +``` + +Protocols live inside the `agents/` directory alongside agent definitions, +grouped by owning agent. The `agents/protocols/` subtree is a managed artifact +collection — not executable code, not agent prompts. 
+ +### File structure + +```markdown +--- +agent: <agent-name> +slug: <slug> +title: <title> +version: <semver> +last_updated: "<YYYY-MM-DD>" +--- + +# <title> + +## Purpose +<!-- One paragraph: what this protocol checks or achieves --> + +## Scope +<!-- What systems, components, or conditions this protocol applies to --> + +## Prerequisites +<!-- What must be true before starting --> + +## Procedure + +### Step 1 — <name> +<!-- Commands, checks, observations --> + +### Step 2 — <name> +... + +## Interpretation +<!-- How to read the results: what is normal, what is a warning, what requires action --> + +## Remediation +<!-- Common issues and how to resolve them --> + +## Notes +<!-- Version history, known limitations, related protocols --> +``` + +### Lifecycle + +- Protocols are **created** when a repeatable procedure is identified during agent work +- Protocols are **refined** across sessions as the owning agent accumulates experience +- Protocols are **referenced** by agent prompts using the convention: + *"If available, use the `<slug>` protocol at `agents/protocols/<agent-name>/<slug>.md`"* +- Protocols are **human-readable** and can be executed without an AI agent present + +### Relationship to agent memory + +Agent memory captures *what was learned* in a project. Protocols capture *how to +do a repeatable thing* independent of any specific project. A protocol may be +updated based on findings across many projects, but it does not store +project-specific state. + +### CLI interface + +``` +kaizen-agentic protocols list [agent] # List protocols (optionally filtered by agent) +kaizen-agentic protocols show <agent> <slug> # Print a protocol +``` + +`kaizen-agentic memory init sys-medic` also prints the `protocols show` command for +each available sys-medic protocol when protocols exist for that agent. + +### README + +Each `agents/protocols/` directory contains a `README.md` explaining the +convention and listing available protocols. 
+
+## Consequences
+
+- Protocols are independently versioned and evolvable without touching agent prompts.
+- The `agents/protocols/` directory is part of the kaizen-agentic repo and
+  distributed alongside agent definitions.
+- Operators can view, adapt, or execute protocols without running the CLI.
+- The first protocol — sys-medic's k3s node health assessment — migrates from
+  its current location into `agents/protocols/sys-medic/k3s-node-health-assessment.md`.
diff --git a/src/kaizen_agentic/cli.py b/src/kaizen_agentic/cli.py
index 3b3c7b2..0971558 100644
--- a/src/kaizen_agentic/cli.py
+++ b/src/kaizen_agentic/cli.py
@@ -820,6 +820,16 @@ session_count: 0
 
     memory_path.write_text(content)
     click.echo(f"Initialized memory for '{agent_name}': {memory_path}")
+    # For agents with protocols, note the protocol location
+    registry = _get_registry()
+    protocols_dir = registry.agents_dir / "protocols" / agent_name
+    if protocols_dir.exists():
+        slugs = [f.stem for f in sorted(protocols_dir.glob("*.md")) if f.name != "README.md"]
+        if slugs:
+            click.echo(f"  Protocols available for '{agent_name}':")
+            for slug in slugs:
+                click.echo(f"    kaizen-agentic protocols show {agent_name} {slug}")
+
 
 @memory.command("brief")
 @click.argument("agent_name")
@@ -918,6 +928,95 @@ def memory_clear(agent_name: str, target: str):
         memory_path.parent.rmdir()
 
 
+@cli.group()
+def protocols():
+    """Browse agent protocol runbooks (agents/protocols/<agent>/<slug>.md)."""
+    pass
+
+
+def _protocol_title(protocol_file: Path) -> str:
+    """Return the title declared in a protocol's frontmatter.
+
+    Falls back to a title derived from the file stem when the file cannot be
+    read or its leading ``---`` frontmatter block has no ``title:`` entry.
+    """
+    fallback = protocol_file.stem.replace("-", " ").title()
+    try:
+        lines = protocol_file.read_text(encoding="utf-8").splitlines()
+    except (OSError, UnicodeDecodeError):
+        # Listing is best-effort: an unreadable file is still shown by slug.
+        return fallback
+    if not lines or lines[0].strip() != "---":
+        return fallback
+    for line in lines[1:]:
+        if line.strip() == "---":
+            break  # end of frontmatter; never scan the protocol body
+        if line.startswith("title:"):
+            return line.split(":", 1)[1].strip().strip("\"'") or fallback
+    return fallback
+
+
+@protocols.command("list")
+@click.argument("agent_name", required=False)
+def protocols_list(agent_name: Optional[str]):
+    """List available protocols, optionally filtered by agent."""
+    registry = _get_registry()
+    protocols_dir = registry.agents_dir / "protocols"
+
+    if not protocols_dir.exists():
+        click.echo("No protocols directory found.")
+        return
+
+    # (agent, slug, title) triples, collected agent by agent in sorted order.
+    found = []
+    agent_dirs = (
+        [protocols_dir / agent_name] if agent_name else sorted(protocols_dir.iterdir())
+    )
+    for agent_dir in agent_dirs:
+        if not agent_dir.is_dir() or agent_dir.name == "__pycache__":
+            continue
+        for protocol_file in sorted(agent_dir.glob("*.md")):
+            if protocol_file.name == "README.md":
+                continue
+            found.append(
+                (agent_dir.name, protocol_file.stem, _protocol_title(protocol_file))
+            )
+
+    if not found:
+        if agent_name:
+            click.echo(f"No protocols found for agent '{agent_name}'.")
+        else:
+            click.echo("No protocols found.")
+        return
+
+    click.echo("Available Protocols:")
+    click.echo("=" * 40)
+    current_agent = None
+    for agent, slug, title in found:
+        if agent != current_agent:
+            click.echo(f"\n  {agent}:")
+            current_agent = agent
+        click.echo(f"    • {slug}: {title}")
+
+
+@protocols.command("show")
+@click.argument("agent_name")
+@click.argument("slug")
+def protocols_show(agent_name: str, slug: str):
+    """Print a protocol runbook."""
+    registry = _get_registry()
+    protocol_path = registry.agents_dir / "protocols" / agent_name / f"{slug}.md"
+
+    if not protocol_path.is_file():
+        # Report on stderr and exit non-zero so scripts can detect the miss.
+        click.echo(f"Protocol not found: {agent_name}/{slug}", err=True)
+        click.echo(f"  Expected: {protocol_path}", err=True)
+        click.echo(f"  Run: kaizen-agentic protocols list {agent_name}", err=True)
+        raise SystemExit(1)
+
+    click.echo(protocol_path.read_text(encoding="utf-8"))
+
+
 def _memory_path(target: str, agent_name: str) -> Path:
     return Path(target).resolve() / ".kaizen" / "agents" / agent_name / "memory.md"
 