From 06767ef9249eb59103d3f861fbde760c68cdb604 Mon Sep 17 00:00:00 2001 From: tegwick Date: Sat, 6 Jun 2026 22:12:30 +0200 Subject: [PATCH] session-memory Phase 1: Grok adapter (T02) - adapters/grok.py: reads the per-session dir (summary.json + chat_history.jsonl + events.jsonl + updates.jsonl); conversation from chat_history, lifecycle/ turn from events, tool-call names paired in order from updates ACP stream - registered in ingest dispatch; codex+grok sources enabled in config.toml - tests/test_grok_adapter.py (synthetic + real local sessions) - live multi-flavor dry-run discovers 89 sessions across flavors Co-Authored-By: Claude Opus 4.8 --- session_memory/adapters/grok.py | 182 ++++++++++++++++++ session_memory/config.toml | 8 +- session_memory/ingest.py | 2 + tests/test_grok_adapter.py | 92 +++++++++ .../AGENTIC-WP-0003-session-memory-phase1.md | 2 +- 5 files changed, 282 insertions(+), 4 deletions(-) create mode 100644 session_memory/adapters/grok.py create mode 100644 tests/test_grok_adapter.py diff --git a/session_memory/adapters/grok.py b/session_memory/adapters/grok.py new file mode 100644 index 0000000..4e61ab6 --- /dev/null +++ b/session_memory/adapters/grok.py @@ -0,0 +1,182 @@ +"""Grok CLI collector adapter — Tier 0 -> Tier 1 (design §2.3, §4.3). + +A Grok session is a *directory* ``~/.grok/sessions/{encoded-cwd}/{session-id}/`` containing +``summary.json`` (metadata), ``chat_history.jsonl`` (the canonical transcript), +``events.jsonl`` (explicit lifecycle + ``turn_number``), and ``updates.jsonl`` +(ACP ``session/update`` stream, which carries tool-call names/args). + +The ingest glob matches ``chat_history.jsonl``; this adapter derives its sibling +files from the same directory. Conversation order is taken from +``chat_history.jsonl``; tool-call names are paired, in order, from +``updates.jsonl`` ``tool_call`` entries to classify edits/test runs. 
+"""
+
+from __future__ import annotations
+
+import json
+import os
+from typing import Any, Optional
+
+from ..core.schema import Cost, Session, SessionEvent
+from .common import (
+    Normalized,
+    classify_tool,
+    first_line,
+    iter_jsonl,
+    now_iso,
+    resolve_repo,
+    seconds_between,
+    stringify,
+)
+
+FLAVOR = "grok"
+
+# events.jsonl record types that are surfaced as ``lifecycle`` events.
+_LIFECYCLE_TYPES = ("turn_started", "loop_started", "turn_ended", "phase_changed")
+
+
+def _text_content(content: Any) -> str:
+    """Flatten a chat record's ``content`` into plain text.
+
+    A string is returned unchanged. A list is joined with newlines using the
+    ``text`` of each dict block (a missing or non-string ``text`` counts as
+    empty; non-dict items are ignored). Any other value yields ``""``.
+    """
+    if isinstance(content, str):
+        return content
+    if isinstance(content, list):
+        texts = (b.get("text") for b in content if isinstance(b, dict))
+        # a non-string ``text`` would make str.join raise; treat it as empty
+        return "\n".join(t if isinstance(t, str) else "" for t in texts)
+    return ""
+
+
+def _tool_calls_in_order(session_dir: str) -> list[dict[str, Any]]:
+    """Return ``{title, rawInput, id}`` dicts for updates.jsonl ``tool_call`` entries.
+
+    Entries keep file order; a missing updates.jsonl yields an empty list.
+    """
+    upd = os.path.join(session_dir, "updates.jsonl")
+    if not os.path.exists(upd):
+        return []
+    calls: list[dict[str, Any]] = []
+    for rec in iter_jsonl(upd):
+        u = (rec.get("params") or {}).get("update") or {}
+        if u.get("sessionUpdate") == "tool_call":
+            calls.append({
+                "title": u.get("title") or "",
+                "rawInput": u.get("rawInput") or {},
+                "id": u.get("toolCallId"),
+            })
+    return calls
+
+
+def _session_meta(session_dir: str) -> dict[str, Any]:
+    """Load ``summary.json`` from *session_dir*.
+
+    Returns ``{}`` when the file is missing, unreadable, not valid JSON, or
+    when its top-level value is not an object, so callers can rely on ``.get``.
+    """
+    p = os.path.join(session_dir, "summary.json")
+    try:
+        with open(p, "r", encoding="utf-8") as f:
+            meta = json.load(f)
+    except (OSError, ValueError):
+        # OSError covers a missing/unreadable file, ValueError a bad payload
+        return {}
+    return meta if isinstance(meta, dict) else {}
+
+
+def _lifecycle(session_dir: str) -> tuple[list[dict[str, Any]], Optional[str]]:
+    """Return every events.jsonl record plus the first truthy ``model_id`` seen.
+
+    A missing events.jsonl yields ``([], None)``.
+    """
+    evs: list[dict[str, Any]] = []
+    model: Optional[str] = None
+    p = os.path.join(session_dir, "events.jsonl")
+    if os.path.exists(p):
+        for rec in iter_jsonl(p):
+            evs.append(rec)
+            model = model or rec.get("model_id")
+    return evs, model
+
+
+def parse_session(path: str, repo_domain_map: Optional[dict[str, str]] = None) -> Optional[Normalized]:
+    """Normalize one Grok session directory into a session plus ordered events.
+
+    Args:
+        path: The session directory, or the ``chat_history.jsonl`` inside it.
+        repo_domain_map: Mapping handed to ``resolve_repo`` together with the
+            session cwd; ``None`` is treated as an empty mapping.
+
+    Returns:
+        ``None`` when ``chat_history.jsonl`` is missing or no event was
+        produced; otherwise a ``Normalized`` with the session, its events
+        (``seq`` numbered from 0 in emission order) and the event bodies keyed
+        by ``blob://{session_id}/{seq}``.
+    """
+    repo_domain_map = repo_domain_map or {}
+    # accept either the chat_history.jsonl path or the session dir
+    session_dir = path if os.path.isdir(path) else os.path.dirname(path)
+    chat = os.path.join(session_dir, "chat_history.jsonl")
+    if not os.path.exists(chat):
+        return None
+
+    meta = _session_meta(session_dir)
+    info = meta.get("info")
+    if not isinstance(info, dict):
+        info = {}
+    # abspath drops trailing separators and resolves "" / "." to a real
+    # directory, so the fallback id is always the session directory's name
+    session_id = info.get("id") or os.path.basename(os.path.abspath(session_dir))
+    session_uid = Session.make_uid(FLAVOR, session_id)
+    cwd = info.get("cwd") or meta.get("git_root_dir")
+    life_events, life_model = _lifecycle(session_dir)
+    model = meta.get("current_model_id") or life_model
+    pending_calls = iter(_tool_calls_in_order(session_dir))
+
+    events: list[SessionEvent] = []
+    blobs: dict[str, str] = {}
+
+    def add(kind, *, role=None, tool=None, summary=None, body=None, parent_seq=None) -> int:
+        """Append one event, keep a non-empty *body* as a blob, return its seq."""
+        s = len(events)  # every call appends exactly one event
+        ref = None
+        if body:
+            ref = f"blob://{session_id}/{s}"
+            blobs[ref] = body
+        events.append(SessionEvent(
+            session_uid=session_uid, seq=s, parent_seq=parent_seq,
+            ts=None, kind=kind, role=role, tool=tool,
+            summary=(summary or "")[:300] or None, payload_ref=ref,
+        ))
+        return s
+
+    # explicit lifecycle first (turn_started/turn_ended carry no bodies)
+    # NOTE(review): events.jsonl records may carry a ``ts`` that is not passed
+    # on here (every event gets ts=None) — confirm whether it should be.
+    for le in life_events:
+        t = le.get("type")
+        if t in _LIFECYCLE_TYPES:
+            add("lifecycle", summary=t)
+
+    for rec in iter_jsonl(chat):
+        rtype = rec.get("type")
+        content = rec.get("content")
+        if rtype == "user":
+            text = _text_content(content)
+            if text.strip():
+                add("user_msg", role="user", summary=first_line(text), body=text)
+        elif rtype == "reasoning":
+            text = _text_content(content)
+            if text.strip():
+                add("thinking", role="assistant", summary="reasoning", body=text)
+        elif rtype == "assistant":
+            text = _text_content(content)
+            if text.strip():
+                add("assistant_msg", role="assistant", summary=first_line(text), body=text)
+        elif rtype == "tool_result":
+            # pair with the next tool_call (in order) to recover name/args;
+            # once the calls run out the result is kept without a parent
+            tool = None
+            parent = None
+            call = next(pending_calls, None)
+            if call is not None:
+                tool = call["title"]
+                cmd = stringify(call["rawInput"])
+                kind = classify_tool(tool, cmd)
+                parent = add(kind, role="assistant", tool=tool, summary=tool, body=cmd)
+            # _text_content already returns string content unchanged
+            add("tool_result", role="tool", tool=tool, summary="tool result",
+                body=stringify(_text_content(content)), parent_seq=parent)
+
+    if not events:
+        return None
+
+    cost = Cost(turns=sum(1 for e in events if e.kind == "user_msg"))
+    started = info.get("created_at") or meta.get("created_at")
+    ended = meta.get("last_active_at") or info.get("updated_at") or meta.get("updated_at")
+    cost.wall_clock_s = seconds_between(started, ended)
+
+    repo, domain = resolve_repo(cwd, repo_domain_map)
+    session = Session(
+        session_uid=session_uid, flavor=FLAVOR,
+        native_session_id=session_id, repo=repo, domain=domain, cwd=cwd,
+        git_branch=meta.get("head_branch"), model=model,
+        started_at=started, ended_at=ended, outcome="unknown", cost=cost,
+        source_path=chat,
+        source_bytes=_dir_bytes(session_dir),
+        discovered_at=now_iso(),
+    )
+    return Normalized(session=session, events=events, blobs=blobs)
+
+
+def _dir_bytes(d: str) -> int:
+    """Return the summed size in bytes of every file under *d* (recursive).
+
+    Files whose size cannot be read are skipped rather than failing the parse.
+    """
+    total = 0
+    for root, _, files in os.walk(d):
+        for f in files:
+            try:
+                total += os.path.getsize(os.path.join(root, f))
+            except OSError:
+                pass
+    return total
diff --git a/session_memory/config.toml b/session_memory/config.toml
index a7f3be4..3a02bd5 100644
--- a/session_memory/config.toml
+++ b/session_memory/config.toml
@@ -20,14 +20,14 @@ root = "~/.claude/projects"
 # glob, relative to root; covers sessions and agent-* sidechains
 glob = "*/*.jsonl"
 
-# Codex / Grok adapters land in Phase 1 (schemas confirmed in the design doc).
+# Codex / Grok adapters added in Phase 1 (AGENTIC-WP-0003).
[sources.codex] -enabled = false +enabled = true root = "~/.codex/sessions" glob = "*/*/*/rollout-*.jsonl" [sources.grok] -enabled = false +enabled = true root = "~/.grok/sessions" glob = "*/*/chat_history.jsonl" @@ -37,3 +37,5 @@ agentic-resources = "helix_forge" the-custodian = "custodian" state-hub = "custodian" ops-bridge = "custodian" +net-kingdom = "netkingdom" +can-you-assist = "coulomb_social" diff --git a/session_memory/ingest.py b/session_memory/ingest.py index 089b340..ff34bc0 100644 --- a/session_memory/ingest.py +++ b/session_memory/ingest.py @@ -20,6 +20,7 @@ from typing import Any from .adapters import claude as claude_adapter from .adapters import codex as codex_adapter +from .adapters import grok as grok_adapter from .core import digest as digest_mod from .core.cursor import Cursors from .core.retention import RetentionConfig, sweep as retention_sweep @@ -29,6 +30,7 @@ from .core.store import Store _ADAPTERS = { "claude": claude_adapter.parse_session, "codex": codex_adapter.parse_session, + "grok": grok_adapter.parse_session, } diff --git a/tests/test_grok_adapter.py b/tests/test_grok_adapter.py new file mode 100644 index 0000000..99c9d75 --- /dev/null +++ b/tests/test_grok_adapter.py @@ -0,0 +1,92 @@ +"""Grok adapter tests (T02): synthetic session dir + real local sessions.""" + +import glob +import json +import os +import sys + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from session_memory.adapters.grok import parse_session # noqa: E402 + +REPO_MAP = {"agentic-resources": "helix_forge", "net-kingdom": "netkingdom", + "can-you-assist": "coulomb_social"} + + +def _mk_session(dir_path, sid): + os.makedirs(dir_path, exist_ok=True) + with open(os.path.join(dir_path, "summary.json"), "w") as f: + json.dump({"info": {"id": sid, "cwd": "/home/worsch/agentic-resources"}, + "created_at": "2026-06-06T10:00:00Z", + "last_active_at": "2026-06-06T10:05:00Z", + "current_model_id": "grok-build", "head_branch": "main"}, f) 
+ with open(os.path.join(dir_path, "events.jsonl"), "w") as f: + f.write(json.dumps({"ts": "2026-06-06T10:00:00Z", "type": "turn_started", + "turn_number": 0, "model_id": "grok-build"}) + "\n") + f.write(json.dumps({"ts": "2026-06-06T10:05:00Z", "type": "turn_ended", + "turn_number": 0}) + "\n") + with open(os.path.join(dir_path, "chat_history.jsonl"), "w") as f: + for rec in [ + {"type": "system", "content": "sys prompt"}, + {"type": "user", "content": [{"type": "text", "text": "fix the bug"}]}, + {"type": "reasoning", "content": [{"type": "text", "text": "thinking..."}]}, + {"type": "assistant", "content": ""}, # empty -> skipped + {"type": "tool_result", "content": "The file x.py has been updated"}, + {"type": "assistant", "content": "done"}, + {"type": "tool_result", "content": "6 passed"}, + ]: + f.write(json.dumps(rec) + "\n") + with open(os.path.join(dir_path, "updates.jsonl"), "w") as f: + for u in [ + {"sessionUpdate": "tool_call", "toolCallId": "c1", "title": "edit_file", + "rawInput": {"target_file": "x.py"}}, + {"sessionUpdate": "tool_call", "toolCallId": "c2", "title": "shell", + "rawInput": {"command": "pytest -q"}}, + ]: + f.write(json.dumps({"timestamp": "t", "method": "session/update", + "params": {"sessionId": sid, "update": u}}) + "\n") + + +def test_grok_synthetic_dir(tmp_path): + d = tmp_path / "%2Fhome%2Fworsch%2Fagentic-resources" / "sid-1" + _mk_session(str(d), "sid-1") + + norm = parse_session(str(d / "chat_history.jsonl"), REPO_MAP) + assert norm is not None + s = norm.session + assert s.session_uid == "grok:sid-1" + assert s.flavor == "grok" + assert s.repo == "agentic-resources" and s.domain == "helix_forge" + assert s.model == "grok-build" + assert s.git_branch == "main" + assert s.cost.turns == 1 + assert s.cost.wall_clock_s == 300.0 + + kinds = [e.kind for e in norm.events] + # 4 lifecycle from events.jsonl? 
no: turn_started + turn_ended = 2 lifecycle + assert kinds.count("lifecycle") == 2 + assert "user_msg" in kinds and "thinking" in kinds and "assistant_msg" in kinds + # paired tool calls recovered names -> edit + test_run, each followed by tool_result + assert "edit" in kinds and "test_run" in kinds + edit = next(e for e in norm.events if e.kind == "edit") + assert edit.tool == "edit_file" + # tool_result after test_run links to it + tr = [e for e in norm.events if e.kind == "tool_result"] + assert len(tr) == 2 + + +def test_real_local_grok_sessions_if_available(): + base = os.path.expanduser("~/.grok/sessions") + chats = glob.glob(os.path.join(base, "*", "*", "chat_history.jsonl")) + if not chats: + return + parsed = 0 + for c in chats: + norm = parse_session(c, REPO_MAP) + if norm is None: + continue + parsed += 1 + assert norm.session.session_uid.startswith("grok:") + seqs = [e.seq for e in norm.events] + assert seqs == sorted(seqs) and len(seqs) == len(set(seqs)) + assert parsed >= 1 diff --git a/workplans/AGENTIC-WP-0003-session-memory-phase1.md b/workplans/AGENTIC-WP-0003-session-memory-phase1.md index 2e79010..eb4fe79 100644 --- a/workplans/AGENTIC-WP-0003-session-memory-phase1.md +++ b/workplans/AGENTIC-WP-0003-session-memory-phase1.md @@ -52,7 +52,7 @@ order (no native DAG). Version-detect on `session_meta.cli_version`. Reuse the ```task id: AGENTIC-WP-0003-T02 -status: progress +status: done priority: high state_hub_task_id: "fe3d7d1c-110e-4f16-8d56-062fa4a651aa" ```