generated from coulomb/repo-seed
session-memory Phase 1: Codex adapter (T01) + multi-file merge (T03)
- adapters/common.py: shared Normalized + helpers (resolve_repo, classify_tool,
jsonl iter, etc.); claude.py refactored to use it (Normalized re-exported)
- adapters/codex.py: rollout {timestamp,type,payload} parser; session_meta/
response_item/event_msg mapping; flat call_id join; token_count cost;
registered in ingest dispatch
- core/store.py: ingest() now merges multi-file sessions by content
fingerprint, appends new events with offset seq (design OQ6); idempotent
- tests/test_codex_adapter.py, tests/test_merge.py
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
167
session_memory/adapters/codex.py
Normal file
167
session_memory/adapters/codex.py
Normal file
@@ -0,0 +1,167 @@
|
||||
"""OpenAI Codex CLI collector adapter — Tier 0 -> Tier 1 (design §2.2, §4.3).
|
||||
|
||||
Reads ``$CODEX_HOME/sessions/YYYY/MM/DD/rollout-*.jsonl``. Each line is a
|
||||
``RolloutLine`` wrapper ``{timestamp, type, payload}``; ``type`` discriminates
|
||||
``session_meta`` / ``response_item`` / ``event_msg`` / ``turn_context`` /
|
||||
``compacted``.
|
||||
|
||||
Codex is **flat** — tool calls and outputs are joined only by ``call_id`` with no
|
||||
parent-ref DAG — so ``seq`` is assigned by temporal (line) order and
|
||||
``parent_seq`` is set for ``function_call_output`` back to its ``function_call``.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from typing import Any, Optional
|
||||
|
||||
from ..core.schema import Cost, Session, SessionEvent
|
||||
from .common import (
|
||||
Normalized,
|
||||
classify_tool,
|
||||
first_line,
|
||||
iter_jsonl,
|
||||
now_iso,
|
||||
resolve_repo,
|
||||
seconds_between,
|
||||
stringify,
|
||||
)
|
||||
|
||||
# Flavor tag for this adapter: passed to Session.make_uid() and stored as
# Session.flavor on every session this module produces.
FLAVOR = "codex"
|
||||
|
||||
|
||||
def _message_text(payload: dict[str, Any]) -> str:
|
||||
content = payload.get("content")
|
||||
if isinstance(content, str):
|
||||
return content
|
||||
parts = []
|
||||
if isinstance(content, list):
|
||||
for b in content:
|
||||
if isinstance(b, dict):
|
||||
parts.append(b.get("text") or b.get("output_text") or "")
|
||||
elif isinstance(b, str):
|
||||
parts.append(b)
|
||||
return "\n".join(p for p in parts if p)
|
||||
|
||||
|
||||
def _extract_tokens(payload: dict[str, Any]) -> tuple[int, int, int]:
|
||||
"""Best-effort (input, output, cache) from a token_count payload.
|
||||
|
||||
Field shapes vary across Codex versions; probe known locations, else recurse.
|
||||
"""
|
||||
for scope in (payload, payload.get("info") or {}, payload.get("usage") or {},
|
||||
(payload.get("info") or {}).get("total_token_usage") or {}):
|
||||
if isinstance(scope, dict):
|
||||
i = scope.get("input_tokens") or scope.get("prompt_tokens")
|
||||
o = scope.get("output_tokens") or scope.get("completion_tokens")
|
||||
if i is not None or o is not None:
|
||||
cache = scope.get("cached_input_tokens") or scope.get("cache_read_input_tokens") or 0
|
||||
return int(i or 0), int(o or 0), int(cache or 0)
|
||||
return 0, 0, 0
|
||||
|
||||
|
||||
def _first_session_id(records: list[Any]) -> Optional[str]:
    """Return the ``id`` of the first ``session_meta`` record that carries one."""
    for rec in records:
        if not isinstance(rec, dict) or rec.get("type") != "session_meta":
            continue
        payload = rec.get("payload")
        if isinstance(payload, dict) and payload.get("id"):
            return payload["id"]
    return None


def parse_session(path: str, repo_domain_map: Optional[dict[str, str]] = None) -> Optional[Normalized]:
    """Parse one Codex rollout file into a ``Normalized`` session.

    Records are read with ``iter_jsonl`` and walked in file order; every
    emitted event receives the next ``seq`` (starting at 0).  A
    ``function_call_output`` event points ``parent_seq`` at the
    ``function_call`` that shares its ``call_id`` (``None`` when no such call
    was seen earlier in the file).

    The session id is resolved *before* any event is built, so every event's
    ``session_uid`` and blob key agree with the returned ``Session`` even if
    the ``session_meta`` record is not the first line of the file.

    Args:
        path: Rollout ``.jsonl`` file to read.
        repo_domain_map: Optional mapping forwarded to ``resolve_repo``
            together with the session's ``cwd``; defaults to ``{}``.

    Returns:
        The session, its events and the blob bodies keyed by ``payload_ref``,
        or ``None`` when the file yields no records or contains no
        ``session_meta`` record with an ``id``.
    """
    repo_domain_map = repo_domain_map or {}
    records = list(iter_jsonl(path))
    if not records:
        return None

    session_id = _first_session_id(records)
    if session_id is None:
        return None
    session_uid = Session.make_uid(FLAVOR, session_id)

    cwd = model = None
    timestamps: list[str] = []
    events: list[SessionEvent] = []
    blobs: dict[str, str] = {}
    call_seq: dict[str, int] = {}  # call_id -> seq of its function_call
    cost = Cost()

    def add_event(ts, kind, *, role=None, tool=None, summary=None, body=None,
                  tokens=0, parent_seq=None) -> int:
        """Append one event and return its ``seq``; store ``body`` as a blob."""
        # Exactly one event is appended per call, so the next seq is simply
        # the current number of events.
        s = len(events)
        payload_ref = None
        if body:
            payload_ref = f"blob://{session_id}/{s}"
            blobs[payload_ref] = body
        events.append(SessionEvent(
            session_uid=session_uid,
            seq=s, parent_seq=parent_seq, ts=ts, kind=kind, role=role, tool=tool,
            # Summaries are capped at 300 characters; empty ones become None.
            summary=(summary or "")[:300] or None, payload_ref=payload_ref, tokens=tokens,
        ))
        return s

    for rec in records:
        if not isinstance(rec, dict):
            # Tolerate stray non-object lines instead of failing on ``.get``.
            continue
        rtype = rec.get("type")
        ts = rec.get("timestamp")
        if ts:
            timestamps.append(ts)
        payload = rec.get("payload")
        if not isinstance(payload, dict):
            payload = {}

        if rtype == "session_meta":
            # The id was resolved up front; only the first non-empty cwd and
            # model are kept.
            cwd = cwd or payload.get("cwd")
            model = model or payload.get("model")

        elif rtype == "turn_context":
            model = model or payload.get("model")

        elif rtype == "response_item":
            ptype = payload.get("type")
            if ptype == "message":
                role = payload.get("role", "assistant")
                text = _message_text(payload)
                # NOTE(review): every non-assistant role is recorded as
                # "user_msg" and therefore counted in cost.turns below —
                # confirm this is intended for system/developer messages.
                kind = "assistant_msg" if role == "assistant" else "user_msg"
                add_event(ts, kind, role=role, summary=first_line(text), body=text)
            elif ptype == "function_call":
                name = payload.get("name", "")
                args = stringify(payload.get("arguments"))
                kind = classify_tool(name, args)
                s = add_event(ts, kind, role="assistant", tool=name,
                              summary=name, body=args)
                call_id = payload.get("call_id")
                if call_id:
                    call_seq[call_id] = s
            elif ptype == "function_call_output":
                call_id = payload.get("call_id")
                parent = call_seq.get(call_id)
                body = stringify(payload.get("output"))
                add_event(ts, "tool_result", role="tool", tool=None,
                          summary="tool result", body=body, parent_seq=parent)
            elif ptype == "reasoning":
                body = _message_text(payload) or stringify(payload.get("summary"))
                add_event(ts, "thinking", role="assistant", summary="reasoning", body=body)

        elif rtype == "event_msg":
            ptype = payload.get("type")
            if ptype == "task_started":
                add_event(ts, "lifecycle", summary="task_started")
            elif ptype == "task_complete":
                add_event(ts, "completion", summary="task_complete")
            elif ptype == "token_count":
                # NOTE(review): counts are summed over every token_count
                # event.  If the probed location reports a running total
                # (e.g. info.total_token_usage) this over-counts — confirm
                # against the Codex rollout format.
                i, o, c = _extract_tokens(payload)
                cost.input_tokens += i
                cost.output_tokens += o
                cost.cache_tokens += c
            # user_message / agent_message echoes are duplicated by response_item
            # messages on modern Codex; skipped to avoid double counting.

    cost.turns = sum(1 for e in events if e.kind == "user_msg")
    started = min(timestamps) if timestamps else None
    ended = max(timestamps) if timestamps else None
    cost.wall_clock_s = seconds_between(started, ended)

    repo, domain = resolve_repo(cwd, repo_domain_map)
    try:
        source_bytes = os.path.getsize(path)
    except OSError:
        # File vanished or is unreadable after parsing; size is informational.
        source_bytes = 0

    session = Session(
        session_uid=session_uid,
        flavor=FLAVOR, native_session_id=session_id,
        repo=repo, domain=domain, cwd=cwd, model=model,
        started_at=started, ended_at=ended, outcome="unknown", cost=cost,
        source_path=path, source_bytes=source_bytes,
        discovered_at=now_iso(),
    )
    return Normalized(session=session, events=events, blobs=blobs)
|
||||
Reference in New Issue
Block a user