"""OpenAI Codex CLI collector adapter — Tier 0 -> Tier 1 (design §2.2, §4.3). Reads ``$CODEX_HOME/sessions/YYYY/MM/DD/rollout-*.jsonl``. Each line is a ``RolloutLine`` wrapper ``{timestamp, type, payload}``; ``type`` discriminates ``session_meta`` / ``response_item`` / ``event_msg`` / ``turn_context`` / ``compacted``. Codex is **flat** — tool calls and outputs are joined only by ``call_id`` with no parent-ref DAG — so ``seq`` is assigned by temporal (line) order and ``parent_seq`` is set for ``function_call_output`` back to its ``function_call``. """ from __future__ import annotations import os from typing import Any, Optional from ..core.schema import Cost, Session, SessionEvent from .common import ( Normalized, classify_tool, first_line, iter_jsonl, now_iso, resolve_repo, seconds_between, stringify, ) FLAVOR = "codex" def _message_text(payload: dict[str, Any]) -> str: content = payload.get("content") if isinstance(content, str): return content parts = [] if isinstance(content, list): for b in content: if isinstance(b, dict): parts.append(b.get("text") or b.get("output_text") or "") elif isinstance(b, str): parts.append(b) return "\n".join(p for p in parts if p) def _extract_tokens(payload: dict[str, Any]) -> tuple[int, int, int]: """Best-effort (input, output, cache) from a token_count payload. Field shapes vary across Codex versions; probe known locations, else recurse. """ for scope in (payload, payload.get("info") or {}, payload.get("usage") or {}, (payload.get("info") or {}).get("total_token_usage") or {}): if isinstance(scope, dict): i = scope.get("input_tokens") or scope.get("prompt_tokens") o = scope.get("output_tokens") or scope.get("completion_tokens") if i is not None or o is not None: cache = scope.get("cached_input_tokens") or scope.get("cache_read_input_tokens") or 0 return int(i or 0), int(o or 0), int(cache or 0) return 0, 0, 0 def parse_session(path: str, repo_domain_map: Optional[dict[str, str]] = None) -> Optional[Normalized]: repo_domain_map = repo_domain_map or {} records = list(iter_jsonl(path)) if not records: return None session_id: Optional[str] = None cwd = model = cli_version = None timestamps: list[str] = [] events: list[SessionEvent] = [] blobs: dict[str, str] = {} call_seq: dict[str, int] = {} # call_id -> seq of its function_call cost = Cost() seq = 0 def add_event(ts, kind, *, role=None, tool=None, summary=None, body=None, tokens=0, parent_seq=None) -> int: nonlocal seq s = seq seq += 1 payload_ref = None if body: payload_ref = f"blob://{session_id}/{s}" blobs[payload_ref] = body events.append(SessionEvent( session_uid=Session.make_uid(FLAVOR, session_id or "unknown"), seq=s, parent_seq=parent_seq, ts=ts, kind=kind, role=role, tool=tool, summary=(summary or "")[:300] or None, payload_ref=payload_ref, tokens=tokens, )) return s for rec in records: rtype = rec.get("type") ts = rec.get("timestamp") if ts: timestamps.append(ts) payload = rec.get("payload") or {} if rtype == "session_meta": session_id = session_id or payload.get("id") cwd = cwd or payload.get("cwd") model = model or payload.get("model") cli_version = cli_version or payload.get("cli_version") elif rtype == "turn_context": model = model or payload.get("model") elif rtype == "response_item": ptype = payload.get("type") if ptype == "message": role = payload.get("role", "assistant") text = _message_text(payload) kind = "assistant_msg" if role == "assistant" else "user_msg" add_event(ts, kind, role=role, summary=first_line(text), body=text) elif ptype == "function_call": name = payload.get("name", "") args = stringify(payload.get("arguments")) kind = classify_tool(name, args) s = add_event(ts, kind, role="assistant", tool=name, summary=name, body=args) call_id = payload.get("call_id") if call_id: call_seq[call_id] = s elif ptype == "function_call_output": call_id = payload.get("call_id") parent = call_seq.get(call_id) body = stringify(payload.get("output")) add_event(ts, "tool_result", role="tool", tool=None, summary="tool result", body=body, parent_seq=parent) elif ptype == "reasoning": body = _message_text(payload) or stringify(payload.get("summary")) add_event(ts, "thinking", role="assistant", summary="reasoning", body=body) elif rtype == "event_msg": ptype = payload.get("type") if ptype == "task_started": add_event(ts, "lifecycle", summary="task_started") elif ptype == "task_complete": add_event(ts, "completion", summary="task_complete") elif ptype == "token_count": i, o, c = _extract_tokens(payload) cost.input_tokens += i cost.output_tokens += o cost.cache_tokens += c # user_message / agent_message echoes are duplicated by response_item # messages on modern Codex; skipped to avoid double counting. if session_id is None: return None cost.turns = sum(1 for e in events if e.kind == "user_msg") started = min(timestamps) if timestamps else None ended = max(timestamps) if timestamps else None cost.wall_clock_s = seconds_between(started, ended) repo, domain = resolve_repo(cwd, repo_domain_map) session = Session( session_uid=Session.make_uid(FLAVOR, session_id), flavor=FLAVOR, native_session_id=session_id, repo=repo, domain=domain, cwd=cwd, model=model, started_at=started, ended_at=ended, outcome="unknown", cost=cost, source_path=path, source_bytes=os.path.getsize(path) if os.path.exists(path) else 0, discovered_at=now_iso(), ) return Normalized(session=session, events=events, blobs=blobs)