generated from coulomb/repo-seed
session-memory Phase 1: Codex adapter (T01) + multi-file merge (T03)
- adapters/common.py: shared Normalized + helpers (resolve_repo, classify_tool,
jsonl iter, etc.); claude.py refactored to use it (Normalized re-exported)
- adapters/codex.py: rollout {timestamp,type,payload} parser; session_meta/
response_item/event_msg mapping; flat call_id join; token_count cost;
registered in ingest dispatch
- core/store.py: ingest() now merges multi-file sessions by content
fingerprint, appends new events with offset seq (design OQ6); idempotent
- tests/test_codex_adapter.py, tests/test_merge.py
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -11,54 +11,23 @@ that the store persists out-of-line so Tier 1 rows stay light.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Iterable, Optional
|
||||
from typing import Any, Optional
|
||||
|
||||
from ..core.schema import Cost, Session, SessionEvent
|
||||
from .common import ( # noqa: F401 (Normalized re-exported for back-compat)
|
||||
Normalized,
|
||||
classify_tool,
|
||||
first_line as _first_line,
|
||||
iter_jsonl as _iter_records,
|
||||
now_iso as _now,
|
||||
resolve_repo as _resolve_repo,
|
||||
seconds_between as _seconds_between,
|
||||
stringify as _stringify,
|
||||
)
|
||||
|
||||
FLAVOR = "claude"
|
||||
|
||||
# tool_use names that mutate files -> kind "edit"
|
||||
_EDIT_TOOLS = {"Edit", "Write", "NotebookEdit", "MultiEdit"}
|
||||
# crude test-runner detection inside Bash commands -> kind "test_run"
|
||||
_TEST_HINTS = ("pytest", "unittest", "npm test", "npm run test", "go test", "cargo test", "jest", "vitest")
|
||||
|
||||
|
||||
@dataclass
|
||||
class Normalized:
|
||||
session: Session
|
||||
events: list[SessionEvent]
|
||||
blobs: dict[str, str] = field(default_factory=dict)
|
||||
|
||||
|
||||
def _iter_records(path: str) -> Iterable[dict[str, Any]]:
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
try:
|
||||
yield json.loads(line)
|
||||
except json.JSONDecodeError:
|
||||
continue # tolerate partial/corrupt trailing lines
|
||||
|
||||
|
||||
def _resolve_repo(cwd: Optional[str], repo_domain_map: dict[str, str]) -> tuple[Optional[str], Optional[str]]:
|
||||
"""cwd -> (repo, domain). repo is the cwd basename; domain via map."""
|
||||
if not cwd:
|
||||
return None, None
|
||||
repo = os.path.basename(cwd.rstrip("/")) or None
|
||||
domain = repo_domain_map.get(repo) if repo else None
|
||||
return repo, domain
|
||||
|
||||
|
||||
def _is_test_command(text: str) -> bool:
|
||||
low = text.lower()
|
||||
return any(h in low for h in _TEST_HINTS)
|
||||
|
||||
|
||||
def _content_blocks(message: dict[str, Any]) -> list[dict[str, Any]]:
|
||||
content = message.get("content")
|
||||
@@ -159,11 +128,8 @@ def parse_session(path: str, repo_domain_map: Optional[dict[str, str]] = None) -
|
||||
name = b.get("name", "")
|
||||
inp = b.get("input", {})
|
||||
body = _stringify(inp)
|
||||
kind = "tool_call"
|
||||
if name in _EDIT_TOOLS:
|
||||
kind = "edit"
|
||||
elif name == "Bash" and _is_test_command(_stringify(inp.get("command", ""))):
|
||||
kind = "test_run"
|
||||
cmd = inp.get("command", "") if isinstance(inp, dict) else ""
|
||||
kind = classify_tool(name, _stringify(cmd))
|
||||
add_event(uuid, parent, ts, kind, role="assistant", tool=name,
|
||||
summary=f"{name}", body=body, sidechain=sidechain)
|
||||
|
||||
@@ -194,35 +160,3 @@ def parse_session(path: str, repo_domain_map: Optional[dict[str, str]] = None) -
|
||||
discovered_at=_now(),
|
||||
)
|
||||
return Normalized(session=session, events=events, blobs=blobs)
|
||||
|
||||
|
||||
# ---- helpers ---------------------------------------------------------------
|
||||
|
||||
def _stringify(v: Any) -> str:
|
||||
if v is None:
|
||||
return ""
|
||||
if isinstance(v, str):
|
||||
return v
|
||||
try:
|
||||
return json.dumps(v, ensure_ascii=False)[:20000]
|
||||
except (TypeError, ValueError):
|
||||
return str(v)[:20000]
|
||||
|
||||
|
||||
def _first_line(text: str) -> str:
|
||||
return (text or "").strip().splitlines()[0] if (text or "").strip() else ""
|
||||
|
||||
|
||||
def _seconds_between(start: Optional[str], end: Optional[str]) -> float:
|
||||
if not start or not end:
|
||||
return 0.0
|
||||
try:
|
||||
a = datetime.fromisoformat(start.replace("Z", "+00:00"))
|
||||
b = datetime.fromisoformat(end.replace("Z", "+00:00"))
|
||||
return max(0.0, (b - a).total_seconds())
|
||||
except ValueError:
|
||||
return 0.0
|
||||
|
||||
|
||||
def _now() -> str:
|
||||
return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||
|
||||
167
session_memory/adapters/codex.py
Normal file
167
session_memory/adapters/codex.py
Normal file
@@ -0,0 +1,167 @@
|
||||
"""OpenAI Codex CLI collector adapter — Tier 0 -> Tier 1 (design §2.2, §4.3).
|
||||
|
||||
Reads ``$CODEX_HOME/sessions/YYYY/MM/DD/rollout-*.jsonl``. Each line is a
|
||||
``RolloutLine`` wrapper ``{timestamp, type, payload}``; ``type`` discriminates
|
||||
``session_meta`` / ``response_item`` / ``event_msg`` / ``turn_context`` /
|
||||
``compacted``.
|
||||
|
||||
Codex is **flat** — tool calls and outputs are joined only by ``call_id`` with no
|
||||
parent-ref DAG — so ``seq`` is assigned by temporal (line) order and
|
||||
``parent_seq`` is set for ``function_call_output`` back to its ``function_call``.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from typing import Any, Optional
|
||||
|
||||
from ..core.schema import Cost, Session, SessionEvent
|
||||
from .common import (
|
||||
Normalized,
|
||||
classify_tool,
|
||||
first_line,
|
||||
iter_jsonl,
|
||||
now_iso,
|
||||
resolve_repo,
|
||||
seconds_between,
|
||||
stringify,
|
||||
)
|
||||
|
||||
FLAVOR = "codex"
|
||||
|
||||
|
||||
def _message_text(payload: dict[str, Any]) -> str:
|
||||
content = payload.get("content")
|
||||
if isinstance(content, str):
|
||||
return content
|
||||
parts = []
|
||||
if isinstance(content, list):
|
||||
for b in content:
|
||||
if isinstance(b, dict):
|
||||
parts.append(b.get("text") or b.get("output_text") or "")
|
||||
elif isinstance(b, str):
|
||||
parts.append(b)
|
||||
return "\n".join(p for p in parts if p)
|
||||
|
||||
|
||||
def _extract_tokens(payload: dict[str, Any]) -> tuple[int, int, int]:
|
||||
"""Best-effort (input, output, cache) from a token_count payload.
|
||||
|
||||
Field shapes vary across Codex versions; probe known locations, else recurse.
|
||||
"""
|
||||
for scope in (payload, payload.get("info") or {}, payload.get("usage") or {},
|
||||
(payload.get("info") or {}).get("total_token_usage") or {}):
|
||||
if isinstance(scope, dict):
|
||||
i = scope.get("input_tokens") or scope.get("prompt_tokens")
|
||||
o = scope.get("output_tokens") or scope.get("completion_tokens")
|
||||
if i is not None or o is not None:
|
||||
cache = scope.get("cached_input_tokens") or scope.get("cache_read_input_tokens") or 0
|
||||
return int(i or 0), int(o or 0), int(cache or 0)
|
||||
return 0, 0, 0
|
||||
|
||||
|
||||
def parse_session(path: str, repo_domain_map: Optional[dict[str, str]] = None) -> Optional[Normalized]:
|
||||
repo_domain_map = repo_domain_map or {}
|
||||
records = list(iter_jsonl(path))
|
||||
if not records:
|
||||
return None
|
||||
|
||||
session_id: Optional[str] = None
|
||||
cwd = model = cli_version = None
|
||||
timestamps: list[str] = []
|
||||
events: list[SessionEvent] = []
|
||||
blobs: dict[str, str] = {}
|
||||
call_seq: dict[str, int] = {} # call_id -> seq of its function_call
|
||||
cost = Cost()
|
||||
seq = 0
|
||||
|
||||
def add_event(ts, kind, *, role=None, tool=None, summary=None, body=None,
|
||||
tokens=0, parent_seq=None) -> int:
|
||||
nonlocal seq
|
||||
s = seq
|
||||
seq += 1
|
||||
payload_ref = None
|
||||
if body:
|
||||
payload_ref = f"blob://{session_id}/{s}"
|
||||
blobs[payload_ref] = body
|
||||
events.append(SessionEvent(
|
||||
session_uid=Session.make_uid(FLAVOR, session_id or "unknown"),
|
||||
seq=s, parent_seq=parent_seq, ts=ts, kind=kind, role=role, tool=tool,
|
||||
summary=(summary or "")[:300] or None, payload_ref=payload_ref, tokens=tokens,
|
||||
))
|
||||
return s
|
||||
|
||||
for rec in records:
|
||||
rtype = rec.get("type")
|
||||
ts = rec.get("timestamp")
|
||||
if ts:
|
||||
timestamps.append(ts)
|
||||
payload = rec.get("payload") or {}
|
||||
|
||||
if rtype == "session_meta":
|
||||
session_id = session_id or payload.get("id")
|
||||
cwd = cwd or payload.get("cwd")
|
||||
model = model or payload.get("model")
|
||||
cli_version = cli_version or payload.get("cli_version")
|
||||
|
||||
elif rtype == "turn_context":
|
||||
model = model or payload.get("model")
|
||||
|
||||
elif rtype == "response_item":
|
||||
ptype = payload.get("type")
|
||||
if ptype == "message":
|
||||
role = payload.get("role", "assistant")
|
||||
text = _message_text(payload)
|
||||
kind = "assistant_msg" if role == "assistant" else "user_msg"
|
||||
add_event(ts, kind, role=role, summary=first_line(text), body=text)
|
||||
elif ptype == "function_call":
|
||||
name = payload.get("name", "")
|
||||
args = stringify(payload.get("arguments"))
|
||||
kind = classify_tool(name, args)
|
||||
s = add_event(ts, kind, role="assistant", tool=name,
|
||||
summary=name, body=args)
|
||||
call_id = payload.get("call_id")
|
||||
if call_id:
|
||||
call_seq[call_id] = s
|
||||
elif ptype == "function_call_output":
|
||||
call_id = payload.get("call_id")
|
||||
parent = call_seq.get(call_id)
|
||||
body = stringify(payload.get("output"))
|
||||
add_event(ts, "tool_result", role="tool", tool=None,
|
||||
summary="tool result", body=body, parent_seq=parent)
|
||||
elif ptype == "reasoning":
|
||||
body = _message_text(payload) or stringify(payload.get("summary"))
|
||||
add_event(ts, "thinking", role="assistant", summary="reasoning", body=body)
|
||||
|
||||
elif rtype == "event_msg":
|
||||
ptype = payload.get("type")
|
||||
if ptype == "task_started":
|
||||
add_event(ts, "lifecycle", summary="task_started")
|
||||
elif ptype == "task_complete":
|
||||
add_event(ts, "completion", summary="task_complete")
|
||||
elif ptype == "token_count":
|
||||
i, o, c = _extract_tokens(payload)
|
||||
cost.input_tokens += i
|
||||
cost.output_tokens += o
|
||||
cost.cache_tokens += c
|
||||
# user_message / agent_message echoes are duplicated by response_item
|
||||
# messages on modern Codex; skipped to avoid double counting.
|
||||
|
||||
if session_id is None:
|
||||
return None
|
||||
|
||||
cost.turns = sum(1 for e in events if e.kind == "user_msg")
|
||||
started = min(timestamps) if timestamps else None
|
||||
ended = max(timestamps) if timestamps else None
|
||||
cost.wall_clock_s = seconds_between(started, ended)
|
||||
|
||||
repo, domain = resolve_repo(cwd, repo_domain_map)
|
||||
session = Session(
|
||||
session_uid=Session.make_uid(FLAVOR, session_id),
|
||||
flavor=FLAVOR, native_session_id=session_id,
|
||||
repo=repo, domain=domain, cwd=cwd, model=model,
|
||||
started_at=started, ended_at=ended, outcome="unknown", cost=cost,
|
||||
source_path=path, source_bytes=os.path.getsize(path) if os.path.exists(path) else 0,
|
||||
discovered_at=now_iso(),
|
||||
)
|
||||
return Normalized(session=session, events=events, blobs=blobs)
|
||||
100
session_memory/adapters/common.py
Normal file
100
session_memory/adapters/common.py
Normal file
@@ -0,0 +1,100 @@
|
||||
"""Shared adapter helpers (Tier 0 -> Tier 1).
|
||||
|
||||
The ``Normalized`` bundle contract and small flavor-agnostic helpers used by every
|
||||
collector adapter. Per-flavor parsing lives in the individual adapter modules.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Optional
|
||||
|
||||
from ..core.schema import Session, SessionEvent
|
||||
|
||||
# tool names that mutate files -> kind "edit" (union across flavors)
|
||||
EDIT_TOOLS = {
|
||||
"Edit", "Write", "NotebookEdit", "MultiEdit", # Claude
|
||||
"apply_patch", "write_file", "edit_file", # Codex / Grok variants
|
||||
}
|
||||
# substrings in a shell/tool command that indicate a test run -> kind "test_run"
|
||||
TEST_HINTS = (
|
||||
"pytest", "unittest", "npm test", "npm run test", "go test",
|
||||
"cargo test", "jest", "vitest", "make test", "tox",
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class Normalized:
|
||||
session: Session
|
||||
events: list[SessionEvent]
|
||||
blobs: dict[str, str] = field(default_factory=dict)
|
||||
|
||||
|
||||
def resolve_repo(cwd: Optional[str], repo_domain_map: dict[str, str]) -> tuple[Optional[str], Optional[str]]:
|
||||
"""cwd -> (repo, domain). repo is the cwd basename; domain via map."""
|
||||
if not cwd:
|
||||
return None, None
|
||||
repo = os.path.basename(cwd.rstrip("/")) or None
|
||||
domain = repo_domain_map.get(repo) if repo else None
|
||||
return repo, domain
|
||||
|
||||
|
||||
def is_test_command(text: str) -> bool:
|
||||
low = (text or "").lower()
|
||||
return any(h in low for h in TEST_HINTS)
|
||||
|
||||
|
||||
def classify_tool(name: str, command_text: str = "") -> str:
|
||||
"""Map a tool invocation to an event kind: edit | test_run | tool_call."""
|
||||
if name in EDIT_TOOLS:
|
||||
return "edit"
|
||||
if is_test_command(command_text) or is_test_command(name):
|
||||
return "test_run"
|
||||
return "tool_call"
|
||||
|
||||
|
||||
def stringify(v: Any, limit: int = 20000) -> str:
|
||||
if v is None:
|
||||
return ""
|
||||
if isinstance(v, str):
|
||||
return v[:limit]
|
||||
try:
|
||||
return json.dumps(v, ensure_ascii=False)[:limit]
|
||||
except (TypeError, ValueError):
|
||||
return str(v)[:limit]
|
||||
|
||||
|
||||
def first_line(text: str) -> str:
|
||||
t = (text or "").strip()
|
||||
return t.splitlines()[0] if t else ""
|
||||
|
||||
|
||||
def seconds_between(start: Optional[str], end: Optional[str]) -> float:
|
||||
if not start or not end:
|
||||
return 0.0
|
||||
try:
|
||||
a = datetime.fromisoformat(start.replace("Z", "+00:00"))
|
||||
b = datetime.fromisoformat(end.replace("Z", "+00:00"))
|
||||
return max(0.0, (b - a).total_seconds())
|
||||
except ValueError:
|
||||
return 0.0
|
||||
|
||||
|
||||
def iter_jsonl(path: str):
|
||||
"""Yield parsed JSON objects from a JSONL file, tolerating bad lines."""
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
try:
|
||||
yield json.loads(line)
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
|
||||
|
||||
def now_iso() -> str:
|
||||
return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||
@@ -12,6 +12,7 @@ Tier 2 digest — the invariant that makes budget-based retention non-lossy.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
@@ -28,6 +29,18 @@ def _now() -> str:
|
||||
return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||
|
||||
|
||||
def _fingerprint(ev: SessionEvent, body: Optional[str]) -> str:
|
||||
"""Stable content fingerprint, independent of seq/payload_ref, for dedup."""
|
||||
h = hashlib.sha1()
|
||||
parts = [ev.ts or "", ev.kind, ev.role or "", ev.tool or "", ev.summary or "",
|
||||
ev.role or "", str(ev.is_sidechain)]
|
||||
h.update("\x1f".join(parts).encode("utf-8"))
|
||||
if body is not None:
|
||||
h.update(b"\x1e")
|
||||
h.update(body.encode("utf-8"))
|
||||
return h.hexdigest()
|
||||
|
||||
|
||||
class Store:
|
||||
def __init__(self, db_path: str, blob_dir: str):
|
||||
self.db_path = db_path
|
||||
@@ -121,14 +134,75 @@ class Store:
|
||||
self.db.commit()
|
||||
return total
|
||||
|
||||
def ingest(self, bundle) -> None:
|
||||
"""Persist a full Normalized bundle (session + events + blobs)."""
|
||||
def ingest(self, bundle) -> int:
|
||||
"""Persist a Normalized bundle, merging into any existing session.
|
||||
|
||||
Multiple files can map to one ``session_uid`` (Claude resume/sidechains;
|
||||
Grok multi-file dirs). Events are de-duplicated by content fingerprint and
|
||||
genuinely-new events are appended with offset ``seq`` (design OQ6 / T03).
|
||||
Returns the number of new events written. Idempotent: re-ingesting the
|
||||
same bundle adds nothing.
|
||||
"""
|
||||
s = bundle.session
|
||||
if s.ingested_at is None:
|
||||
s.ingested_at = _now()
|
||||
self.upsert_session(s)
|
||||
self.upsert_events(bundle.events)
|
||||
self.write_blobs(s.session_uid, bundle.blobs)
|
||||
existing = self.get_session(s.session_uid)
|
||||
if existing is None:
|
||||
if s.ingested_at is None:
|
||||
s.ingested_at = _now()
|
||||
self.upsert_session(s)
|
||||
# known fingerprints + current max seq for this session
|
||||
seen = self._event_fingerprints(s.session_uid)
|
||||
next_seq = self._max_seq(s.session_uid) + 1
|
||||
|
||||
new_events: list[SessionEvent] = []
|
||||
new_blobs: dict[str, str] = {}
|
||||
old_to_new: dict[int, int] = {}
|
||||
for ev in bundle.events:
|
||||
body = bundle.blobs.get(ev.payload_ref) if ev.payload_ref else None
|
||||
fp = _fingerprint(ev, body)
|
||||
if fp in seen:
|
||||
continue # already stored (prior file or prior sweep)
|
||||
new_seq = next_seq
|
||||
next_seq += 1
|
||||
old_to_new[ev.seq] = new_seq
|
||||
# remap parent within this bundle; cross-file parents become None
|
||||
parent = old_to_new.get(ev.parent_seq) if ev.parent_seq is not None else None
|
||||
ref = None
|
||||
if body is not None:
|
||||
ref = f"blob://{s.session_uid}/{new_seq}"
|
||||
new_blobs[ref] = body
|
||||
merged = SessionEvent(
|
||||
session_uid=s.session_uid, seq=new_seq, parent_seq=parent, ts=ev.ts,
|
||||
kind=ev.kind, role=ev.role, tool=ev.tool, summary=ev.summary,
|
||||
payload_ref=ref, tokens=ev.tokens, is_sidechain=ev.is_sidechain,
|
||||
)
|
||||
new_events.append(merged)
|
||||
seen.add(fp)
|
||||
|
||||
if new_events:
|
||||
self.upsert_events(new_events)
|
||||
self.write_blobs(s.session_uid, new_blobs)
|
||||
return len(new_events)
|
||||
|
||||
def _max_seq(self, session_uid: str) -> int:
|
||||
row = self.db.execute(
|
||||
"SELECT COALESCE(MAX(seq), -1) m FROM events WHERE session_uid=?", (session_uid,)
|
||||
).fetchone()
|
||||
return int(row["m"])
|
||||
|
||||
def _event_fingerprints(self, session_uid: str) -> set[str]:
|
||||
fps: set[str] = set()
|
||||
for e in self.get_events(session_uid):
|
||||
body = None
|
||||
if e.payload_ref:
|
||||
r = self.db.execute("SELECT path FROM blobs WHERE ref=?", (e.payload_ref,)).fetchone()
|
||||
if r:
|
||||
try:
|
||||
with open(r["path"], "r", encoding="utf-8") as f:
|
||||
body = f.read()
|
||||
except OSError:
|
||||
body = None
|
||||
fps.add(_fingerprint(e, body))
|
||||
return fps
|
||||
|
||||
# ---- Tier 2 (digest) ---------------------------------------------------
|
||||
|
||||
|
||||
@@ -19,13 +19,17 @@ from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
from .adapters import claude as claude_adapter
|
||||
from .adapters import codex as codex_adapter
|
||||
from .core import digest as digest_mod
|
||||
from .core.cursor import Cursors
|
||||
from .core.retention import RetentionConfig, sweep as retention_sweep
|
||||
from .core.store import Store
|
||||
|
||||
# adapter dispatch by source name
|
||||
_ADAPTERS = {"claude": claude_adapter.parse_session}
|
||||
_ADAPTERS = {
|
||||
"claude": claude_adapter.parse_session,
|
||||
"codex": codex_adapter.parse_session,
|
||||
}
|
||||
|
||||
|
||||
@dataclass
|
||||
|
||||
86
tests/test_codex_adapter.py
Normal file
86
tests/test_codex_adapter.py
Normal file
@@ -0,0 +1,86 @@
|
||||
"""Codex adapter tests (T01): synthetic rollout fixture."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from session_memory.adapters.codex import parse_session # noqa: E402
|
||||
|
||||
REPO_MAP = {"agentic-resources": "helix_forge"}
|
||||
|
||||
|
||||
def _rollout(path, lines):
|
||||
with open(path, "w", encoding="utf-8") as f:
|
||||
for ln in lines:
|
||||
f.write(json.dumps(ln) + "\n")
|
||||
|
||||
|
||||
def test_codex_rollout_parse(tmp_path):
|
||||
p = tmp_path / "rollout-2026-06-06-abc.jsonl"
|
||||
_rollout(p, [
|
||||
{"timestamp": "2026-06-06T10:00:00Z", "type": "session_meta",
|
||||
"payload": {"id": "cdx-1", "cwd": "/home/worsch/agentic-resources",
|
||||
"model_provider": "openai", "cli_version": "0.44.0", "model": "gpt-5-codex"}},
|
||||
{"timestamp": "2026-06-06T10:00:01Z", "type": "turn_context",
|
||||
"payload": {"model": "gpt-5-codex", "approval_policy": "on-request"}},
|
||||
{"timestamp": "2026-06-06T10:00:02Z", "type": "event_msg",
|
||||
"payload": {"type": "task_started"}},
|
||||
{"timestamp": "2026-06-06T10:00:03Z", "type": "response_item",
|
||||
"payload": {"type": "message", "role": "user",
|
||||
"content": [{"type": "input_text", "text": "fix the bug"}]}},
|
||||
{"timestamp": "2026-06-06T10:00:04Z", "type": "response_item",
|
||||
"payload": {"type": "reasoning", "summary": "think about it"}},
|
||||
{"timestamp": "2026-06-06T10:00:05Z", "type": "response_item",
|
||||
"payload": {"type": "function_call", "name": "apply_patch",
|
||||
"arguments": "{\"path\":\"x.py\"}", "call_id": "call_1"}},
|
||||
{"timestamp": "2026-06-06T10:00:06Z", "type": "response_item",
|
||||
"payload": {"type": "function_call", "name": "shell",
|
||||
"arguments": "{\"command\":\"pytest -q\"}", "call_id": "call_2"}},
|
||||
{"timestamp": "2026-06-06T10:00:07Z", "type": "response_item",
|
||||
"payload": {"type": "function_call_output", "call_id": "call_2", "output": "2 passed"}},
|
||||
{"timestamp": "2026-06-06T10:00:08Z", "type": "response_item",
|
||||
"payload": {"type": "message", "role": "assistant",
|
||||
"content": [{"type": "output_text", "text": "done"}]}},
|
||||
{"timestamp": "2026-06-06T10:00:09Z", "type": "event_msg",
|
||||
"payload": {"type": "token_count",
|
||||
"info": {"total_token_usage": {"input_tokens": 200, "output_tokens": 30,
|
||||
"cached_input_tokens": 15}}}},
|
||||
{"timestamp": "2026-06-06T10:00:10Z", "type": "event_msg",
|
||||
"payload": {"type": "task_complete"}},
|
||||
])
|
||||
|
||||
norm = parse_session(str(p), REPO_MAP)
|
||||
assert norm is not None
|
||||
s = norm.session
|
||||
assert s.session_uid == "codex:cdx-1"
|
||||
assert s.flavor == "codex"
|
||||
assert s.repo == "agentic-resources" and s.domain == "helix_forge"
|
||||
assert s.model == "gpt-5-codex"
|
||||
assert s.cost.input_tokens == 200 and s.cost.output_tokens == 30 and s.cost.cache_tokens == 15
|
||||
assert s.cost.turns == 1
|
||||
assert s.cost.wall_clock_s == 10.0
|
||||
|
||||
kinds = [e.kind for e in norm.events]
|
||||
assert kinds == ["lifecycle", "user_msg", "thinking", "edit", "test_run",
|
||||
"tool_result", "assistant_msg", "completion"]
|
||||
|
||||
# flat linkage: function_call_output links to its function_call by call_id
|
||||
out = next(e for e in norm.events if e.kind == "tool_result")
|
||||
test_call = next(e for e in norm.events if e.kind == "test_run")
|
||||
assert out.parent_seq == test_call.seq
|
||||
|
||||
# apply_patch classified as edit; pytest as test_run
|
||||
edit = next(e for e in norm.events if e.kind == "edit")
|
||||
assert edit.tool == "apply_patch"
|
||||
|
||||
|
||||
def test_codex_empty_or_no_meta_returns_none(tmp_path):
|
||||
p = tmp_path / "rollout-empty.jsonl"
|
||||
p.write_text("")
|
||||
assert parse_session(str(p), REPO_MAP) is None
|
||||
|
||||
p2 = tmp_path / "rollout-nometa.jsonl"
|
||||
_rollout(p2, [{"timestamp": "t", "type": "event_msg", "payload": {"type": "task_started"}}])
|
||||
assert parse_session(str(p2), REPO_MAP) is None # no session_meta -> no id
|
||||
66
tests/test_merge.py
Normal file
66
tests/test_merge.py
Normal file
@@ -0,0 +1,66 @@
|
||||
"""Multi-file session merge tests (T03)."""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from session_memory.adapters.common import Normalized # noqa: E402
|
||||
from session_memory.core.schema import Session, SessionEvent # noqa: E402
|
||||
from session_memory.core.store import Store # noqa: E402
|
||||
|
||||
|
||||
def _part(native, kinds, base_blob="b"):
|
||||
uid = Session.make_uid("claude", native)
|
||||
s = Session(session_uid=uid, flavor="claude", native_session_id=native)
|
||||
events, blobs = [], {}
|
||||
for i, k in enumerate(kinds):
|
||||
ref = f"blob://{native}/{i}"
|
||||
events.append(SessionEvent(session_uid=uid, seq=i, parent_seq=(i - 1 if i else None),
|
||||
kind=k, ts=f"2026-06-06T10:0{i}:00Z", payload_ref=ref))
|
||||
blobs[ref] = f"{base_blob}-{k}-{i}"
|
||||
return Normalized(session=s, events=events, blobs=blobs)
|
||||
|
||||
|
||||
def test_second_file_appends_not_overwrites(tmp_path):
|
||||
st = Store(str(tmp_path / "m.db"), str(tmp_path / "blobs"))
|
||||
uid = Session.make_uid("claude", "s1")
|
||||
|
||||
# file 1: 3 events (seq 0..2)
|
||||
n1 = _part("s1", ["user_msg", "assistant_msg", "tool_call"])
|
||||
added1 = st.ingest(n1)
|
||||
assert added1 == 3
|
||||
assert st.count_events(uid) == 3
|
||||
|
||||
# file 2 for the SAME session: repeats event 0 + adds 2 new (continuation)
|
||||
n2 = _part("s1", ["user_msg", "edit", "completion"])
|
||||
# make the first event identical to file1's first event so it dedups
|
||||
n2.events[0].kind = "user_msg"
|
||||
n2.events[0].ts = "2026-06-06T10:00:00Z"
|
||||
n2.blobs[n2.events[0].payload_ref] = "b-user_msg-0"
|
||||
added2 = st.ingest(n2)
|
||||
|
||||
# only the 2 genuinely-new events appended; total grows additively
|
||||
assert added2 == 2
|
||||
assert st.count_events(uid) == 5
|
||||
seqs = [e.seq for e in st.get_events(uid)]
|
||||
assert seqs == [0, 1, 2, 3, 4] # contiguous, offset
|
||||
|
||||
|
||||
def test_reingest_same_bundle_is_idempotent(tmp_path):
|
||||
st = Store(str(tmp_path / "m.db"), str(tmp_path / "blobs"))
|
||||
uid = Session.make_uid("claude", "s2")
|
||||
n = _part("s2", ["user_msg", "assistant_msg"])
|
||||
assert st.ingest(n) == 2
|
||||
assert st.ingest(n) == 0 # nothing new on re-run
|
||||
assert st.count_events(uid) == 2
|
||||
|
||||
|
||||
def test_appended_event_parent_remapped_within_part(tmp_path):
|
||||
st = Store(str(tmp_path / "m.db"), str(tmp_path / "blobs"))
|
||||
uid = Session.make_uid("claude", "s3")
|
||||
st.ingest(_part("s3", ["user_msg", "assistant_msg"])) # seq 0,1
|
||||
st.ingest(_part("s3", ["x_unused"]) if False else _part("s3", ["thinking", "edit"])) # new 2,3
|
||||
events = {e.seq: e for e in st.get_events(uid)}
|
||||
# the 'edit' (seq 3) had parent_seq=0 within its part -> remapped to its part's first new seq (2)
|
||||
assert events[3].parent_seq == 2
|
||||
@@ -32,7 +32,7 @@ against known structures, not discovered ones.
|
||||
|
||||
```task
|
||||
id: AGENTIC-WP-0003-T01
|
||||
status: todo
|
||||
status: done
|
||||
priority: high
|
||||
state_hub_task_id: "91264fd4-ba99-4add-b317-e2320c3c932c"
|
||||
```
|
||||
@@ -52,7 +52,7 @@ order (no native DAG). Version-detect on `session_meta.cli_version`. Reuse the
|
||||
|
||||
```task
|
||||
id: AGENTIC-WP-0003-T02
|
||||
status: todo
|
||||
status: progress
|
||||
priority: high
|
||||
state_hub_task_id: "fe3d7d1c-110e-4f16-8d56-062fa4a651aa"
|
||||
```
|
||||
@@ -69,7 +69,7 @@ sessions on this workstation plus a synthetic dir fixture.
|
||||
|
||||
```task
|
||||
id: AGENTIC-WP-0003-T03
|
||||
status: todo
|
||||
status: done
|
||||
priority: medium
|
||||
state_hub_task_id: "c4acfb63-84cd-4299-a44d-91bb6857fa88"
|
||||
```
|
||||
|
||||
Reference in New Issue
Block a user