session-memory Phase 1: Codex adapter (T01) + multi-file merge (T03)

- adapters/common.py: shared Normalized + helpers (resolve_repo, classify_tool,
  jsonl iter, etc.); claude.py refactored to use it (Normalized re-exported)
- adapters/codex.py: rollout {timestamp,type,payload} parser; session_meta/
  response_item/event_msg mapping; flat call_id join; token_count cost;
  registered in ingest dispatch
- core/store.py: ingest() now merges multi-file sessions by content
  fingerprint, appends new events with offset seq (design OQ6); idempotent
- tests/test_codex_adapter.py, tests/test_merge.py

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-06 21:55:32 +02:00
parent 5aea22f24f
commit bc11cb9aec
8 changed files with 521 additions and 90 deletions

View File

@@ -12,6 +12,7 @@ Tier 2 digest — the invariant that makes budget-based retention non-lossy.
from __future__ import annotations
import hashlib
import json
import os
import re
@@ -28,6 +29,18 @@ def _now() -> str:
return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
def _fingerprint(ev: SessionEvent, body: Optional[str]) -> str:
"""Stable content fingerprint, independent of seq/payload_ref, for dedup."""
h = hashlib.sha1()
parts = [ev.ts or "", ev.kind, ev.role or "", ev.tool or "", ev.summary or "",
ev.role or "", str(ev.is_sidechain)]
h.update("\x1f".join(parts).encode("utf-8"))
if body is not None:
h.update(b"\x1e")
h.update(body.encode("utf-8"))
return h.hexdigest()
class Store:
def __init__(self, db_path: str, blob_dir: str):
self.db_path = db_path
@@ -121,14 +134,75 @@ class Store:
self.db.commit()
return total
# NOTE(review): this span is a rendered diff whose +/- markers and indentation
# were stripped, so the pre-change and post-change versions of ingest() are
# interleaved. The first def/docstring pair below looks like the removed
# version -- confirm against the real file before editing.
def ingest(self, bundle) -> None:
"""Persist a full Normalized bundle (session + events + blobs)."""
# Post-change signature: returns the number of newly written events.
def ingest(self, bundle) -> int:
"""Persist a Normalized bundle, merging into any existing session.
Multiple files can map to one ``session_uid`` (Claude resume/sidechains;
Grok multi-file dirs). Events are de-duplicated by content fingerprint and
genuinely-new events are appended with offset ``seq`` (design OQ6 / T03).
Returns the number of new events written. Idempotent: re-ingesting the
same bundle adds nothing.
"""
s = bundle.session
# NOTE(review): the next five statements (ingested_at default, upsert_session,
# upsert_events, write_blobs) presumably belong to the removed version. If
# they ran unconditionally, every bundle event would be stored before the
# fingerprint filter below -- confirm they are gone from the real file.
if s.ingested_at is None:
s.ingested_at = _now()
self.upsert_session(s)
self.upsert_events(bundle.events)
self.write_blobs(s.session_uid, bundle.blobs)
# The session row is written only when no row exists yet for this
# session_uid; an existing row is left untouched by this method.
existing = self.get_session(s.session_uid)
if existing is None:
if s.ingested_at is None:
s.ingested_at = _now()
self.upsert_session(s)
# known fingerprints + current max seq for this session
seen = self._event_fingerprints(s.session_uid)
# _max_seq returns -1 for a session with no events, so numbering starts at 0.
next_seq = self._max_seq(s.session_uid) + 1
new_events: list[SessionEvent] = []
new_blobs: dict[str, str] = {}
# Maps a bundle-local seq to the seq assigned in the store; only events that
# are actually written get an entry.
old_to_new: dict[int, int] = {}
for ev in bundle.events:
# body is None when the event has no payload_ref or the bundle carries no
# blob under that ref.
body = bundle.blobs.get(ev.payload_ref) if ev.payload_ref else None
fp = _fingerprint(ev, body)
if fp in seen:
continue # already stored (prior file or prior sweep)
new_seq = next_seq
next_seq += 1
old_to_new[ev.seq] = new_seq
# remap parent within this bundle; cross-file parents become None
# A parent that was skipped above as already-seen has no old_to_new entry
# either, so its child is also stored with parent_seq=None.
parent = old_to_new.get(ev.parent_seq) if ev.parent_seq is not None else None
ref = None
if body is not None:
# The blob ref is rebuilt from the new seq rather than reusing the
# bundle's payload_ref.
ref = f"blob://{s.session_uid}/{new_seq}"
new_blobs[ref] = body
merged = SessionEvent(
session_uid=s.session_uid, seq=new_seq, parent_seq=parent, ts=ev.ts,
kind=ev.kind, role=ev.role, tool=ev.tool, summary=ev.summary,
payload_ref=ref, tokens=ev.tokens, is_sidechain=ev.is_sidechain,
)
new_events.append(merged)
# Recording the fingerprint here also drops later events in the same bundle
# that have identical content.
seen.add(fp)
if new_events:
self.upsert_events(new_events)
self.write_blobs(s.session_uid, new_blobs)
return len(new_events)
def _max_seq(self, session_uid: str) -> int:
row = self.db.execute(
"SELECT COALESCE(MAX(seq), -1) m FROM events WHERE session_uid=?", (session_uid,)
).fetchone()
return int(row["m"])
def _event_fingerprints(self, session_uid: str) -> set[str]:
    """Return the content fingerprints of every event stored for *session_uid*.

    Each stored event is fingerprinted together with its blob body, which is
    looked up by ``payload_ref`` in the ``blobs`` table and read from the
    recorded path. If the event has no ref, the ref has no blob row, or the
    file cannot be opened or read (``OSError``), the event is fingerprinted
    without a body.
    """

    def stored_body(ref):
        # Best-effort read of the blob text behind ``ref``; None when unavailable.
        if not ref:
            return None
        row = self.db.execute("SELECT path FROM blobs WHERE ref=?", (ref,)).fetchone()
        if not row:
            return None
        try:
            with open(row["path"], "r", encoding="utf-8") as handle:
                return handle.read()
        except OSError:
            return None

    return {
        _fingerprint(stored, stored_body(stored.payload_ref))
        for stored in self.get_events(session_uid)
    }
# ---- Tier 2 (digest) ---------------------------------------------------