generated from coulomb/repo-seed
session-memory Phase 1: Codex adapter (T01) + multi-file merge (T03)
- adapters/common.py: shared Normalized + helpers (resolve_repo, classify_tool,
jsonl iter, etc.); claude.py refactored to use it (Normalized re-exported)
- adapters/codex.py: rollout {timestamp,type,payload} parser; session_meta/
response_item/event_msg mapping; flat call_id join; token_count cost;
registered in ingest dispatch
- core/store.py: ingest() now merges multi-file sessions by content
fingerprint, appends new events with offset seq (design OQ6); idempotent
- tests/test_codex_adapter.py, tests/test_merge.py
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -12,6 +12,7 @@ Tier 2 digest — the invariant that makes budget-based retention non-lossy.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
@@ -28,6 +29,18 @@ def _now() -> str:
|
||||
return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||
|
||||
|
||||
def _fingerprint(ev: SessionEvent, body: Optional[str]) -> str:
    """Return a stable SHA-1 content fingerprint of an event, for dedup.

    The digest covers the event's timestamp, kind, role, tool, summary and
    sidechain flag, plus the payload body when one is supplied. ``seq`` and
    ``payload_ref`` are deliberately excluded so the same event hashes
    identically no matter which file it came from or where it was stored.

    Args:
        ev: Event to fingerprint; only the fields listed above are read.
        body: Payload text belonging to the event, or ``None`` if it has none.

    Returns:
        The 40-character hexadecimal SHA-1 digest.
    """
    fields = (
        ev.ts or "",
        ev.kind or "",
        ev.role or "",  # hashed once; the original listed role twice
        ev.tool or "",
        ev.summary or "",
        # Normalise to bool first: an integer flag (0/1) would otherwise
        # stringify differently from a Python bool and never match.
        str(bool(ev.is_sidechain)),
    )
    digest = hashlib.sha1("\x1f".join(fields).encode("utf-8"))
    if body is not None:
        # The separator keeps "no body" distinct from "empty body".
        digest.update(b"\x1e")
        digest.update(body.encode("utf-8"))
    return digest.hexdigest()
|
||||
|
||||
|
||||
class Store:
|
||||
def __init__(self, db_path: str, blob_dir: str):
|
||||
self.db_path = db_path
|
||||
@@ -121,14 +134,75 @@ class Store:
|
||||
self.db.commit()
|
||||
return total
|
||||
|
||||
def ingest(self, bundle) -> int:
    """Persist a Normalized bundle, merging into any existing session.

    Multiple files can map to one ``session_uid`` (Claude resume/sidechains;
    Grok multi-file dirs). Events are de-duplicated by content fingerprint and
    genuinely-new events are appended with offset ``seq`` (design OQ6 / T03).

    Args:
        bundle: Object exposing ``session``, ``events`` and ``blobs`` (a
            mapping from payload ref to body text).

    Returns:
        The number of new events written. Idempotent: re-ingesting the
        same bundle adds nothing.
    """
    s = bundle.session
    uid = s.session_uid

    # Only the first file seen for a session creates the session row.
    if self.get_session(uid) is None:
        if s.ingested_at is None:
            s.ingested_at = _now()
        self.upsert_session(s)

    # Known fingerprints + current max seq for this session.
    seen = self._event_fingerprints(uid)
    next_seq = self._max_seq(uid) + 1

    new_events: list[SessionEvent] = []
    new_blobs: dict[str, str] = {}
    old_to_new: dict[int, int] = {}  # bundle-local seq -> stored seq
    for ev in bundle.events:
        body = bundle.blobs.get(ev.payload_ref) if ev.payload_ref else None
        fp = _fingerprint(ev, body)
        if fp in seen:
            continue  # already stored (prior file or prior sweep)

        new_seq = next_seq
        next_seq += 1
        old_to_new[ev.seq] = new_seq

        # Remap parent within this bundle; cross-file parents become None.
        # Only events appended in this call are in the map, so a parent that
        # was skipped above as a duplicate also resolves to None.
        parent = old_to_new.get(ev.parent_seq) if ev.parent_seq is not None else None

        ref = None
        if body is not None:
            # Blob refs are re-keyed to the event's new position.
            ref = f"blob://{uid}/{new_seq}"
            new_blobs[ref] = body

        new_events.append(
            SessionEvent(
                session_uid=uid, seq=new_seq, parent_seq=parent, ts=ev.ts,
                kind=ev.kind, role=ev.role, tool=ev.tool, summary=ev.summary,
                payload_ref=ref, tokens=ev.tokens, is_sidechain=ev.is_sidechain,
            )
        )
        # Also suppresses repeats of the same content later in this bundle.
        seen.add(fp)

    if new_events:
        self.upsert_events(new_events)
        self.write_blobs(uid, new_blobs)
    return len(new_events)
|
||||
|
||||
def _max_seq(self, session_uid: str) -> int:
    """Return the highest ``seq`` stored for *session_uid*, or -1 if it has no events."""
    cursor = self.db.execute(
        "SELECT COALESCE(MAX(seq), -1) m FROM events WHERE session_uid=?", (session_uid,)
    )
    # COALESCE guarantees exactly one row with a non-NULL "m" column.
    highest = cursor.fetchone()["m"]
    return int(highest)
|
||||
|
||||
def _event_fingerprints(self, session_uid: str) -> set[str]:
    """Collect the content fingerprints of every event stored for a session.

    Each stored event is hashed together with its payload body, which is
    read back from the file path recorded in the ``blobs`` table. An event
    whose blob row is missing or whose file cannot be opened is hashed as
    if it had no body.
    """

    def stored_body(ref):
        # Resolve a payload ref to its text, or None when unavailable.
        if not ref:
            return None
        row = self.db.execute("SELECT path FROM blobs WHERE ref=?", (ref,)).fetchone()
        if not row:
            return None
        try:
            with open(row["path"], "r", encoding="utf-8") as fh:
                return fh.read()
        except OSError:
            # Best effort: an unreadable blob is treated as absent.
            return None

    return {
        _fingerprint(event, stored_body(event.payload_ref))
        for event in self.get_events(session_uid)
    }
|
||||
|
||||
# ---- Tier 2 (digest) ---------------------------------------------------
|
||||
|
||||
|
||||
Reference in New Issue
Block a user