generated from coulomb/repo-seed
session-memory Phase 0: normalized schema (T01) + Claude adapter (T02)
- session_memory/core/schema.py: Session/SessionEvent/Cost dataclasses, flavor-prefixed uids, watermarks, kind/outcome validation (T01) - session_memory/adapters/claude.py: JSONL -> Normalized bundle, turn DAG via uuid/parentUuid, kind mapping, cost from message.usage (T02) - tests: schema round-trip + adapter (synthetic + real local session) Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
7
session_memory/__init__.py
Normal file
7
session_memory/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
"""Coding Session Memory — Helix Forge capture + retention layer.
|
||||
|
||||
See docs/DESIGN-session-memory.md. Importable package name uses an underscore
|
||||
(``session_memory``) where the design doc writes ``session-memory/``.
|
||||
"""
|
||||
|
||||
__all__ = ["core", "adapters"]
|
||||
1
session_memory/adapters/__init__.py
Normal file
1
session_memory/adapters/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""Per-flavor collector adapters (Tier 0 -> Tier 1 normalization)."""
|
||||
228
session_memory/adapters/claude.py
Normal file
228
session_memory/adapters/claude.py
Normal file
@@ -0,0 +1,228 @@
|
||||
"""Claude Code collector adapter — Tier 0 -> Tier 1 (design §2.1, §4.3).
|
||||
|
||||
Reads ``~/.claude/projects/<url-encoded-cwd>/<session-uuid>.jsonl`` (and
|
||||
``agent-*.jsonl`` sidechains), discriminates on the record ``type``, reconstructs
|
||||
the turn DAG via ``uuid``/``parentUuid``, and emits normalized records.
|
||||
|
||||
Returns a :class:`Normalized` bundle: the ``Session``, its ordered
|
||||
``SessionEvent`` list, and a ``blobs`` map (``payload_ref -> full text body``)
|
||||
that the store persists out-of-line so Tier 1 rows stay light.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Iterable, Optional
|
||||
|
||||
from ..core.schema import Cost, Session, SessionEvent
|
||||
|
||||
FLAVOR = "claude"
|
||||
|
||||
# tool_use names that mutate files -> kind "edit"
|
||||
_EDIT_TOOLS = {"Edit", "Write", "NotebookEdit", "MultiEdit"}
|
||||
# crude test-runner detection inside Bash commands -> kind "test_run"
|
||||
_TEST_HINTS = ("pytest", "unittest", "npm test", "npm run test", "go test", "cargo test", "jest", "vitest")
|
||||
|
||||
|
||||
@dataclass
|
||||
class Normalized:
|
||||
session: Session
|
||||
events: list[SessionEvent]
|
||||
blobs: dict[str, str] = field(default_factory=dict)
|
||||
|
||||
|
||||
def _iter_records(path: str) -> Iterable[dict[str, Any]]:
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
try:
|
||||
yield json.loads(line)
|
||||
except json.JSONDecodeError:
|
||||
continue # tolerate partial/corrupt trailing lines
|
||||
|
||||
|
||||
def _resolve_repo(cwd: Optional[str], repo_domain_map: dict[str, str]) -> tuple[Optional[str], Optional[str]]:
|
||||
"""cwd -> (repo, domain). repo is the cwd basename; domain via map."""
|
||||
if not cwd:
|
||||
return None, None
|
||||
repo = os.path.basename(cwd.rstrip("/")) or None
|
||||
domain = repo_domain_map.get(repo) if repo else None
|
||||
return repo, domain
|
||||
|
||||
|
||||
def _is_test_command(text: str) -> bool:
|
||||
low = text.lower()
|
||||
return any(h in low for h in _TEST_HINTS)
|
||||
|
||||
|
||||
def _content_blocks(message: dict[str, Any]) -> list[dict[str, Any]]:
|
||||
content = message.get("content")
|
||||
if isinstance(content, str):
|
||||
return [{"type": "text", "text": content}]
|
||||
if isinstance(content, list):
|
||||
return [b for b in content if isinstance(b, dict)]
|
||||
return []
|
||||
|
||||
|
||||
def parse_session(path: str, repo_domain_map: Optional[dict[str, str]] = None) -> Optional[Normalized]:
|
||||
"""Parse one Claude transcript file into a Normalized bundle.
|
||||
|
||||
Returns None if the file has no usable session records.
|
||||
"""
|
||||
repo_domain_map = repo_domain_map or {}
|
||||
records = list(_iter_records(path))
|
||||
if not records:
|
||||
return None
|
||||
|
||||
session_id: Optional[str] = None
|
||||
cwd = git_branch = version = model = None
|
||||
timestamps: list[str] = []
|
||||
file_is_sidechain = os.path.basename(path).startswith("agent-")
|
||||
|
||||
events: list[SessionEvent] = []
|
||||
blobs: dict[str, str] = {}
|
||||
uuid_to_seq: dict[str, int] = {}
|
||||
cost = Cost()
|
||||
seq = 0
|
||||
|
||||
def add_event(uuid: Optional[str], parent_uuid: Optional[str], ts, kind, *,
|
||||
role=None, tool=None, summary=None, body=None, tokens=0, sidechain=False):
|
||||
nonlocal seq
|
||||
s = seq
|
||||
seq += 1
|
||||
if uuid:
|
||||
uuid_to_seq[uuid] = s
|
||||
parent_seq = uuid_to_seq.get(parent_uuid) if parent_uuid else None
|
||||
payload_ref = None
|
||||
if body:
|
||||
payload_ref = f"blob://{session_id}/{s}"
|
||||
blobs[payload_ref] = body
|
||||
events.append(SessionEvent(
|
||||
session_uid=Session.make_uid(FLAVOR, session_id or "unknown"),
|
||||
seq=s, parent_seq=parent_seq, ts=ts, kind=kind, role=role, tool=tool,
|
||||
summary=(summary or "")[:300] or None, payload_ref=payload_ref,
|
||||
tokens=tokens, is_sidechain=sidechain or file_is_sidechain,
|
||||
))
|
||||
|
||||
for rec in records:
|
||||
rtype = rec.get("type")
|
||||
ts = rec.get("timestamp")
|
||||
if ts:
|
||||
timestamps.append(ts)
|
||||
session_id = session_id or rec.get("sessionId")
|
||||
cwd = cwd or rec.get("cwd")
|
||||
git_branch = git_branch or rec.get("gitBranch")
|
||||
version = version or rec.get("version")
|
||||
uuid = rec.get("uuid")
|
||||
parent = rec.get("parentUuid")
|
||||
sidechain = bool(rec.get("isSidechain"))
|
||||
|
||||
if rtype == "user":
|
||||
msg = rec.get("message", {})
|
||||
for b in _content_blocks(msg):
|
||||
bt = b.get("type")
|
||||
if bt == "tool_result":
|
||||
body = _stringify(b.get("content"))
|
||||
add_event(uuid, parent, ts, "tool_result", role="tool",
|
||||
summary="tool result", body=body, sidechain=sidechain)
|
||||
else:
|
||||
text = b.get("text", "")
|
||||
add_event(uuid, parent, ts, "user_msg", role="user",
|
||||
summary=_first_line(text), body=text, sidechain=sidechain)
|
||||
|
||||
elif rtype == "assistant":
|
||||
msg = rec.get("message", {})
|
||||
model = model or msg.get("model")
|
||||
usage = msg.get("usage") or {}
|
||||
cost.input_tokens += int(usage.get("input_tokens", 0) or 0)
|
||||
cost.output_tokens += int(usage.get("output_tokens", 0) or 0)
|
||||
cost.cache_tokens += int(
|
||||
(usage.get("cache_read_input_tokens", 0) or 0)
|
||||
+ (usage.get("cache_creation_input_tokens", 0) or 0)
|
||||
)
|
||||
out_tokens = int(usage.get("output_tokens", 0) or 0)
|
||||
for b in _content_blocks(msg):
|
||||
bt = b.get("type")
|
||||
if bt == "thinking":
|
||||
add_event(uuid, parent, ts, "thinking", role="assistant",
|
||||
summary="thinking", body=b.get("thinking", ""), sidechain=sidechain)
|
||||
elif bt == "text":
|
||||
text = b.get("text", "")
|
||||
add_event(uuid, parent, ts, "assistant_msg", role="assistant",
|
||||
summary=_first_line(text), body=text, tokens=out_tokens, sidechain=sidechain)
|
||||
elif bt == "tool_use":
|
||||
name = b.get("name", "")
|
||||
inp = b.get("input", {})
|
||||
body = _stringify(inp)
|
||||
kind = "tool_call"
|
||||
if name in _EDIT_TOOLS:
|
||||
kind = "edit"
|
||||
elif name == "Bash" and _is_test_command(_stringify(inp.get("command", ""))):
|
||||
kind = "test_run"
|
||||
add_event(uuid, parent, ts, kind, role="assistant", tool=name,
|
||||
summary=f"{name}", body=body, sidechain=sidechain)
|
||||
|
||||
elif rtype == "summary":
|
||||
add_event(uuid, parent, ts, "lifecycle", summary="summary",
|
||||
body=_stringify(rec.get("summary")), sidechain=sidechain)
|
||||
# queue-operation / ai-title / last-prompt / attachment: skipped as events
|
||||
|
||||
if session_id is None:
|
||||
return None
|
||||
|
||||
cost.turns = sum(1 for e in events if e.kind == "user_msg")
|
||||
started = min(timestamps) if timestamps else None
|
||||
ended = max(timestamps) if timestamps else None
|
||||
cost.wall_clock_s = _seconds_between(started, ended)
|
||||
|
||||
repo, domain = _resolve_repo(cwd, repo_domain_map)
|
||||
session = Session(
|
||||
session_uid=Session.make_uid(FLAVOR, session_id),
|
||||
flavor=FLAVOR,
|
||||
native_session_id=session_id,
|
||||
repo=repo, domain=domain, cwd=cwd, git_branch=git_branch,
|
||||
model=model, started_at=started, ended_at=ended,
|
||||
outcome="unknown", # outcome inference happens in the digest step (T04)
|
||||
cost=cost,
|
||||
source_path=path,
|
||||
source_bytes=os.path.getsize(path) if os.path.exists(path) else 0,
|
||||
discovered_at=_now(),
|
||||
)
|
||||
return Normalized(session=session, events=events, blobs=blobs)
|
||||
|
||||
|
||||
# ---- helpers ---------------------------------------------------------------
|
||||
|
||||
def _stringify(v: Any) -> str:
|
||||
if v is None:
|
||||
return ""
|
||||
if isinstance(v, str):
|
||||
return v
|
||||
try:
|
||||
return json.dumps(v, ensure_ascii=False)[:20000]
|
||||
except (TypeError, ValueError):
|
||||
return str(v)[:20000]
|
||||
|
||||
|
||||
def _first_line(text: str) -> str:
|
||||
return (text or "").strip().splitlines()[0] if (text or "").strip() else ""
|
||||
|
||||
|
||||
def _seconds_between(start: Optional[str], end: Optional[str]) -> float:
|
||||
if not start or not end:
|
||||
return 0.0
|
||||
try:
|
||||
a = datetime.fromisoformat(start.replace("Z", "+00:00"))
|
||||
b = datetime.fromisoformat(end.replace("Z", "+00:00"))
|
||||
return max(0.0, (b - a).total_seconds())
|
||||
except ValueError:
|
||||
return 0.0
|
||||
|
||||
|
||||
def _now() -> str:
|
||||
return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||
1
session_memory/core/__init__.py
Normal file
1
session_memory/core/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""Flavor-agnostic core: schema, store, cursor, digest, retention."""
|
||||
156
session_memory/core/schema.py
Normal file
156
session_memory/core/schema.py
Normal file
@@ -0,0 +1,156 @@
|
||||
"""Normalized session schema (Tier 1) — design doc §4.
|
||||
|
||||
Two record kinds, ``Session`` and ``SessionEvent``, plus the small enums every
|
||||
adapter targets. Field names here are the stable contract; per-flavor quirks are
|
||||
absorbed inside each adapter (see design §4.3 native -> kind mapping).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from dataclasses import asdict, dataclass, field, fields
|
||||
from typing import Any, Optional
|
||||
|
||||
SCHEMA_VERSION = 1
|
||||
|
||||
# Supported agent flavors. ``session_uid`` is always "<flavor>:<native id>".
|
||||
FLAVORS = ("claude", "codex", "grok")
|
||||
|
||||
# SessionEvent.kind universe (design §4.2 / §4.3).
|
||||
KINDS = (
|
||||
"user_msg",
|
||||
"assistant_msg",
|
||||
"thinking",
|
||||
"tool_call",
|
||||
"tool_result",
|
||||
"error",
|
||||
"test_run",
|
||||
"edit",
|
||||
"retry",
|
||||
"human_intervention",
|
||||
"decision",
|
||||
"lifecycle",
|
||||
"completion",
|
||||
)
|
||||
|
||||
# Session.outcome universe.
|
||||
OUTCOMES = ("success", "fail", "abandoned", "unknown")
|
||||
|
||||
|
||||
@dataclass
|
||||
class Cost:
|
||||
"""Token + effort accounting for a session."""
|
||||
|
||||
input_tokens: int = 0
|
||||
output_tokens: int = 0
|
||||
cache_tokens: int = 0
|
||||
wall_clock_s: float = 0.0
|
||||
turns: int = 0
|
||||
retries: int = 0
|
||||
|
||||
|
||||
@dataclass
|
||||
class Session:
|
||||
"""One bounded run of a coding agent against a repo (design §4.1)."""
|
||||
|
||||
session_uid: str # "<flavor>:<native id>" — globally unique
|
||||
flavor: str
|
||||
native_session_id: str
|
||||
repo: Optional[str] = None
|
||||
domain: Optional[str] = None
|
||||
cwd: Optional[str] = None
|
||||
git_branch: Optional[str] = None
|
||||
model: Optional[str] = None
|
||||
started_at: Optional[str] = None # ISO-8601 UTC
|
||||
ended_at: Optional[str] = None
|
||||
outcome: str = "unknown"
|
||||
cost: Cost = field(default_factory=Cost)
|
||||
task_ref: Optional[str] = None
|
||||
source_path: Optional[str] = None
|
||||
source_bytes: int = 0
|
||||
schema_version: int = SCHEMA_VERSION
|
||||
# watermarks (design §3.1): discovered -> ingested -> analyzed -> evicted
|
||||
discovered_at: Optional[str] = None
|
||||
ingested_at: Optional[str] = None
|
||||
analyzed_at: Optional[str] = None
|
||||
evicted_at: Optional[str] = None
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
if self.flavor not in FLAVORS:
|
||||
raise ValueError(f"unknown flavor {self.flavor!r}; expected one of {FLAVORS}")
|
||||
if self.outcome not in OUTCOMES:
|
||||
raise ValueError(f"unknown outcome {self.outcome!r}; expected one of {OUTCOMES}")
|
||||
expected_prefix = f"{self.flavor}:"
|
||||
if not self.session_uid.startswith(expected_prefix):
|
||||
raise ValueError(
|
||||
f"session_uid {self.session_uid!r} must start with {expected_prefix!r}"
|
||||
)
|
||||
|
||||
@property
|
||||
def is_evictable(self) -> bool:
|
||||
"""A session may be evicted from Tier 1 only once analyzed (design §3.1)."""
|
||||
return self.analyzed_at is not None and self.evicted_at is None
|
||||
|
||||
@staticmethod
|
||||
def make_uid(flavor: str, native_session_id: str) -> str:
|
||||
return f"{flavor}:{native_session_id}"
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
d = asdict(self)
|
||||
return d
|
||||
|
||||
def to_json(self) -> str:
|
||||
return json.dumps(self.to_dict(), sort_keys=True)
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, d: dict[str, Any]) -> "Session":
|
||||
d = dict(d)
|
||||
cost = d.pop("cost", None)
|
||||
obj = cls(**{k: v for k, v in d.items() if k in _SESSION_FIELDS})
|
||||
if cost is not None:
|
||||
obj.cost = Cost(**{k: v for k, v in cost.items() if k in _COST_FIELDS})
|
||||
return obj
|
||||
|
||||
@classmethod
|
||||
def from_json(cls, s: str) -> "Session":
|
||||
return cls.from_dict(json.loads(s))
|
||||
|
||||
|
||||
@dataclass
|
||||
class SessionEvent:
|
||||
"""One atomic record within a session (design §4.2)."""
|
||||
|
||||
session_uid: str
|
||||
seq: int # monotonic within session
|
||||
ts: Optional[str] = None
|
||||
kind: str = "lifecycle"
|
||||
parent_seq: Optional[int] = None # turn DAG (Claude); None for flat flavors
|
||||
role: Optional[str] = None # user|assistant|system|tool
|
||||
tool: Optional[str] = None # when kind in {tool_call, tool_result}
|
||||
summary: Optional[str] = None # short, human-readable
|
||||
payload_ref: Optional[str] = None # pointer to full body in Tier 1 blob store
|
||||
tokens: int = 0
|
||||
is_sidechain: bool = False
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
if self.kind not in KINDS:
|
||||
raise ValueError(f"unknown kind {self.kind!r}; expected one of {KINDS}")
|
||||
|
||||
def to_dict(self) -> dict[str, Any]:
|
||||
return asdict(self)
|
||||
|
||||
def to_json(self) -> str:
|
||||
return json.dumps(self.to_dict(), sort_keys=True)
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, d: dict[str, Any]) -> "SessionEvent":
|
||||
return cls(**{k: v for k, v in d.items() if k in _EVENT_FIELDS})
|
||||
|
||||
@classmethod
|
||||
def from_json(cls, s: str) -> "SessionEvent":
|
||||
return cls.from_dict(json.loads(s))
|
||||
|
||||
|
||||
_SESSION_FIELDS = {f.name for f in fields(Session)}
|
||||
_COST_FIELDS = {f.name for f in fields(Cost)}
|
||||
_EVENT_FIELDS = {f.name for f in fields(SessionEvent)}
|
||||
Reference in New Issue
Block a user