generated from coulomb/repo-seed
session-memory Phase 0: ingest cursor + sweep entrypoint + config (T06)
- session_memory/core/cursor.py: size/mtime change detection sidecar
- session_memory/config.toml: store paths, retention caps, per-source globs (claude on, codex/grok off for Phase 1), repo->domain map
- session_memory/ingest.py: discover->normalize->store->digest->evict; --dry-run creates/writes nothing; python -m session_memory.ingest
- tests/test_ingest.py; live dry-run parsed 84/85 real local sessions

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
39
session_memory/config.toml
Normal file
39
session_memory/config.toml
Normal file
@@ -0,0 +1,39 @@
|
||||
# Coding Session Memory — configuration (design §5.1, §8).
|
||||
# Paths support ~ expansion. Edit caps to taste; see docs/DESIGN-session-memory.md.
|
||||
|
||||
[store]
|
||||
# Local store lives under the repo by default (gitignored).
|
||||
db_path = "session_memory/.store/mem.db"
|
||||
blob_dir = "session_memory/.store/blobs"
|
||||
cursor = "session_memory/.store/cursors.json"
|
||||
|
||||
[retention]
|
||||
raw_soft_cap_bytes = 4294967296 # 4 GiB — begin evicting analyzed sessions above this
|
||||
raw_hard_cap_bytes = 6442450944 # 6 GiB — absolute Tier 1 ceiling
|
||||
raw_max_age_days = 45 # backstop: analyzed raw older than this is evictable
|
||||
distilled_cap_bytes = 1073741824 # 1 GiB — Tier 2 ceiling (alert, never auto-drop)
|
||||
cadence = "daily" # sweep trigger: daily | weekly | on-hook
|
||||
|
||||
[sources.claude]
|
||||
enabled = true
|
||||
root = "~/.claude/projects"
|
||||
# glob, relative to root; covers sessions and agent-* sidechains
|
||||
glob = "*/*.jsonl"
|
||||
|
||||
# Codex / Grok adapters land in Phase 1 (schemas confirmed in the design doc).
|
||||
[sources.codex]
|
||||
enabled = false
|
||||
root = "~/.codex/sessions"
|
||||
glob = "*/*/*/rollout-*.jsonl"
|
||||
|
||||
[sources.grok]
|
||||
enabled = false
|
||||
root = "~/.grok/sessions"
|
||||
glob = "*/*/chat_history.jsonl"
|
||||
|
||||
# cwd basename -> domain slug. Used to tag sessions with their Custodian domain.
|
||||
[repo_domain_map]
|
||||
agentic-resources = "helix_forge"
|
||||
the-custodian = "custodian"
|
||||
state-hub = "custodian"
|
||||
ops-bridge = "custodian"
|
||||
49
session_memory/core/cursor.py
Normal file
49
session_memory/core/cursor.py
Normal file
@@ -0,0 +1,49 @@
|
||||
"""Per-source ingest cursors (design §6; T06).
|
||||
|
||||
Tracks ``(path -> size, mtime)`` so sweeps re-ingest only changed/grown files.
|
||||
Persisted as a small JSON sidecar. Ingest itself is idempotent on
|
||||
``(session_uid, seq)`` in the store, so the cursor is an optimization, not a
|
||||
correctness requirement — a lost cursor just means a full (still-idempotent)
|
||||
re-scan.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
from typing import Optional
|
||||
|
||||
|
||||
class Cursors:
    """JSON-backed record of the last-seen ``size``/``mtime`` of each file.

    The mapping is ``file_path -> {"size": int, "mtime": float}``. It is held
    in memory and only written to disk by :meth:`save`. Any problem loading
    the sidecar (missing, unreadable, corrupt, or wrong top-level shape)
    results in an empty cursor set, so every file is reported as changed.
    """

    def __init__(self, path: str):
        """Load cursors from the JSON sidecar at ``path``, if usable.

        Args:
            path: Location of the sidecar file; it does not need to exist.
        """
        self.path = path
        self._data: dict[str, dict] = {}
        try:
            with open(path, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except (OSError, ValueError):
            # Missing, unreadable or undecodable sidecar: start empty.
            return
        # Valid JSON that is not an object (e.g. a list) cannot serve as the
        # path mapping; ignore it instead of failing later on ``.get``.
        if isinstance(loaded, dict):
            self._data = loaded

    def is_changed(self, file_path: str) -> bool:
        """Return True if the file is new or its size/mtime differ from the record.

        Returns False when the file cannot be stat'ed (e.g. it no longer
        exists). A recorded entry that is not a mapping is treated as absent.
        """
        try:
            stat = os.stat(file_path)
        except OSError:
            return False
        prev = self._data.get(file_path)
        if not isinstance(prev, dict):
            return True
        return prev.get("size") != stat.st_size or prev.get("mtime") != stat.st_mtime

    def mark(self, file_path: str) -> None:
        """Record the file's size and mtime as observed at the time of this call.

        Does nothing if the file cannot be stat'ed. The change is in-memory
        only until :meth:`save` is called.
        """
        try:
            stat = os.stat(file_path)
        except OSError:
            return
        self._data[file_path] = {"size": stat.st_size, "mtime": stat.st_mtime}

    def save(self) -> None:
        """Write the cursors to ``self.path``, creating parent directories.

        The data is written to ``<path>.tmp`` first and then moved into place
        with ``os.replace`` so a reader never sees a half-written sidecar.

        Raises:
            OSError: If the directory or file cannot be created or replaced.
        """
        os.makedirs(os.path.dirname(self.path) or ".", exist_ok=True)
        tmp = self.path + ".tmp"
        with open(tmp, "w", encoding="utf-8") as f:
            json.dump(self._data, f)
        os.replace(tmp, self.path)
|
||||
128
session_memory/ingest.py
Normal file
128
session_memory/ingest.py
Normal file
@@ -0,0 +1,128 @@
|
||||
"""Session-memory sweep entrypoint (design §7; T06).
|
||||
|
||||
One sweep: discover (per enabled source) -> normalize (adapter) -> store ->
|
||||
digest -> retention-evict. Idempotent and re-runnable; intended to be triggered
|
||||
on the configured cadence (``/schedule`` daily/weekly) or by an agent hook.
|
||||
|
||||
Usage:
|
||||
python -m session_memory.ingest [--config PATH] [--once] [--dry-run]
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import glob
|
||||
import os
|
||||
import sys
|
||||
import tomllib
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
from .adapters import claude as claude_adapter
|
||||
from .core import digest as digest_mod
|
||||
from .core.cursor import Cursors
|
||||
from .core.retention import RetentionConfig, sweep as retention_sweep
|
||||
from .core.store import Store
|
||||
|
||||
# Source name (a key under [sources] in the config) -> session parser callable.
_ADAPTERS = {
    "claude": claude_adapter.parse_session,
}
|
||||
|
||||
|
||||
@dataclass
class SweepResult:
    """Counters and diagnostics accumulated over one ``run_sweep`` call."""

    # Files matched by the globs of enabled sources that have an adapter.
    discovered: int = 0
    # Files that parsed to a bundle (counted in dry-run as well).
    ingested: int = 0
    # Files skipped because the cursor reported them unchanged.
    skipped_unchanged: int = 0
    # Sessions passed through digest analysis after being stored.
    analyzed: int = 0
    # Human-readable, non-fatal problems encountered during the sweep.
    warnings: list[str] = field(default_factory=list)
    # Result object of the retention sweep; stays None when it did not run.
    retention: Any = None
|
||||
|
||||
|
||||
def _expand(p: str) -> str:
|
||||
return os.path.expanduser(p)
|
||||
|
||||
|
||||
def load_config(path: str) -> dict[str, Any]:
|
||||
with open(path, "rb") as f:
|
||||
return tomllib.load(f)
|
||||
|
||||
|
||||
def run_sweep(config: dict[str, Any], *, dry_run: bool = False) -> SweepResult:
    """Run one sweep: discover -> parse -> store -> digest -> retention.

    For every enabled source that has an adapter, files matching the source's
    glob are parsed if the cursor reports them changed, then ingested into the
    store and analyzed. After all sources are processed the cursors are saved
    and the retention sweep runs.

    Args:
        config: Parsed configuration with ``store``, ``retention``,
            ``sources`` and ``repo_domain_map`` tables.
        dry_run: When true, only discovery and parsing happen: no ``Store`` is
            constructed, nothing is ingested, the cursors are not saved and
            retention is skipped. Parsed files still count as ``ingested``.

    Returns:
        A ``SweepResult`` with the counters, warnings and (outside dry-run)
        the retention result.

    Raises:
        KeyError: If a required key (``store.cursor``, ``store.db_path``,
            ``store.blob_dir``, or a source's ``root``/``glob``) is missing.
        Exception: Errors from the store, digest or retention steps propagate;
            the store is closed before they do.
    """
    store_cfg = config.get("store", {})
    ret_cfg = config.get("retention", {})
    repo_map = config.get("repo_domain_map", {})
    res = SweepResult()

    # In dry-run we only discover + parse: no store is created or written.
    store = None if dry_run else Store(_expand(store_cfg["db_path"]), _expand(store_cfg["blob_dir"]))
    try:
        cursors = Cursors(_expand(store_cfg["cursor"]))

        for name, src in config.get("sources", {}).items():
            if not src.get("enabled"):
                continue
            parse = _ADAPTERS.get(name)
            if parse is None:
                res.warnings.append(f"no adapter for source {name!r} (Phase 1)")
                continue
            # Escape the root so glob metacharacters in the directory path are
            # matched literally; only the configured glob acts as a pattern.
            pattern = os.path.join(glob.escape(_expand(src["root"])), src["glob"])
            for fp in sorted(glob.glob(pattern)):
                res.discovered += 1
                if not cursors.is_changed(fp):
                    res.skipped_unchanged += 1
                    continue
                try:
                    bundle = parse(fp, repo_map)
                except Exception as e:  # one bad file must not abort the sweep
                    # Not marked in the cursor, so it is retried next sweep.
                    res.warnings.append(f"parse failed {fp}: {e}")
                    continue
                if bundle is None:
                    # Nothing to ingest; remember the file so it is skipped
                    # until its size/mtime changes.
                    cursors.mark(fp)
                    continue
                if store is not None:
                    store.ingest(bundle)
                    digest_mod.analyze(store, bundle.session.session_uid)
                    res.analyzed += 1
                res.ingested += 1
                cursors.mark(fp)

        if store is not None:
            cursors.save()
            # Missing retention keys fall back to RetentionConfig's defaults.
            rc = RetentionConfig(
                raw_soft_cap_bytes=int(ret_cfg.get("raw_soft_cap_bytes", RetentionConfig.raw_soft_cap_bytes)),
                raw_hard_cap_bytes=int(ret_cfg.get("raw_hard_cap_bytes", RetentionConfig.raw_hard_cap_bytes)),
                raw_max_age_days=int(ret_cfg.get("raw_max_age_days", RetentionConfig.raw_max_age_days)),
                distilled_cap_bytes=int(ret_cfg.get("distilled_cap_bytes", RetentionConfig.distilled_cap_bytes)),
            )
            res.retention = retention_sweep(store, rc, analyze_fn=digest_mod.analyze)
            res.warnings.extend(res.retention.warnings)
    finally:
        # Release the store even when ingest, digest or retention raised.
        if store is not None:
            store.close()
    return res
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: run a single sweep and print a summary.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        Always ``0``. Warnings are reported on stderr and do not change the
        exit status.
    """
    package_dir = os.path.dirname(os.path.abspath(__file__))
    default_config = os.path.join(package_dir, "config.toml")

    parser = argparse.ArgumentParser(description="Run one coding-session-memory sweep.")
    parser.add_argument("--config", default=default_config)
    parser.add_argument("--dry-run", action="store_true", help="discover + parse, but do not write or evict")
    # Accepted for explicitness only; a single sweep is what always happens.
    parser.add_argument("--once", action="store_true", help="(default) run a single sweep")
    opts = parser.parse_args(argv)

    result = run_sweep(load_config(opts.config), dry_run=opts.dry_run)

    summary = (
        f"discovered={result.discovered} ingested={result.ingested} "
        f"skipped_unchanged={result.skipped_unchanged} analyzed={result.analyzed}"
    )
    print(summary)

    ret = result.retention
    if ret is not None:
        retention_line = (
            f"retention: freed={ret.bytes_freed}B final_usage={ret.final_usage_bytes}B "
            f"backstop={len(ret.backstop_evicted)} budget={len(ret.budget_evicted)} "
            f"overflow_analyzed={len(ret.overflow_analyzed)} data_loss={len(ret.overflow_data_loss)}"
        )
        print(retention_line)

    for warning in result.warnings:
        print(f" WARN: {warning}", file=sys.stderr)
    return 0
|
||||
|
||||
|
||||
# Script entry: propagate main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
Reference in New Issue
Block a user