Files
state-hub/api/services/token_sources/claude.py

121 lines
4.2 KiB
Python

from __future__ import annotations
import json
from datetime import datetime
from pathlib import Path
from typing import Any
from api.services.token_sources.base import TokenSourceRecord, parse_iso
# Version tag passed as ``parser_version`` on every TokenSourceRecord this module builds.
PARSER_VERSION = "claude-transcript-v1"
def iter_claude_transcript_files(claude_home: Path) -> list[Path]:
    """List the transcript files stored under a Claude home directory.

    Args:
        claude_home: Directory expected to contain a ``projects`` subdirectory.

    Returns:
        Every ``*.jsonl`` path found at any depth below
        ``claude_home / "projects"``, in sorted order. The list is empty when
        that subdirectory does not exist or is not a directory.
    """
    transcripts_root = claude_home.joinpath("projects")
    if transcripts_root.is_dir():
        found = list(transcripts_root.glob("**/*.jsonl"))
        found.sort()
        return found
    return []
def _usage_from_entry(entry: dict[str, Any]) -> dict[str, Any]:
    """Locate the usage mapping attached to a transcript entry.

    The nested ``entry["message"]["usage"]`` is returned whenever it is a
    dict (even an empty one). Otherwise the top-level ``entry["usage"]`` is
    returned if it is a dict. When neither applies, an empty dict is returned.
    """
    envelope = entry.get("message")
    nested = envelope.get("usage") if isinstance(envelope, dict) else None
    if isinstance(nested, dict):
        return nested

    top_level = entry.get("usage")
    if isinstance(top_level, dict):
        return top_level
    return {}
def parse_claude_transcript(path: Path, since: datetime) -> TokenSourceRecord | None:
    """Aggregate token usage from one Claude transcript (JSONL) file.

    Every line is decoded as JSON. Session id, working directory, model and
    the first/last timestamps are tracked across all object lines. Token
    counts are summed only from entries that carry a usage mapping and whose
    timestamp is not earlier than ``since``; entries without a parseable
    timestamp are always counted.

    Lines that are not valid JSON, that decode to something other than a JSON
    object, or whose usage counts cannot be converted to integers are skipped
    and tallied under ``malformed_lines`` in the record metadata.

    Args:
        path: Transcript file to read; its stem is the fallback session id.
        since: Lower bound for the timestamps of counted usage entries.
            NOTE(review): must be comparable (naive vs. aware) with whatever
            ``parse_iso`` returns — confirm against callers.

    Returns:
        A ``TokenSourceRecord`` holding the summed counts, or ``None`` when
        the file cannot be opened or yields no non-zero usage.
    """
    session_id = path.stem
    cwd: str | None = None
    model: str | None = None
    first_at: datetime | None = None
    last_at: datetime | None = None
    tokens_in = tokens_out = 0
    cached_input_tokens = 0
    raw_total_tokens = 0
    usage_records = 0
    malformed_lines = 0

    try:
        handle = path.open("r", encoding="utf-8", errors="ignore")
    except OSError:
        return None

    with handle:
        for line in handle:
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                malformed_lines += 1
                continue
            if not isinstance(entry, dict):
                # Valid JSON that is not an object (list, number, string,
                # null) has no fields to read; skip it instead of failing on
                # ``.get`` and losing the rest of the file.
                malformed_lines += 1
                continue

            ts = entry.get("timestamp") or entry.get("created_at")
            parsed_ts = parse_iso(ts) if isinstance(ts, str) else None
            if parsed_ts:
                first_at = first_at or parsed_ts
                # The latest timestamp seen in file order wins.
                last_at = parsed_ts

            # Later entries override earlier values; missing fields keep them.
            session_id = str(
                entry.get("session_id") or entry.get("conversation_id") or session_id
            )
            cwd = entry.get("cwd") or entry.get("project_cwd") or cwd
            model = entry.get("model") or model
            message = entry.get("message")
            if isinstance(message, dict):
                model = message.get("model") or model

            usage = _usage_from_entry(entry)
            if not usage:
                continue
            if parsed_ts is not None and parsed_ts < since:
                continue

            try:
                input_tokens = int(usage.get("input_tokens") or 0)
                cache_creation = int(usage.get("cache_creation_input_tokens") or 0)
                cache_read = int(usage.get("cache_read_input_tokens") or 0)
                output_tokens = int(usage.get("output_tokens") or 0)
            except (TypeError, ValueError, OverflowError):
                # A non-numeric count makes this entry unusable; one bad
                # entry should not abort the whole transcript.
                malformed_lines += 1
                continue

            if input_tokens == 0 and output_tokens == 0 and cache_creation == 0 and cache_read == 0:
                continue

            tokens_in += input_tokens
            tokens_out += output_tokens
            cached_input_tokens += cache_creation + cache_read
            raw_total_tokens += input_tokens + cache_creation + cache_read + output_tokens
            usage_records += 1

    if usage_records == 0 or tokens_in + tokens_out + cached_input_tokens == 0:
        return None

    return TokenSourceRecord(
        source_provider="claude_transcript",
        source_id=f"claude:{session_id}",
        source_path=path,
        source_created_at=last_at,
        session_id=session_id,
        cwd=cwd,
        model=model,
        agent="claude",
        tokens_in=tokens_in,
        tokens_out=tokens_out,
        cached_input_tokens=cached_input_tokens,
        raw_total_tokens=raw_total_tokens or None,
        parser_version=PARSER_VERSION,
        confidence=1.0,
        raw_metadata={
            "started_at": first_at.isoformat() if first_at else None,
            "usage_records": usage_records,
            "malformed_lines": malformed_lines,
            "source_file_name": path.name,
        },
    )
def collect_claude_transcripts(claude_home: Path, since: datetime) -> list[TokenSourceRecord]:
    """Parse every transcript under ``claude_home`` into de-duplicated records.

    Files that parse to ``None`` are ignored. When several files produce the
    same ``source_id``, the record with the strictly larger ``tokens_total``
    is kept, so the first one seen wins ties.

    Args:
        claude_home: Directory searched by ``iter_claude_transcript_files``.
        since: Passed through to ``parse_claude_transcript``; its ``tzinfo``
            is also applied to the fallback sort key.

    Returns:
        The surviving records ordered by ``source_created_at``. A record
        without one is keyed as ``datetime.min`` carrying ``since``'s tzinfo.
    """
    best_by_source: dict[str, TokenSourceRecord] = {}
    for transcript_path in iter_claude_transcript_files(claude_home):
        record = parse_claude_transcript(transcript_path, since)
        if record is None:
            continue
        incumbent = best_by_source.get(record.source_id)
        if incumbent is not None and not (record.tokens_total > incumbent.tokens_total):
            # The record already held is at least as large; keep it.
            continue
        best_by_source[record.source_id] = record

    def _created_at_key(record: TokenSourceRecord) -> datetime:
        # Undated records fall back to the earliest representable datetime.
        return record.source_created_at or datetime.min.replace(tzinfo=since.tzinfo)

    ordered = list(best_by_source.values())
    ordered.sort(key=_created_at_key)
    return ordered