from __future__ import annotations import json from datetime import datetime from pathlib import Path from typing import Any from api.services.token_sources.base import TokenSourceRecord, parse_iso PARSER_VERSION = "claude-transcript-v1" def iter_claude_transcript_files(claude_home: Path) -> list[Path]: projects = claude_home / "projects" if not projects.is_dir(): return [] return sorted(projects.glob("**/*.jsonl")) def _usage_from_entry(entry: dict[str, Any]) -> dict[str, Any]: message = entry.get("message") if isinstance(message, dict) and isinstance(message.get("usage"), dict): return message["usage"] usage = entry.get("usage") return usage if isinstance(usage, dict) else {} def parse_claude_transcript(path: Path, since: datetime) -> TokenSourceRecord | None: session_id = path.stem cwd: str | None = None model: str | None = None first_at: datetime | None = None last_at: datetime | None = None tokens_in = tokens_out = 0 cached_input_tokens = 0 raw_total_tokens = 0 usage_records = 0 malformed_lines = 0 try: handle = path.open("r", encoding="utf-8", errors="ignore") except OSError: return None with handle: for line in handle: try: entry: dict[str, Any] = json.loads(line) except json.JSONDecodeError: malformed_lines += 1 continue ts = entry.get("timestamp") or entry.get("created_at") parsed_ts = parse_iso(ts) if isinstance(ts, str) else None if parsed_ts: first_at = first_at or parsed_ts last_at = parsed_ts session_id = str(entry.get("session_id") or entry.get("conversation_id") or session_id) cwd = entry.get("cwd") or entry.get("project_cwd") or cwd model = entry.get("model") or model message = entry.get("message") if isinstance(message, dict): model = message.get("model") or model usage = _usage_from_entry(entry) if not usage: continue if parsed_ts is not None and parsed_ts < since: continue input_tokens = int(usage.get("input_tokens") or 0) cache_creation = int(usage.get("cache_creation_input_tokens") or 0) cache_read = int(usage.get("cache_read_input_tokens") or 0) output_tokens = int(usage.get("output_tokens") or 0) if input_tokens == 0 and output_tokens == 0 and cache_creation == 0 and cache_read == 0: continue tokens_in += input_tokens tokens_out += output_tokens cached_input_tokens += cache_creation + cache_read raw_total_tokens += input_tokens + cache_creation + cache_read + output_tokens usage_records += 1 if usage_records == 0 or tokens_in + tokens_out + cached_input_tokens == 0: return None return TokenSourceRecord( source_provider="claude_transcript", source_id=f"claude:{session_id}", source_path=path, source_created_at=last_at, session_id=session_id, cwd=cwd, model=model, agent="claude", tokens_in=tokens_in, tokens_out=tokens_out, cached_input_tokens=cached_input_tokens, raw_total_tokens=raw_total_tokens or None, parser_version=PARSER_VERSION, confidence=1.0, raw_metadata={ "started_at": first_at.isoformat() if first_at else None, "usage_records": usage_records, "malformed_lines": malformed_lines, "source_file_name": path.name, }, ) def collect_claude_transcripts(claude_home: Path, since: datetime) -> list[TokenSourceRecord]: by_id: dict[str, TokenSourceRecord] = {} for path in iter_claude_transcript_files(claude_home): parsed = parse_claude_transcript(path, since) if parsed is None: continue current = by_id.get(parsed.source_id) if current is None or parsed.tokens_total > current.tokens_total: by_id[parsed.source_id] = parsed return sorted(by_id.values(), key=lambda item: item.source_created_at or datetime.min.replace(tzinfo=since.tzinfo))