generated from coulomb/repo-seed
Fixed and improved token tracking
This commit is contained in:
16
api/services/token_sources/__init__.py
Normal file
16
api/services/token_sources/__init__.py
Normal file
@@ -0,0 +1,16 @@
|
||||
"""Token source adapters for measured agent usage."""
|
||||
|
||||
from api.services.token_sources.base import TokenSourceRecord, parse_iso
|
||||
from api.services.token_sources.codex import collect_codex_sessions, iter_codex_session_files, parse_codex_session
|
||||
from api.services.token_sources.claude import collect_claude_transcripts, iter_claude_transcript_files, parse_claude_transcript
|
||||
|
||||
# Public names re-exported by this package.
__all__ = [
    # Shared record type and timestamp helper (imported from .base).
    "TokenSourceRecord",
    "parse_iso",
    # Codex session adapter (imported from .codex).
    "collect_codex_sessions",
    "iter_codex_session_files",
    "parse_codex_session",
    # Claude transcript adapter (imported from .claude).
    "collect_claude_transcripts",
    "iter_claude_transcript_files",
    "parse_claude_transcript",
]
|
||||
171
api/services/token_sources/attribution.py
Normal file
171
api/services/token_sources/attribution.py
Normal file
@@ -0,0 +1,171 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class RepoRef:
    """Immutable description of a known repository used for attribution.

    Only ``repo_id`` and ``slug`` are required; every other field defaults to
    ``None``.
    """

    repo_id: str
    slug: str
    # Checkout location; used (after normalisation) for path matching.
    local_path: str | None = None
    # Additional checkout locations; only the mapping's values are used.
    host_paths: dict[str, Any] | None = None
    # Compared after URL normalisation.
    remote_url: str | None = None
    # Compared with the fingerprint computed for a working directory.
    git_fingerprint: str | None = None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class RepoMatch:
    """Immutable result of attributing a working directory to a repository."""

    repo_id: str
    slug: str
    # Name of the matching strategy that produced this result.
    method: str
    # Score attached to that strategy.
    confidence: float
|
||||
|
||||
|
||||
def normalise_cwd(raw: str | None) -> str | None:
    """Convert a recorded working directory into a POSIX-style path.

    Backslashes become forward slashes. A known WSL UNC prefix is removed so
    the remainder is rooted at ``/``. A Windows drive path ``X:/...`` is mapped
    to ``/mnt/x/...``. Anything else is returned with only the slash
    replacement applied.

    Args:
        raw: The path as recorded, or ``None``.

    Returns:
        The normalised path, or ``None`` when *raw* is ``None`` or empty.
    """
    if not raw:
        return None
    value = raw.replace("\\", "/")
    wsl_prefixes = (
        "//wsl.localhost/Ubuntu-24.04",
        "//wsl$/Ubuntu-24.04",
    )
    for prefix in wsl_prefixes:
        if not value.startswith(prefix):
            continue
        remainder = value[len(prefix):]
        # Strip only when the prefix ends on a path boundary; otherwise a
        # share named e.g. "Ubuntu-24.04-dev" would be cut to "-dev/...".
        if not remainder or remainder.startswith("/"):
            return remainder or "/"
    # Drive paths need a real drive letter before the ":/".
    if len(value) >= 3 and value[1:3] == ":/" and value[0].isalpha():
        return f"/mnt/{value[0].lower()}{value[2:]}"
    return value
|
||||
|
||||
|
||||
def normalise_remote_url(raw: str | None) -> str | None:
    """Reduce a git remote URL to a canonical, comparable form.

    Surrounding whitespace and trailing slashes are removed, a trailing
    ``.git`` is dropped, scp-style ``git@host:path`` is rewritten to
    ``ssh://host/path``, and the result is lower-cased.

    Args:
        raw: Remote URL as configured, or ``None``.

    Returns:
        The canonical URL, or ``None`` when *raw* is ``None``, empty, or
        contains nothing but whitespace and slashes.
    """
    if not raw:
        return None
    # Trailing slashes go first so "repo.git/" and "repo.git" normalise alike.
    value = raw.strip().rstrip("/")
    if value[-4:].lower() == ".git":
        value = value[:-4]
    if value.startswith("git@") and ":" in value:
        host, path = value[4:].split(":", 1)
        value = f"ssh://{host}/{path}"
    return value.lower().rstrip("/") or None
|
||||
|
||||
|
||||
def repo_refs_from_api(repos: list[dict[str, Any]]) -> list[RepoRef]:
    """Build ``RepoRef`` objects from repository dictionaries.

    Entries lacking a truthy ``id`` or ``slug`` are skipped. ``id`` and
    ``slug`` are converted to ``str``; ``host_paths`` is kept only when it is a
    ``dict`` and replaced by an empty dict otherwise.
    """
    converted: list[RepoRef] = []
    for item in repos:
        identifier, slug = item.get("id"), item.get("slug")
        if identifier and slug:
            raw_host_paths = item.get("host_paths")
            converted.append(
                RepoRef(
                    repo_id=str(identifier),
                    slug=str(slug),
                    local_path=item.get("local_path"),
                    host_paths=raw_host_paths if isinstance(raw_host_paths, dict) else {},
                    remote_url=item.get("remote_url"),
                    git_fingerprint=item.get("git_fingerprint"),
                )
            )
    return converted
|
||||
|
||||
|
||||
def _git(cwd: str, *args: str) -> str | None:
    """Run ``git <args>`` inside *cwd* and return the first line it prints.

    Returns ``None`` when the process cannot be started, exceeds the 5 second
    timeout, exits with a non-zero status, or produces no output.
    """
    command = ["git", *args]
    try:
        completed = subprocess.run(
            command,
            cwd=cwd,
            check=False,
            capture_output=True,
            text=True,
            timeout=5,
        )
    except (OSError, subprocess.SubprocessError):
        return None
    if completed.returncode != 0:
        return None
    output_lines = completed.stdout.strip().splitlines()
    if not output_lines:
        return None
    return output_lines[0]
|
||||
|
||||
|
||||
def git_fingerprint_for_path(cwd: str | None) -> str | None:
    """Return a fingerprint for the git repository that contains *cwd*.

    The fingerprint is the first line printed by
    ``git rev-list --max-parents=0 HEAD`` run at the repository top level.
    ``None`` is returned when the normalised path is empty or does not exist,
    or when either git command fails.
    """
    directory = normalise_cwd(cwd)
    if directory and Path(directory).exists():
        toplevel = _git(directory, "rev-parse", "--show-toplevel")
        if toplevel:
            return _git(toplevel, "rev-list", "--max-parents=0", "HEAD")
    return None
|
||||
|
||||
|
||||
def git_remote_for_path(cwd: str | None) -> str | None:
    """Return the ``origin`` remote URL of the git repository containing *cwd*.

    The URL is read with ``git remote get-url origin`` at the repository top
    level. ``None`` is returned when the normalised path is empty or does not
    exist, or when either git command fails.
    """
    directory = normalise_cwd(cwd)
    if directory and Path(directory).exists():
        toplevel = _git(directory, "rev-parse", "--show-toplevel")
        if toplevel:
            return _git(toplevel, "remote", "get-url", "origin")
    return None
|
||||
|
||||
|
||||
def _repo_paths(repo: RepoRef) -> list[str]:
    """List the normalised filesystem paths registered for *repo*.

    ``local_path`` comes first, followed by every truthy value of
    ``host_paths``. Empty entries and the placeholder ``"(unknown)"`` are
    dropped; trailing slashes are removed from the rest.
    """
    candidates = [repo.local_path]
    if repo.host_paths:
        candidates += [str(entry) for entry in repo.host_paths.values() if entry]

    normalised = []
    for candidate in candidates:
        if not candidate or candidate == "(unknown)":
            continue
        cleaned = normalise_cwd(str(candidate))
        if cleaned:
            normalised.append(cleaned.rstrip("/"))
    return normalised
|
||||
|
||||
|
||||
def resolve_repo(cwd: str | None, repos: list[RepoRef]) -> RepoMatch | None:
    """Attribute a working directory to one of the known repositories.

    Strategies are tried in order and the first unambiguous hit wins:

    * ``git_fingerprint`` (0.98): exactly one repo has the same fingerprint.
    * ``git_fingerprint_remote`` (0.99): several repos share the fingerprint
      and exactly one of them also has the same normalised remote URL.
    * ``remote_url`` (0.90): exactly one repo has the same normalised remote.
    * ``path_exact_slug`` (0.85): a registered path equals the directory and
      the repo slug equals the directory's base name.
    * ``path_exact`` (0.80): a registered path equals the directory.
    * ``path_prefix`` (0.75): the directory lies below a registered path; the
      longest such path is chosen.

    Args:
        cwd: Working directory as recorded by the agent, or ``None``.
        repos: Candidate repositories.

    Returns:
        The match, or ``None`` when nothing matched.
    """
    path = normalise_cwd(cwd)
    fingerprint = git_fingerprint_for_path(path)
    remote = normalise_remote_url(git_remote_for_path(path))

    if fingerprint:
        candidates = [repo for repo in repos if repo.git_fingerprint == fingerprint]
        if len(candidates) == 1:
            repo = candidates[0]
            return RepoMatch(repo.repo_id, repo.slug, "git_fingerprint", 0.98)
        if remote:
            # Several repos share the fingerprint: disambiguate by remote.
            remote_candidates = [
                repo for repo in candidates
                if normalise_remote_url(repo.remote_url) == remote
            ]
            if len(remote_candidates) == 1:
                repo = remote_candidates[0]
                return RepoMatch(repo.repo_id, repo.slug, "git_fingerprint_remote", 0.99)

    if remote:
        candidates = [repo for repo in repos if normalise_remote_url(repo.remote_url) == remote]
        if len(candidates) == 1:
            repo = candidates[0]
            return RepoMatch(repo.repo_id, repo.slug, "remote_url", 0.90)

    if not path:
        return None

    # Registered paths have their trailing slashes removed, so do the same for
    # the directory; otherwise "/repo/" would only ever be a prefix match.
    lookup = path.rstrip("/") or "/"

    path_matches: list[tuple[str, RepoRef]] = []
    for repo in repos:
        for repo_path in _repo_paths(repo):
            if lookup == repo_path or lookup.startswith(f"{repo_path}/"):
                path_matches.append((repo_path, repo))
    if not path_matches:
        return None

    # Longest registered path first, so the most specific prefix wins.
    path_matches.sort(key=lambda item: len(item[0]), reverse=True)
    exact = [item for item in path_matches if lookup == item[0]]
    if exact:
        basename = Path(lookup).name
        for _, repo in exact:
            if repo.slug == basename:
                return RepoMatch(repo.repo_id, repo.slug, "path_exact_slug", 0.85)
        repo = exact[0][1]
        return RepoMatch(repo.repo_id, repo.slug, "path_exact", 0.80)
    repo = path_matches[0][1]
    return RepoMatch(repo.repo_id, repo.slug, "path_prefix", 0.75)
|
||||
71
api/services/token_sources/base.py
Normal file
71
api/services/token_sources/base.py
Normal file
@@ -0,0 +1,71 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
def parse_iso(value: str) -> datetime:
    """Parse an ISO-8601 date or datetime string into an aware UTC datetime.

    A trailing ``Z`` is read as ``+00:00``. A bare date is taken to mean
    midnight UTC. A value without an offset is assumed to already be in UTC;
    a value with an offset is converted to UTC.

    Args:
        value: Text such as ``2024-01-01``, ``2024-01-01T12:00:00Z`` or
            ``2024-01-01 12:00:00+02:00``.

    Returns:
        The corresponding timezone-aware datetime in UTC.

    Raises:
        ValueError: If ``datetime.fromisoformat`` rejects the text.
    """
    raw = value.strip()
    if raw.endswith(("Z", "z")):
        raw = raw[:-1] + "+00:00"
    # Only a bare date (at most 10 characters) lacks a time part. Testing for
    # a missing "T" instead would corrupt space-separated datetimes.
    if len(raw) <= 10:
        raw = f"{raw}T00:00:00+00:00"
    parsed = datetime.fromisoformat(raw)
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=timezone.utc)
    return parsed.astimezone(timezone.utc)
|
||||
|
||||
|
||||
@dataclass
class TokenSourceRecord:
    """Token usage measured from one source file, plus its provenance."""

    source_provider: str
    source_id: str
    source_path: Path
    source_created_at: datetime | None
    session_id: str | None = None
    cwd: str | None = None
    model: str | None = None
    agent: str | None = None
    tokens_in: int = 0
    tokens_out: int = 0
    cached_input_tokens: int = 0
    reasoning_output_tokens: int = 0
    raw_total_tokens: int | None = None
    parser_version: str | None = None
    confidence: float = 1.0
    raw_metadata: dict[str, Any] = field(default_factory=dict)

    @property
    def tokens_total(self) -> int:
        """Input plus output tokens; cached and reasoning counts are not added."""
        return self.tokens_in + self.tokens_out

    def to_token_event_payload(self, repo_id: str | None = None) -> dict[str, Any]:
        """Render this record as a token-event dictionary.

        ``raw_total_tokens`` falls back to ``tokens_in + tokens_out`` when it
        is unset. ``created_at`` and ``source_created_at`` both carry the ISO
        form of ``source_created_at`` (or ``None``).
        """
        stamp = None
        if self.source_created_at:
            stamp = self.source_created_at.isoformat()

        if self.raw_total_tokens is None:
            total = self.tokens_in + self.tokens_out
        else:
            total = self.raw_total_tokens

        payload: dict[str, Any] = {
            "tokens_in": self.tokens_in,
            "tokens_out": self.tokens_out,
            "repo_id": repo_id,
            "session_id": self.session_id,
            "model": self.model,
            "agent": self.agent,
            "ref_type": "session",
            "ref_id": self.source_id,
            "note": f"measured:{self.source_provider}",
            "created_at": stamp,
            "measurement_kind": "measured",
            "source_provider": self.source_provider,
            "source_id": self.source_id,
            "source_path": str(self.source_path),
            "source_created_at": stamp,
            "parser_version": self.parser_version,
            "confidence": self.confidence,
            "cached_input_tokens": self.cached_input_tokens,
            "reasoning_output_tokens": self.reasoning_output_tokens,
            "raw_total_tokens": total,
            "raw_metadata": self.raw_metadata,
        }
        return payload
|
||||
120
api/services/token_sources/claude.py
Normal file
120
api/services/token_sources/claude.py
Normal file
@@ -0,0 +1,120 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from api.services.token_sources.base import TokenSourceRecord, parse_iso
|
||||
|
||||
# Parser revision tag; stored as ``parser_version`` on records built from Claude transcripts.
PARSER_VERSION = "claude-transcript-v1"
|
||||
|
||||
|
||||
def iter_claude_transcript_files(claude_home: Path) -> list[Path]:
    """Return every ``*.jsonl`` file below ``<claude_home>/projects``, sorted.

    The search is recursive. An empty list is returned when the ``projects``
    directory does not exist.
    """
    projects_dir = claude_home / "projects"
    return sorted(projects_dir.glob("**/*.jsonl")) if projects_dir.is_dir() else []
|
||||
|
||||
|
||||
def _usage_from_entry(entry: dict[str, Any]) -> dict[str, Any]:
    """Pick the usage mapping out of a transcript entry.

    ``entry["message"]["usage"]`` is preferred, then ``entry["usage"]``. An
    empty dict is returned when neither is a dict.
    """
    nested = entry.get("message")
    if isinstance(nested, dict):
        nested_usage = nested.get("usage")
        if isinstance(nested_usage, dict):
            return nested_usage
    top_level = entry.get("usage")
    if isinstance(top_level, dict):
        return top_level
    return {}
|
||||
|
||||
|
||||
def parse_claude_transcript(path: Path, since: datetime) -> TokenSourceRecord | None:
    """Aggregate the token usage recorded in one Claude transcript file.

    The file is read line by line as JSON. Lines that are not valid JSON, or
    that decode to something other than an object, are counted in
    ``raw_metadata["malformed_lines"]`` and skipped. Session id, cwd and model
    are taken from the most recent entry that provides them. The session id
    starts out as the file stem.

    Usage is summed over entries whose timestamp is not earlier than *since*.
    Entries with a missing or unparseable timestamp are also counted.

    Args:
        path: Transcript file to read.
        since: Lower bound for usage entries.

    Returns:
        One record for the file, or ``None`` when the file cannot be opened or
        no non-zero usage was counted.
    """
    session_id = path.stem
    cwd: str | None = None
    model: str | None = None
    first_at: datetime | None = None
    last_at: datetime | None = None
    tokens_in = tokens_out = 0
    cached_input_tokens = 0
    raw_total_tokens = 0
    usage_records = 0
    malformed_lines = 0

    try:
        handle = path.open("r", encoding="utf-8", errors="ignore")
    except OSError:
        return None

    with handle:
        for line in handle:
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                malformed_lines += 1
                continue
            if not isinstance(entry, dict):
                # Valid JSON but not an object (e.g. a list or a number);
                # it has no fields to read, so treat it like a bad line.
                malformed_lines += 1
                continue

            ts = entry.get("timestamp") or entry.get("created_at")
            parsed_ts: datetime | None = None
            if isinstance(ts, str):
                try:
                    parsed_ts = parse_iso(ts)
                except ValueError:
                    # One bad timestamp must not abort the whole file.
                    parsed_ts = None
            if parsed_ts:
                first_at = first_at or parsed_ts
                last_at = parsed_ts

            session_id = str(entry.get("session_id") or entry.get("conversation_id") or session_id)
            cwd = entry.get("cwd") or entry.get("project_cwd") or cwd
            model = entry.get("model") or model
            message = entry.get("message")
            if isinstance(message, dict):
                # A model named inside the message overrides the entry-level one.
                model = message.get("model") or model

            usage = _usage_from_entry(entry)
            if not usage:
                continue
            if parsed_ts is not None and parsed_ts < since:
                continue

            input_tokens = int(usage.get("input_tokens") or 0)
            cache_creation = int(usage.get("cache_creation_input_tokens") or 0)
            cache_read = int(usage.get("cache_read_input_tokens") or 0)
            output_tokens = int(usage.get("output_tokens") or 0)
            if input_tokens == 0 and output_tokens == 0 and cache_creation == 0 and cache_read == 0:
                continue
            tokens_in += input_tokens
            tokens_out += output_tokens
            cached_input_tokens += cache_creation + cache_read
            raw_total_tokens += input_tokens + cache_creation + cache_read + output_tokens
            usage_records += 1

    if usage_records == 0 or tokens_in + tokens_out + cached_input_tokens == 0:
        return None

    return TokenSourceRecord(
        source_provider="claude_transcript",
        source_id=f"claude:{session_id}",
        source_path=path,
        source_created_at=last_at,
        session_id=session_id,
        cwd=cwd,
        model=model,
        agent="claude",
        tokens_in=tokens_in,
        tokens_out=tokens_out,
        cached_input_tokens=cached_input_tokens,
        raw_total_tokens=raw_total_tokens or None,
        parser_version=PARSER_VERSION,
        confidence=1.0,
        raw_metadata={
            "started_at": first_at.isoformat() if first_at else None,
            "usage_records": usage_records,
            "malformed_lines": malformed_lines,
            "source_file_name": path.name,
        },
    )
|
||||
|
||||
|
||||
def collect_claude_transcripts(claude_home: Path, since: datetime) -> list[TokenSourceRecord]:
    """Parse every Claude transcript under *claude_home* into records.

    When several files yield the same ``source_id``, the record with the
    larger ``tokens_total`` is kept (the earlier one wins a tie). The result
    is sorted by ``source_created_at``; records without one sort first.
    """
    best: dict[str, TokenSourceRecord] = {}
    for transcript in iter_claude_transcript_files(claude_home):
        record = parse_claude_transcript(transcript, since)
        if record is None:
            continue
        existing = best.get(record.source_id)
        if existing is not None and not record.tokens_total > existing.tokens_total:
            continue
        best[record.source_id] = record

    def _sort_key(item: TokenSourceRecord) -> datetime:
        # Records lacking a timestamp fall back to the minimum datetime.
        return item.source_created_at or datetime.min.replace(tzinfo=since.tzinfo)

    return sorted(best.values(), key=_sort_key)
|
||||
124
api/services/token_sources/codex.py
Normal file
124
api/services/token_sources/codex.py
Normal file
@@ -0,0 +1,124 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from api.services.token_sources.base import TokenSourceRecord, parse_iso
|
||||
|
||||
# Parser revision tag; stored as ``parser_version`` on records built from Codex sessions.
PARSER_VERSION = "codex-desktop-v1"
|
||||
|
||||
|
||||
def iter_codex_session_files(codex_home: Path) -> list[Path]:
    """List Codex session files found under *codex_home*.

    Files matching ``sessions/*/*/*/*.jsonl`` come first, followed by
    ``archived_sessions/*.jsonl``. Each group is sorted on its own, and a
    missing directory contributes nothing.
    """
    sources = (
        (codex_home / "sessions", "*/*/*/*.jsonl"),
        (codex_home / "archived_sessions", "*.jsonl"),
    )
    found: list[Path] = []
    for directory, pattern in sources:
        if directory.is_dir():
            found.extend(sorted(directory.glob(pattern)))
    return found
|
||||
|
||||
|
||||
def parse_codex_session(path: Path, since: datetime) -> TokenSourceRecord | None:
    """Aggregate the token usage recorded in one Codex session file.

    The file is read line by line as JSON. Lines that are not valid JSON, or
    that decode to something other than an object, are counted in
    ``raw_metadata["malformed_lines"]`` and skipped.

    * ``session_meta`` entries supply the session id, cwd and start time.
    * ``turn_context`` entries supply cwd and model.
    * ``event_msg`` entries whose payload type is ``token_count`` add their
      ``last_token_usage`` figures, but only when the entry has a parseable
      timestamp that is not earlier than *since*.

    The session id defaults to the file stem without a ``rollout-`` prefix.

    Args:
        path: Session file to read.
        since: Lower bound for token-count entries.

    Returns:
        One record for the file, or ``None`` when the file cannot be opened or
        no non-zero usage was counted.
    """
    session_id = path.stem.removeprefix("rollout-")
    started_at: datetime | None = None
    last_at: datetime | None = None
    cwd: str | None = None
    model: str | None = None
    tokens_in = tokens_out = 0
    cached_input_tokens = reasoning_output_tokens = 0
    raw_total_tokens = 0
    usage_records = 0
    malformed_lines = 0

    try:
        handle = path.open("r", encoding="utf-8", errors="ignore")
    except OSError:
        return None

    with handle:
        for line in handle:
            try:
                entry = json.loads(line)
            except json.JSONDecodeError:
                malformed_lines += 1
                continue
            if not isinstance(entry, dict):
                # Valid JSON but not an object; nothing to read from it.
                malformed_lines += 1
                continue

            ts = entry.get("timestamp")
            parsed_ts: datetime | None = None
            if isinstance(ts, str):
                try:
                    parsed_ts = parse_iso(ts)
                except ValueError:
                    # One bad timestamp must not abort the whole file.
                    parsed_ts = None
            if parsed_ts:
                last_at = parsed_ts
                started_at = started_at or parsed_ts

            payload = entry.get("payload") or {}
            if not isinstance(payload, dict):
                payload = {}

            entry_type = entry.get("type")
            if entry_type == "session_meta":
                meta_id = payload.get("id")
                if meta_id:
                    session_id = str(meta_id)
                cwd = payload.get("cwd") or cwd
                meta_ts = payload.get("timestamp")
                if isinstance(meta_ts, str):
                    try:
                        # The session's own start time overrides the first line's.
                        started_at = parse_iso(meta_ts)
                    except ValueError:
                        pass
            elif entry_type == "turn_context":
                cwd = payload.get("cwd") or cwd
                model = payload.get("model") or model
            elif entry_type == "event_msg" and payload.get("type") == "token_count":
                if parsed_ts is None or parsed_ts < since:
                    continue
                info = payload.get("info") or {}
                if not isinstance(info, dict):
                    continue
                last = info.get("last_token_usage") or {}
                if not isinstance(last, dict):
                    continue
                input_tokens = int(last.get("input_tokens") or 0)
                output_tokens = int(last.get("output_tokens") or 0)
                if input_tokens == 0 and output_tokens == 0:
                    continue
                tokens_in += input_tokens
                tokens_out += output_tokens
                cached_input_tokens += int(last.get("cached_input_tokens") or 0)
                reasoning_output_tokens += int(last.get("reasoning_output_tokens") or 0)
                # Fall back to input + output when no total is reported.
                raw_total_tokens += int(last.get("total_tokens") or (input_tokens + output_tokens))
                usage_records += 1

    if usage_records == 0 or tokens_in + tokens_out == 0:
        return None

    return TokenSourceRecord(
        source_provider="codex_session",
        source_id=f"codex:{session_id}",
        source_path=path,
        source_created_at=last_at,
        session_id=session_id,
        cwd=cwd,
        model=model,
        agent="codex",
        tokens_in=tokens_in,
        tokens_out=tokens_out,
        cached_input_tokens=cached_input_tokens,
        reasoning_output_tokens=reasoning_output_tokens,
        raw_total_tokens=raw_total_tokens or None,
        parser_version=PARSER_VERSION,
        confidence=1.0,
        raw_metadata={
            "started_at": started_at.isoformat() if started_at else None,
            "usage_records": usage_records,
            "malformed_lines": malformed_lines,
            "source_file_name": path.name,
        },
    )
|
||||
|
||||
|
||||
def collect_codex_sessions(codex_home: Path, since: datetime) -> list[TokenSourceRecord]:
    """Parse every Codex session file under *codex_home* into records.

    When several files yield the same ``source_id``, the record with the
    larger ``tokens_total`` is kept (the earlier one wins a tie). The result
    is sorted by ``source_created_at``; records without one sort first.
    """
    best: dict[str, TokenSourceRecord] = {}
    for session_file in iter_codex_session_files(codex_home):
        record = parse_codex_session(session_file, since)
        if record is None:
            continue
        existing = best.get(record.source_id)
        if existing is not None and not record.tokens_total > existing.tokens_total:
            continue
        best[record.source_id] = record

    def _sort_key(item: TokenSourceRecord) -> datetime:
        # Records lacking a timestamp fall back to the minimum datetime.
        return item.source_created_at or datetime.min.replace(tzinfo=since.tzinfo)

    return sorted(best.values(), key=_sort_key)
|
||||
Reference in New Issue
Block a user