Fixed and improved token tracking

2026-05-23 13:59:05 +02:00
parent dd3279ea1a
commit c12091c2eb
29 changed files with 3549 additions and 278 deletions
--- a/api/services/token_sources/__init__.py
+++ b/api/services/token_sources/__init__.py
@@ -0,0 +1,16 @@
+"""Token source adapters for measured agent usage."""
+
+from api.services.token_sources.base import TokenSourceRecord, parse_iso
+from api.services.token_sources.codex import collect_codex_sessions, iter_codex_session_files, parse_codex_session
+from api.services.token_sources.claude import collect_claude_transcripts, iter_claude_transcript_files, parse_claude_transcript
+
+# Names re-exported by this module; each one is imported from a sibling module above.
+__all__ = [
+    "TokenSourceRecord",
+    "parse_iso",
+    "collect_codex_sessions",
+    "iter_codex_session_files",
+    "parse_codex_session",
+    "collect_claude_transcripts",
+    "iter_claude_transcript_files",
+    "parse_claude_transcript",
+]
--- a/api/services/token_sources/attribution.py
+++ b/api/services/token_sources/attribution.py
@@ -0,0 +1,171 @@
+from __future__ import annotations
+
+import subprocess
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+
+
+@dataclass(frozen=True)
+class RepoRef:
+    """Immutable description of one known repository, used as a match candidate.
+
+    Instances are built by ``repo_refs_from_api`` and consumed by ``resolve_repo``.
+    """
+
+    # Repository identifier, stored as a string.
+    repo_id: str
+    # Short repository name; resolve_repo also compares it with a directory basename.
+    slug: str
+    # Primary checkout location; may be None or the placeholder "(unknown)".
+    local_path: str | None = None
+    # Additional checkout locations; only the values are read (as paths).
+    # NOTE(review): keys are presumably host names - confirm against the API.
+    host_paths: dict[str, Any] | None = None
+    # Remote URL; compared only after normalise_remote_url().
+    remote_url: str | None = None
+    # Compared for equality with the value returned by git_fingerprint_for_path().
+    git_fingerprint: str | None = None
+
+
+@dataclass(frozen=True)
+class RepoMatch:
+    """Result of attributing a working directory to a repository."""
+
+    # Identifier and slug copied from the matched RepoRef.
+    repo_id: str
+    slug: str
+    # Label of the rule that matched, e.g. "git_fingerprint", "remote_url", "path_prefix".
+    method: str
+    # Score assigned by resolve_repo; its rules use values from 0.75 to 0.99.
+    confidence: float
+
+
+def normalise_cwd(raw: str | None) -> str | None:
+    """Convert a Windows-style or WSL UNC working directory into a POSIX path.
+
+    Backslashes become forward slashes, the two known WSL UNC prefixes for the
+    ``Ubuntu-24.04`` distribution are stripped, and drive-letter paths such as
+    ``C:/Users`` are mapped to ``/mnt/c/Users``. Any other value is returned
+    with only the slash conversion applied.
+
+    Args:
+        raw: Path as recorded by the agent, or None.
+
+    Returns:
+        The normalised path, or None when *raw* is None or empty.
+    """
+    if not raw:
+        return None
+    value = raw.replace("\\", "/")
+    prefixes = (
+        "//wsl.localhost/Ubuntu-24.04",
+        "//wsl$/Ubuntu-24.04",
+    )
+    for prefix in prefixes:
+        # Strip the prefix only at a path-component boundary, so a sibling
+        # distribution such as "Ubuntu-24.04-dev" is not mistaken for this one.
+        if value == prefix or value.startswith(f"{prefix}/"):
+            return value[len(prefix):] or "/"
+    # Drive-letter paths: require a real ASCII letter before ":/".
+    if (
+        len(value) >= 3
+        and value[1:3] == ":/"
+        and value[0].isascii()
+        and value[0].isalpha()
+    ):
+        drive = value[0].lower()
+        return f"/mnt/{drive}{value[2:]}"
+    return value
+
+
+def normalise_remote_url(raw: str | None) -> str | None:
+    """Reduce a git remote URL to a canonical lower-case form for comparison.
+
+    Surrounding whitespace, trailing slashes and a trailing ``.git`` are
+    removed, and scp-style ``git@host:path`` is rewritten to
+    ``ssh://host/path``. Other schemes are kept as they are, so an HTTPS and
+    an SSH URL of the same repository still normalise to different strings.
+
+    Args:
+        raw: Remote URL, or None.
+
+    Returns:
+        The canonical URL, or None when *raw* is None, empty or blank.
+    """
+    if not raw:
+        return None
+    # Drop trailing slashes first so "repo.git/" also loses its ".git" suffix.
+    value = raw.strip().rstrip("/")
+    if value.endswith(".git"):
+        value = value[:-4]
+    if value.startswith("git@") and ":" in value:
+        host, path = value[4:].split(":", 1)
+        value = f"ssh://{host}/{path}"
+    value = value.lower().rstrip("/")
+    # A blank input leaves nothing to compare; report it as "no remote".
+    return value or None
+
+
+def repo_refs_from_api(repos: list[dict[str, Any]]) -> list[RepoRef]:
+    """Build ``RepoRef`` objects from repository dictionaries.
+
+    Entries with a missing or falsy ``id`` or ``slug`` are skipped. A
+    ``host_paths`` value that is not a dict is replaced by an empty dict.
+
+    Args:
+        repos: Repository dictionaries; the keys read are ``id``, ``slug``,
+            ``local_path``, ``host_paths``, ``remote_url`` and
+            ``git_fingerprint``.
+
+    Returns:
+        One ``RepoRef`` per accepted entry, in input order.
+    """
+    collected: list[RepoRef] = []
+    for item in repos:
+        identifier, slug = item.get("id"), item.get("slug")
+        if not (identifier and slug):
+            continue
+        host_paths = item.get("host_paths")
+        if not isinstance(host_paths, dict):
+            host_paths = {}
+        ref = RepoRef(
+            repo_id=str(identifier),
+            slug=str(slug),
+            local_path=item.get("local_path"),
+            host_paths=host_paths,
+            remote_url=item.get("remote_url"),
+            git_fingerprint=item.get("git_fingerprint"),
+        )
+        collected.append(ref)
+    return collected
+
+
+def _git(cwd: str, *args: str) -> str | None:
+    """Run ``git <args>`` in *cwd* and return the first line of its output.
+
+    Returns None when the process cannot be started, exceeds the 5 second
+    timeout, exits with a non-zero status, or prints nothing.
+    """
+    command = ["git", *args]
+    try:
+        completed = subprocess.run(
+            command,
+            cwd=cwd,
+            check=False,
+            capture_output=True,
+            text=True,
+            timeout=5,
+        )
+    except (OSError, subprocess.SubprocessError):
+        # Covers a missing git binary, a missing cwd and the timeout.
+        return None
+    if completed.returncode != 0:
+        return None
+    lines = completed.stdout.strip().splitlines()
+    if not lines:
+        return None
+    return lines[0]
+
+
+def git_fingerprint_for_path(cwd: str | None) -> str | None:
+    """Return a fingerprint for the git repository that contains *cwd*.
+
+    The fingerprint is the first line printed by
+    ``git rev-list --max-parents=0 HEAD``, run from the repository top level.
+    None is returned when *cwd* is empty, does not exist, is not inside a git
+    work tree, or either git call fails.
+    """
+    location = normalise_cwd(cwd)
+    if location and Path(location).exists():
+        toplevel = _git(location, "rev-parse", "--show-toplevel")
+        if toplevel:
+            return _git(toplevel, "rev-list", "--max-parents=0", "HEAD")
+    return None
+
+
+def git_remote_for_path(cwd: str | None) -> str | None:
+    """Return the ``origin`` remote URL of the git repository containing *cwd*.
+
+    None is returned when *cwd* is empty, does not exist, is not inside a git
+    work tree, or ``git remote get-url origin`` fails.
+    """
+    location = normalise_cwd(cwd)
+    if location and Path(location).exists():
+        toplevel = _git(location, "rev-parse", "--show-toplevel")
+        if toplevel:
+            return _git(toplevel, "remote", "get-url", "origin")
+    return None
+
+
+def _repo_paths(repo: RepoRef) -> list[str]:
+    """List the normalised filesystem paths registered for *repo*.
+
+    ``local_path`` comes first, followed by the truthy values of
+    ``host_paths``. Empty values and the placeholder ``"(unknown)"`` are
+    dropped; each remaining path is passed through ``normalise_cwd`` and has
+    its trailing slashes removed.
+    """
+    candidates = [repo.local_path]
+    if repo.host_paths:
+        candidates += [str(entry) for entry in repo.host_paths.values() if entry]
+    normalised: list[str] = []
+    for candidate in candidates:
+        if not candidate or candidate == "(unknown)":
+            continue
+        cleaned = normalise_cwd(str(candidate))
+        if cleaned:
+            normalised.append(cleaned.rstrip("/"))
+    return normalised
+
+
+def resolve_repo(cwd: str | None, repos: list[RepoRef]) -> RepoMatch | None:
+    """Attribute a working directory to one of the known repositories.
+
+    Rules are tried in this order and the first unambiguous hit wins:
+
+    1. git fingerprint equal to exactly one repo (0.98), or several repos
+       share the fingerprint and the remote URL singles one out (0.99)
+    2. remote URL equal to exactly one repo (0.90)
+    3. a registered path equal to *cwd* (0.85 when the repo slug equals the
+       directory name, otherwise 0.80)
+    4. the longest registered path that is a parent of *cwd* (0.75)
+
+    Args:
+        cwd: Working directory as recorded by the agent; Windows and WSL UNC
+            forms are accepted.
+        repos: Candidate repositories.
+
+    Returns:
+        The match, or None when no rule identifies a repository.
+    """
+    path = normalise_cwd(cwd)
+    if path:
+        # Registered paths are compared without a trailing slash, so strip it
+        # here too; otherwise "/repo/" could only ever be a prefix match.
+        path = path.rstrip("/") or "/"
+    fingerprint = git_fingerprint_for_path(path)
+    remote = normalise_remote_url(git_remote_for_path(path))
+
+    if fingerprint:
+        candidates = [repo for repo in repos if repo.git_fingerprint == fingerprint]
+        if len(candidates) == 1:
+            repo = candidates[0]
+            return RepoMatch(repo.repo_id, repo.slug, "git_fingerprint", 0.98)
+        if remote:
+            # Several repos share this fingerprint; let the remote URL decide.
+            remote_candidates = [
+                repo for repo in candidates
+                if normalise_remote_url(repo.remote_url) == remote
+            ]
+            if len(remote_candidates) == 1:
+                repo = remote_candidates[0]
+                return RepoMatch(repo.repo_id, repo.slug, "git_fingerprint_remote", 0.99)
+
+    if remote:
+        candidates = [repo for repo in repos if normalise_remote_url(repo.remote_url) == remote]
+        if len(candidates) == 1:
+            repo = candidates[0]
+            return RepoMatch(repo.repo_id, repo.slug, "remote_url", 0.90)
+
+    if not path:
+        return None
+
+    path_matches: list[tuple[str, RepoRef]] = []
+    for repo in repos:
+        for repo_path in _repo_paths(repo):
+            if path == repo_path or path.startswith(f"{repo_path}/"):
+                path_matches.append((repo_path, repo))
+    if not path_matches:
+        return None
+    # Longest registered path first, so the most specific parent wins.
+    path_matches.sort(key=lambda item: len(item[0]), reverse=True)
+    exact = [item for item in path_matches if path == item[0]]
+    if exact:
+        basename = Path(path).name
+        for _, repo in exact:
+            if repo.slug == basename:
+                return RepoMatch(repo.repo_id, repo.slug, "path_exact_slug", 0.85)
+        repo = exact[0][1]
+        return RepoMatch(repo.repo_id, repo.slug, "path_exact", 0.80)
+    repo = path_matches[0][1]
+    return RepoMatch(repo.repo_id, repo.slug, "path_prefix", 0.75)
--- a/api/services/token_sources/base.py
+++ b/api/services/token_sources/base.py
@@ -0,0 +1,71 @@
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+
+def parse_iso(value: str) -> datetime:
+    """Parse an ISO-8601 date or datetime string into an aware UTC datetime.
+
+    A trailing ``Z`` is read as UTC. A bare date is taken as midnight UTC.
+    A datetime without an offset is assumed to be UTC already. Both ``T``
+    and a space are accepted as the date/time separator.
+
+    Args:
+        value: Text to parse; surrounding whitespace is ignored.
+
+    Returns:
+        The moment expressed in UTC.
+
+    Raises:
+        ValueError: If the text is not a date or datetime that
+            ``datetime.fromisoformat`` accepts.
+    """
+    raw = value.strip()
+    if raw.endswith(("Z", "z")):
+        raw = raw[:-1] + "+00:00"
+    # Only a bare date lacks a time part. A value such as
+    # "2024-01-02 03:04:05" already has one, so nothing must be appended.
+    if not any(separator in raw for separator in ("T", "t", " ")):
+        raw = f"{raw}T00:00:00+00:00"
+    parsed = datetime.fromisoformat(raw)
+    if parsed.tzinfo is None:
+        parsed = parsed.replace(tzinfo=timezone.utc)
+    return parsed.astimezone(timezone.utc)
+
+
+@dataclass
+class TokenSourceRecord:
+    """Token usage measured from one source file (one agent session)."""
+
+    # Name of the adapter that produced the record.
+    source_provider: str
+    # Identifier of the record; also emitted as "ref_id" in the payload.
+    source_id: str
+    # File the usage was read from.
+    source_path: Path
+    # Timestamp reported as both "created_at" and "source_created_at".
+    source_created_at: datetime | None
+    session_id: str | None = None
+    cwd: str | None = None
+    model: str | None = None
+    agent: str | None = None
+    tokens_in: int = 0
+    tokens_out: int = 0
+    cached_input_tokens: int = 0
+    reasoning_output_tokens: int = 0
+    # Total reported by the source; None means "derive it from in + out".
+    raw_total_tokens: int | None = None
+    parser_version: str | None = None
+    confidence: float = 1.0
+    raw_metadata: dict[str, Any] = field(default_factory=dict)
+
+    @property
+    def tokens_total(self) -> int:
+        """Sum of input and output tokens."""
+        return self.tokens_in + self.tokens_out
+
+    def to_token_event_payload(self, repo_id: str | None = None) -> dict[str, Any]:
+        """Render the record as a token-event dictionary.
+
+        Args:
+            repo_id: Repository to attribute the usage to, if known.
+
+        Returns:
+            A dict whose ``raw_total_tokens`` falls back to input plus output
+            tokens when the record carries no explicit total.
+        """
+        stamp = self.source_created_at.isoformat() if self.source_created_at else None
+        if self.raw_total_tokens is None:
+            total = self.tokens_in + self.tokens_out
+        else:
+            total = self.raw_total_tokens
+        payload: dict[str, Any] = dict(
+            tokens_in=self.tokens_in,
+            tokens_out=self.tokens_out,
+            repo_id=repo_id,
+            session_id=self.session_id,
+            model=self.model,
+            agent=self.agent,
+            ref_type="session",
+            ref_id=self.source_id,
+            note=f"measured:{self.source_provider}",
+            created_at=stamp,
+            measurement_kind="measured",
+            source_provider=self.source_provider,
+            source_id=self.source_id,
+            source_path=str(self.source_path),
+            source_created_at=stamp,
+            parser_version=self.parser_version,
+            confidence=self.confidence,
+            cached_input_tokens=self.cached_input_tokens,
+            reasoning_output_tokens=self.reasoning_output_tokens,
+            raw_total_tokens=total,
+            raw_metadata=self.raw_metadata,
+        )
+        return payload
--- a/api/services/token_sources/claude.py
+++ b/api/services/token_sources/claude.py
@@ -0,0 +1,120 @@
+from __future__ import annotations
+
+import json
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+
+from api.services.token_sources.base import TokenSourceRecord, parse_iso
+
+# Version tag stored in the parser_version field of records built in this module.
+PARSER_VERSION = "claude-transcript-v1"
+
+
+def iter_claude_transcript_files(claude_home: Path) -> list[Path]:
+    """Return every ``*.jsonl`` file below ``<claude_home>/projects``, sorted.
+
+    The search is recursive. An empty list is returned when the ``projects``
+    directory does not exist.
+    """
+    root = claude_home / "projects"
+    return sorted(root.glob("**/*.jsonl")) if root.is_dir() else []
+
+
+def _usage_from_entry(entry: dict[str, Any]) -> dict[str, Any]:
+    """Return the usage dict of a transcript entry, or ``{}`` if there is none.
+
+    ``entry["message"]["usage"]`` takes precedence over ``entry["usage"]``;
+    either is accepted only when it is a dict.
+    """
+    nested = entry.get("message")
+    if isinstance(nested, dict):
+        nested_usage = nested.get("usage")
+        if isinstance(nested_usage, dict):
+            return nested_usage
+    top_level = entry.get("usage")
+    if isinstance(top_level, dict):
+        return top_level
+    return {}
+
+
+def _parse_transcript_timestamp(value: Any) -> datetime | None:
+    """Parse *value* with ``parse_iso``; return None for non-strings or invalid text."""
+    if not isinstance(value, str):
+        return None
+    try:
+        return parse_iso(value)
+    except ValueError:
+        return None
+
+
+def parse_claude_transcript(path: Path, since: datetime) -> TokenSourceRecord | None:
+    """Sum the token usage recorded in one Claude transcript file.
+
+    The file is read as JSON Lines. Session id, working directory and model
+    are taken from the last entry that provides them; the file stem is the
+    default session id. Usage entries with a timestamp earlier than *since*
+    are skipped. Usage entries without a readable timestamp are still
+    counted.
+
+    Args:
+        path: Transcript file to read.
+        since: Lower bound for the timestamp of counted usage entries.
+
+    Returns:
+        A record carrying the summed usage, or None when the file cannot be
+        opened or contains no usage entry with a non-zero token count.
+    """
+    session_id = path.stem
+    cwd: str | None = None
+    model: str | None = None
+    first_at: datetime | None = None
+    last_at: datetime | None = None
+    tokens_in = tokens_out = 0
+    cached_input_tokens = 0
+    raw_total_tokens = 0
+    usage_records = 0
+    malformed_lines = 0
+
+    try:
+        handle = path.open("r", encoding="utf-8", errors="ignore")
+    except OSError:
+        return None
+
+    with handle:
+        for line in handle:
+            try:
+                entry = json.loads(line)
+            except json.JSONDecodeError:
+                malformed_lines += 1
+                continue
+            if not isinstance(entry, dict):
+                # Valid JSON that is not an object (a bare list, number, ...)
+                # cannot be a transcript entry; count it instead of crashing.
+                malformed_lines += 1
+                continue
+
+            # An unreadable timestamp is treated like a missing one, so a
+            # single bad value does not abort the whole transcript.
+            parsed_ts = _parse_transcript_timestamp(
+                entry.get("timestamp") or entry.get("created_at")
+            )
+            if parsed_ts:
+                first_at = first_at or parsed_ts
+                last_at = parsed_ts
+
+            session_id = str(entry.get("session_id") or entry.get("conversation_id") or session_id)
+            cwd = entry.get("cwd") or entry.get("project_cwd") or cwd
+            model = entry.get("model") or model
+            message = entry.get("message")
+            if isinstance(message, dict):
+                model = message.get("model") or model
+
+            usage = _usage_from_entry(entry)
+            if not usage:
+                continue
+            if parsed_ts is not None and parsed_ts < since:
+                continue
+
+            # NOTE(review): usage is summed per line. If the transcript writer
+            # repeats one message's usage on several lines this over-counts -
+            # confirm against real transcripts.
+            input_tokens = int(usage.get("input_tokens") or 0)
+            cache_creation = int(usage.get("cache_creation_input_tokens") or 0)
+            cache_read = int(usage.get("cache_read_input_tokens") or 0)
+            output_tokens = int(usage.get("output_tokens") or 0)
+            if input_tokens == 0 and output_tokens == 0 and cache_creation == 0 and cache_read == 0:
+                continue
+            tokens_in += input_tokens
+            tokens_out += output_tokens
+            cached_input_tokens += cache_creation + cache_read
+            raw_total_tokens += input_tokens + cache_creation + cache_read + output_tokens
+            usage_records += 1
+
+    if usage_records == 0 or tokens_in + tokens_out + cached_input_tokens == 0:
+        return None
+
+    return TokenSourceRecord(
+        source_provider="claude_transcript",
+        source_id=f"claude:{session_id}",
+        source_path=path,
+        # Timestamp of the last entry seen in the file, counted or not.
+        source_created_at=last_at,
+        session_id=session_id,
+        cwd=cwd,
+        model=model,
+        agent="claude",
+        tokens_in=tokens_in,
+        tokens_out=tokens_out,
+        cached_input_tokens=cached_input_tokens,
+        raw_total_tokens=raw_total_tokens or None,
+        parser_version=PARSER_VERSION,
+        confidence=1.0,
+        raw_metadata={
+            "started_at": first_at.isoformat() if first_at else None,
+            "usage_records": usage_records,
+            "malformed_lines": malformed_lines,
+            "source_file_name": path.name,
+        },
+    )
+
+
+def collect_claude_transcripts(claude_home: Path, since: datetime) -> list[TokenSourceRecord]:
+    """Parse every transcript under *claude_home*, keeping one record per source id.
+
+    When several files yield the same ``source_id``, the record with the
+    larger ``tokens_total`` is kept. The result is ordered by
+    ``source_created_at``, with records lacking one placed first.
+    """
+    best: dict[str, TokenSourceRecord] = {}
+    for transcript in iter_claude_transcript_files(claude_home):
+        record = parse_claude_transcript(transcript, since)
+        if record is None:
+            continue
+        incumbent = best.get(record.source_id)
+        if incumbent is not None and record.tokens_total <= incumbent.tokens_total:
+            continue
+        best[record.source_id] = record
+
+    def _created_or_floor(record: TokenSourceRecord) -> datetime:
+        # Records without a timestamp sort before all others.
+        return record.source_created_at or datetime.min.replace(tzinfo=since.tzinfo)
+
+    return sorted(best.values(), key=_created_or_floor)
--- a/api/services/token_sources/codex.py
+++ b/api/services/token_sources/codex.py
@@ -0,0 +1,124 @@
+from __future__ import annotations
+
+import json
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+
+from api.services.token_sources.base import TokenSourceRecord, parse_iso
+
+# Version tag stored in the parser_version field of records built in this module.
+PARSER_VERSION = "codex-desktop-v1"
+
+
+def iter_codex_session_files(codex_home: Path) -> list[Path]:
+    """List Codex session files: live sessions first, then archived ones.
+
+    Live files are matched three directory levels below
+    ``<codex_home>/sessions``; archived files sit directly in
+    ``<codex_home>/archived_sessions``. Each group is sorted on its own, and
+    a missing directory contributes nothing.
+    """
+    live_dir = codex_home / "sessions"
+    archive_dir = codex_home / "archived_sessions"
+    live = sorted(live_dir.glob("*/*/*/*.jsonl")) if live_dir.is_dir() else []
+    archived = sorted(archive_dir.glob("*.jsonl")) if archive_dir.is_dir() else []
+    return [*live, *archived]
+
+
+def _parse_session_timestamp(value: Any) -> datetime | None:
+    """Parse *value* with ``parse_iso``; return None for non-strings or invalid text."""
+    if not isinstance(value, str):
+        return None
+    try:
+        return parse_iso(value)
+    except ValueError:
+        return None
+
+
+def parse_codex_session(path: Path, since: datetime) -> TokenSourceRecord | None:
+    """Sum the token usage recorded in one Codex session file.
+
+    The file is read as JSON Lines. ``session_meta`` entries supply the
+    session id, working directory and start time. ``turn_context`` entries
+    supply the working directory and model. Usage comes from
+    ``info.last_token_usage`` of ``event_msg`` entries whose payload type is
+    ``token_count``; such an entry counts only when it has a readable
+    timestamp that is not earlier than *since*.
+
+    Args:
+        path: Session file to read. Its stem, minus a ``rollout-`` prefix, is
+            the default session id.
+        since: Lower bound for the timestamp of counted usage entries.
+
+    Returns:
+        A record carrying the summed usage, or None when the file cannot be
+        opened or no usage entry with non-zero input or output tokens counts.
+    """
+    fallback_id = path.stem.removeprefix("rollout-")
+    session_id = fallback_id
+    started_at: datetime | None = None
+    last_at: datetime | None = None
+    cwd: str | None = None
+    model: str | None = None
+    tokens_in = tokens_out = 0
+    cached_input_tokens = reasoning_output_tokens = 0
+    raw_total_tokens = 0
+    usage_records = 0
+    malformed_lines = 0
+
+    try:
+        handle = path.open("r", encoding="utf-8", errors="ignore")
+    except OSError:
+        return None
+
+    with handle:
+        for line in handle:
+            try:
+                entry = json.loads(line)
+            except json.JSONDecodeError:
+                malformed_lines += 1
+                continue
+            if not isinstance(entry, dict):
+                # Valid JSON that is not an object cannot be a session entry;
+                # count it instead of crashing on ``.get``.
+                malformed_lines += 1
+                continue
+
+            # An unreadable timestamp is treated like a missing one.
+            parsed_ts = _parse_session_timestamp(entry.get("timestamp"))
+            if parsed_ts:
+                last_at = parsed_ts
+                started_at = started_at or parsed_ts
+
+            payload = entry.get("payload")
+            if not isinstance(payload, dict):
+                payload = {}
+            entry_type = entry.get("type")
+            if entry_type == "session_meta":
+                meta_id = payload.get("id")
+                if meta_id:
+                    session_id = str(meta_id)
+                cwd = payload.get("cwd") or cwd
+                # The start time from session metadata overrides the first
+                # entry timestamp, but only when it can be parsed.
+                meta_started = _parse_session_timestamp(payload.get("timestamp"))
+                if meta_started is not None:
+                    started_at = meta_started
+            elif entry_type == "turn_context":
+                cwd = payload.get("cwd") or cwd
+                model = payload.get("model") or model
+            elif entry_type == "event_msg" and payload.get("type") == "token_count":
+                if parsed_ts is None or parsed_ts < since:
+                    continue
+                info = payload.get("info")
+                last = info.get("last_token_usage") if isinstance(info, dict) else None
+                if not isinstance(last, dict):
+                    continue
+                input_tokens = int(last.get("input_tokens") or 0)
+                output_tokens = int(last.get("output_tokens") or 0)
+                if input_tokens == 0 and output_tokens == 0:
+                    continue
+                tokens_in += input_tokens
+                tokens_out += output_tokens
+                cached_input_tokens += int(last.get("cached_input_tokens") or 0)
+                reasoning_output_tokens += int(last.get("reasoning_output_tokens") or 0)
+                raw_total_tokens += int(last.get("total_tokens") or input_tokens + output_tokens)
+                usage_records += 1
+
+    if usage_records == 0 or tokens_in + tokens_out == 0:
+        return None
+
+    return TokenSourceRecord(
+        source_provider="codex_session",
+        source_id=f"codex:{session_id}",
+        source_path=path,
+        # Timestamp of the last entry seen in the file, counted or not.
+        source_created_at=last_at,
+        session_id=session_id,
+        cwd=cwd,
+        model=model,
+        agent="codex",
+        tokens_in=tokens_in,
+        tokens_out=tokens_out,
+        cached_input_tokens=cached_input_tokens,
+        reasoning_output_tokens=reasoning_output_tokens,
+        raw_total_tokens=raw_total_tokens or None,
+        parser_version=PARSER_VERSION,
+        confidence=1.0,
+        raw_metadata={
+            "started_at": started_at.isoformat() if started_at else None,
+            "usage_records": usage_records,
+            "malformed_lines": malformed_lines,
+            "source_file_name": path.name,
+        },
+    )
+
+
+def collect_codex_sessions(codex_home: Path, since: datetime) -> list[TokenSourceRecord]:
+    """Parse every Codex session under *codex_home*, keeping one record per source id.
+
+    When several files yield the same ``source_id``, the record with the
+    larger ``tokens_total`` is kept. The result is ordered by
+    ``source_created_at``, with records lacking one placed first.
+    """
+    winners: dict[str, TokenSourceRecord] = {}
+    for session_file in iter_codex_session_files(codex_home):
+        candidate = parse_codex_session(session_file, since)
+        if candidate is None:
+            continue
+        holder = winners.get(candidate.source_id)
+        if holder is None or candidate.tokens_total > holder.tokens_total:
+            winners[candidate.source_id] = candidate
+
+    def _created_or_floor(record: TokenSourceRecord) -> datetime:
+        # Records without a timestamp sort before all others.
+        return record.source_created_at or datetime.min.replace(tzinfo=since.tzinfo)
+
+    ordered = list(winners.values())
+    ordered.sort(key=_created_or_floor)
+    return ordered