llm-connect/llm_connect/quality.py

"""Quality observations and append-only ledger support.

These primitives let callers record observed quality/cost outcomes for a
task type without baking consumer-specific routing policy into llm-connect.
"""

from __future__ import annotations

import json
import os
import threading
from contextlib import contextmanager
from dataclasses import dataclass, field
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any, Iterator, TextIO


_PATH_LOCKS: dict[Path, threading.Lock] = {}
_PATH_LOCKS_GUARD = threading.Lock()


def _utc_now() -> datetime:
    return datetime.now(timezone.utc)


def _normalise_datetime(value: datetime | str) -> datetime:
    if isinstance(value, datetime):
        dt = value
    elif isinstance(value, str):
        dt = datetime.fromisoformat(value.replace("Z", "+00:00"))
    else:
        raise TypeError(f"Expected datetime or ISO string, got {type(value).__name__}")

    if dt.tzinfo is None:
        return dt.replace(tzinfo=timezone.utc)
    return dt.astimezone(timezone.utc)


def _serialise_datetime(value: datetime) -> str:
    return _normalise_datetime(value).isoformat().replace("+00:00", "Z")


def _validate_non_negative_int(name: str, value: int) -> None:
    if not isinstance(value, int) or value < 0:
        raise ValueError(f"{name} must be a non-negative integer")


def _validate_non_negative_float(name: str, value: float) -> None:
    if not isinstance(value, (int, float)) or float(value) < 0:
        raise ValueError(f"{name} must be a non-negative number")


def _path_lock(path: Path) -> threading.Lock:
    resolved = path.resolve()
    with _PATH_LOCKS_GUARD:
        lock = _PATH_LOCKS.get(resolved)
        if lock is None:
            lock = threading.Lock()
            _PATH_LOCKS[resolved] = lock
        return lock


def _lock_file(handle: TextIO) -> None:
    if os.name == "nt":
        import msvcrt

        msvcrt.locking(handle.fileno(), msvcrt.LK_LOCK, 1)
    else:
        import fcntl

        fcntl.flock(handle.fileno(), fcntl.LOCK_EX)


def _unlock_file(handle: TextIO) -> None:
    if os.name == "nt":
        import msvcrt

        msvcrt.locking(handle.fileno(), msvcrt.LK_UNLCK, 1)
    else:
        import fcntl

        fcntl.flock(handle.fileno(), fcntl.LOCK_UN)


@contextmanager
def _locked_file(path: Path, mode: str) -> Iterator[TextIO]:
    path.parent.mkdir(parents=True, exist_ok=True)
    local_lock = _path_lock(path)
    with local_lock:
        with path.open(mode, encoding="utf-8") as handle:
            _lock_file(handle)
            try:
                yield handle
            finally:
                _unlock_file(handle)


@dataclass(frozen=True)
class QualityObservation:
    """Observed quality/cost outcome for one adapter on one task type."""

    task_type: str
    adapter_id: str
    model_id: str
    cost_usd: float
    quality_score: float
    latency_ms: float
    tokens_in: int
    tokens_out: int
    baseline_adapter_id: str | None = None
    recorded_at: datetime = field(default_factory=_utc_now)
    tags: dict[str, Any] = field(default_factory=dict)

    def __post_init__(self) -> None:
        for name in ("task_type", "adapter_id", "model_id"):
            if not str(getattr(self, name)).strip():
                raise ValueError(f"{name} must be a non-empty string")

        _validate_non_negative_float("cost_usd", self.cost_usd)
        _validate_non_negative_float("latency_ms", self.latency_ms)
        _validate_non_negative_int("tokens_in", self.tokens_in)
        _validate_non_negative_int("tokens_out", self.tokens_out)
        if not isinstance(self.quality_score, (int, float)):
            raise ValueError("quality_score must be a number between 0 and 1")
        if not 0 <= float(self.quality_score) <= 1:
            raise ValueError("quality_score must be between 0 and 1")

        object.__setattr__(self, "task_type", str(self.task_type))
        object.__setattr__(self, "adapter_id", str(self.adapter_id))
        object.__setattr__(self, "model_id", str(self.model_id))
        object.__setattr__(self, "cost_usd", float(self.cost_usd))
        object.__setattr__(self, "quality_score", float(self.quality_score))
        object.__setattr__(self, "latency_ms", float(self.latency_ms))
        object.__setattr__(self, "recorded_at", _normalise_datetime(self.recorded_at))
        object.__setattr__(self, "tags", dict(self.tags))

    @property
    def total_tokens(self) -> int:
        """Return input plus output tokens."""
        return self.tokens_in + self.tokens_out

    def to_dict(self) -> dict[str, Any]:
        """Convert to a JSON-serialisable dictionary."""
        return {
            "task_type": self.task_type,
            "adapter_id": self.adapter_id,
            "model_id": self.model_id,
            "cost_usd": self.cost_usd,
            "quality_score": self.quality_score,
            "latency_ms": self.latency_ms,
            "tokens_in": self.tokens_in,
            "tokens_out": self.tokens_out,
            "baseline_adapter_id": self.baseline_adapter_id,
            "recorded_at": _serialise_datetime(self.recorded_at),
            "tags": dict(self.tags),
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "QualityObservation":
        """Create an observation from a JSON-decoded dictionary."""
        return cls(
            task_type=data["task_type"],
            adapter_id=data["adapter_id"],
            model_id=data["model_id"],
            cost_usd=data["cost_usd"],
            quality_score=data["quality_score"],
            latency_ms=data["latency_ms"],
            tokens_in=data["tokens_in"],
            tokens_out=data["tokens_out"],
            baseline_adapter_id=data.get("baseline_adapter_id"),
            recorded_at=data.get("recorded_at", _utc_now()),
            tags=data.get("tags") or {},
        )


def is_stale(
    observation: QualityObservation,
    max_age: timedelta,
    *,
    now: datetime | None = None,
) -> bool:
    """Return whether *observation* is older than *max_age*."""
    if max_age.total_seconds() < 0:
        raise ValueError("max_age must be non-negative")
    reference = _normalise_datetime(now or _utc_now())
    return observation.recorded_at < reference - max_age


class QualityLedger:
    """Append-only JSONL store for :class:`QualityObservation` records."""

    def __init__(self, path: str | Path):
        self._path = Path(path)

    @property
    def path(self) -> Path:
        """Ledger file path."""
        return self._path

    def append(self, observation: QualityObservation) -> None:
        """Append one observation as a locked JSONL record."""
        line = json.dumps(observation.to_dict(), sort_keys=True, separators=(",", ":"))
        with _locked_file(self._path, "a") as handle:
            handle.write(line + "\n")
            handle.flush()
            os.fsync(handle.fileno())

    def read_all(self) -> list[QualityObservation]:
        """Return all parseable observations, skipping malformed lines."""
        observations, _ = self._read_with_malformed_count()
        return observations

    def malformed_count(self) -> int:
        """Return the number of malformed lines currently skipped by reads."""
        _, malformed = self._read_with_malformed_count()
        return malformed

    def by_task_type(self, task_type: str) -> list[QualityObservation]:
        """Return observations matching *task_type*."""
        return [obs for obs in self.read_all() if obs.task_type == task_type]

    def recent(
        self,
        limit: int | None = None,
        *,
        task_type: str | None = None,
        adapter_id: str | None = None,
        since: datetime | None = None,
    ) -> list[QualityObservation]:
        """Return newest observations first, optionally filtered."""
        if limit is not None and limit < 0:
            raise ValueError("limit must be non-negative")

        cutoff = _normalise_datetime(since) if since is not None else None
        observations = self.read_all()
        if task_type is not None:
            observations = [obs for obs in observations if obs.task_type == task_type]
        if adapter_id is not None:
            observations = [obs for obs in observations if obs.adapter_id == adapter_id]
        if cutoff is not None:
            observations = [obs for obs in observations if obs.recorded_at >= cutoff]

        observations.sort(key=lambda obs: obs.recorded_at, reverse=True)
        if limit is None:
            return observations
        return observations[:limit]

    def mean_quality(
        self,
        task_type: str,
        *,
        adapter_id: str | None = None,
        model_id: str | None = None,
        max_age: timedelta | None = None,
        min_observations: int = 1,
    ) -> float | None:
        """Return mean quality for matching observations, or ``None`` if absent."""
        if min_observations <= 0:
            raise ValueError("min_observations must be positive")

        observations = self.by_task_type(task_type)
        if adapter_id is not None:
            observations = [obs for obs in observations if obs.adapter_id == adapter_id]
        if model_id is not None:
            observations = [obs for obs in observations if obs.model_id == model_id]
        if max_age is not None:
            observations = [obs for obs in observations if not is_stale(obs, max_age)]

        if len(observations) < min_observations:
            return None
        return sum(obs.quality_score for obs in observations) / len(observations)

    def prune_before(self, timestamp: datetime) -> int:
        """Remove valid observations recorded before *timestamp*.

        Malformed lines are preserved because their timestamp cannot be trusted.
        Returns the number of valid observation records removed.
        """
        cutoff = _normalise_datetime(timestamp)
        removed = 0
        with _locked_file(self._path, "a+") as handle:
            handle.seek(0)
            lines = handle.readlines()
            kept: list[str] = []
            for line in lines:
                try:
                    obs = QualityObservation.from_dict(json.loads(line))
                except (json.JSONDecodeError, KeyError, TypeError, ValueError):
                    kept.append(line)
                    continue
                if obs.recorded_at < cutoff:
                    removed += 1
                else:
                    kept.append(line)

            handle.seek(0)
            handle.truncate()
            handle.writelines(kept)
            handle.flush()
            os.fsync(handle.fileno())
        return removed

    def _read_with_malformed_count(self) -> tuple[list[QualityObservation], int]:
        if not self._path.is_file():
            return [], 0

        observations: list[QualityObservation] = []
        malformed = 0
        with _locked_file(self._path, "r") as handle:
            for line in handle:
                if not line.strip():
                    continue
                try:
                    observations.append(QualityObservation.from_dict(json.loads(line)))
                except (json.JSONDecodeError, KeyError, TypeError, ValueError):
                    malformed += 1
        return observations, malformed