llm-connect/tests/test_problem_classes.py

from datetime import datetime, timezone

import pytest

from llm_connect.problem_classes import (
    EntityExtractionProblemClass,
    Observation,
    ProblemClassRegistry,
    TokenEstimate,
)
from llm_connect.quality import QualityObservation


DIMENSIONS_BY_CLASS = {
    "chunk-summarization": [
        {"chunk_words": 900, "template_words": 150},
        {"chunk_words": 400, "template_words": 125},
        {"chunk_words": 1200, "template_words": 200},
    ],
    "entity-extraction": [
        {"chunk_words": 900, "template_words": 200, "expected_entities": 4},
        {"chunk_words": 450, "template_words": 180, "expected_entities": 6},
        {"chunk_words": 1200, "template_words": 220, "expected_entities": 8},
    ],
    "relation-extraction": [
        {"chunk_words": 900, "template_words": 200, "expected_relations": 3},
        {"chunk_words": 450, "template_words": 180, "expected_relations": 5},
        {"chunk_words": 1200, "template_words": 220, "expected_relations": 7},
    ],
    "judge-eval": [
        {"artifact_words": 700, "template_words": 180, "n_criteria": 4},
        {"artifact_words": 300, "template_words": 160, "n_criteria": 5},
        {"artifact_words": 1100, "template_words": 200, "n_criteria": 6},
    ],
    "report-synthesis": [
        {"n_chunks": 5, "n_entities": 20, "n_relations": 8, "template_words": 250},
        {"n_chunks": 8, "n_entities": 30, "n_relations": 12, "template_words": 250},
        {"n_chunks": 2, "n_entities": 10, "n_relations": 3, "template_words": 180},
    ],
}


def test_default_registry_exposes_builtin_classes():
    registry = ProblemClassRegistry.default()

    assert set(registry.all()) == set(DIMENSIONS_BY_CLASS)
    assert registry.schema_version == 1


@pytest.mark.parametrize("name,dimensions_list", DIMENSIONS_BY_CLASS.items())
def test_builtin_estimators_produce_token_estimates(name, dimensions_list):
    problem_class = ProblemClassRegistry.default().get(name)

    estimate = problem_class.estimate(dimensions_list[0])

    assert isinstance(estimate, TokenEstimate)
    assert estimate.prompt_tokens >= 0
    assert estimate.completion_tokens >= 0
    assert 0 <= estimate.confidence <= 1


@pytest.mark.parametrize("name,dimensions_list", DIMENSIONS_BY_CLASS.items())
def test_fit_recovers_seeded_params_from_synthetic_observations(name, dimensions_list):
    seeded = ProblemClassRegistry.default().get(name)
    param_name = seeded.tunable_params[0]
    off_seed = type(seeded)(params={param_name: seeded.params[param_name] * 2})
    observations = []
    for dimensions in dimensions_list:
        estimate = seeded.estimate(dimensions)
        observations.append(
            Observation(
                dimensions=dimensions,
                prompt_tokens=estimate.prompt_tokens,
                completion_tokens=estimate.completion_tokens,
            )
        )

    fitted = off_seed.fit(observations, min_observations=3)

    assert fitted.params[param_name] == pytest.approx(seeded.params[param_name], rel=0.1)


def test_fit_uses_quality_ledger_observation_shape():
    problem_class = EntityExtractionProblemClass(params={"tokens_per_entity": 10})
    observations = [
        QualityObservation(
            task_type="extract",
            adapter_id="openrouter",
            model_id="openai/gpt-4o-mini",
            cost_usd=0.001,
            quality_score=0.9,
            latency_ms=100,
            tokens_in=500,
            tokens_out=350,
            recorded_at=datetime(2026, 5, 19, tzinfo=timezone.utc),
            tags={
                "problem_class": "entity-extraction",
                "dimensions": {
                    "chunk_words": 300,
                    "template_words": 100,
                    "expected_entities": 5,
                },
            },
        )
        for _ in range(3)
    ]

    fitted = problem_class.fit(observations)

    assert fitted.params["tokens_per_entity"] == pytest.approx(70)


def test_fit_keeps_seed_when_sample_is_too_small():
    problem_class = EntityExtractionProblemClass()
    estimate = problem_class.estimate(
        {"chunk_words": 300, "template_words": 100, "expected_entities": 5}
    )

    fitted = problem_class.fit(
        [
            Observation(
                dimensions={"chunk_words": 300, "template_words": 100, "expected_entities": 5},
                prompt_tokens=estimate.prompt_tokens,
                completion_tokens=estimate.completion_tokens,
            )
        ],
        min_observations=3,
    )

    assert fitted is problem_class


def test_missing_dimensions_are_rejected():
    problem_class = ProblemClassRegistry.default().get("chunk-summarization")

    with pytest.raises(ValueError, match="Missing dimensions"):
        problem_class.estimate({"chunk_words": 100})