Implement-LLM-WP-0005-cost-model-estimators

2026-05-19 05:02:20 +02:00
parent 0054afe689
commit c11c6afa3f
16 changed files with 1525 additions and 10 deletions
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -0,0 +1,54 @@
+import json
+from datetime import datetime, timezone
+
+from llm_connect.cli import main
+from llm_connect.quality import QualityLedger, QualityObservation
+
+
+def test_rates_show_json_outputs_default_registry(capsys):
+    """``rates show --json`` exits 0 and prints a JSON table with gpt-4o-mini's rate."""
+    exit_code = main(["rates", "show", "--json"])
+    assert exit_code == 0
+    rates = json.loads(capsys.readouterr().out)
+    assert rates["openai/gpt-4o-mini"]["prompt_per_1k"] == 0.00015
+
+
+def test_classes_show_lists_builtins(capsys):
+    """``classes show`` exits 0 and names built-in problem classes in its output."""
+    exit_code = main(["classes", "show"])
+    assert exit_code == 0
+    listing = capsys.readouterr().out
+    for class_name in ("chunk-summarization", "entity-extraction"):
+        assert class_name in listing
+
+
+def test_classes_fit_reads_quality_ledger(tmp_path, capsys):
+    """``classes fit`` reads a quality ledger file and reports fitted params as JSON."""
+    ledger = QualityLedger(tmp_path / "quality.jsonl")
+    recorded_at = datetime(2026, 5, 19, tzinfo=timezone.utc)
+
+    for _ in range(3):
+        # Fresh dicts per record so the three ledger rows share no mutable state.
+        dimensions = {"chunk_words": 300, "template_words": 100, "expected_entities": 5}
+        tags = {"problem_class": "entity-extraction", "dimensions": dimensions}
+        observation = QualityObservation(
+            task_type="extract",
+            adapter_id="openrouter",
+            model_id="openai/gpt-4o-mini",
+            cost_usd=0.001,
+            quality_score=0.9,
+            latency_ms=100,
+            tokens_in=500,
+            tokens_out=350,
+            recorded_at=recorded_at,
+            tags=tags,
+        )
+        ledger.append(observation)
+
+    # The ledger path is handed to the CLI as a plain string argument.
+    argv = ["classes", "fit", str(ledger.path), "--class", "entity-extraction", "--json"]
+    exit_code = main(argv)
+    assert exit_code == 0
+
+    fitted = json.loads(capsys.readouterr().out)
+    assert fitted["entity-extraction"]["params"]["tokens_per_entity"] == 70
--- a/tests/test_costs.py
+++ b/tests/test_costs.py
@@ -0,0 +1,49 @@
+import pytest
+
+from llm_connect.costs import CostEstimate, CostModel, estimate_cost
+from llm_connect.rates import ModelRate, ModelRateRegistry
+
+
+def test_known_model_cost_matches_lefevre_smoke_budget():
+    """A rate-table model prices 28k prompt + 7.5k completion tokens at ~0.0087 USD."""
+    priced = estimate_cost("openai/gpt-4o-mini", 28_000, 7_500)
+    assert priced.cost_source == "rate_table:openai/gpt-4o-mini"
+    assert priced.cost_usd == pytest.approx(0.0087)  # tight check on the computed figure
+    assert priced.cost_usd == pytest.approx(0.009, rel=0.2)  # looser check, 20% tolerance
+
+
+def test_unknown_model_returns_unknown_without_zeroing_cost():
+    """An unlisted model yields ``cost_usd=None`` with source "unknown", not a zero cost."""
+    outcome = estimate_cost("unknown/model", 100, 50)
+    assert outcome == CostEstimate(cost_source="unknown", cost_usd=None)
+
+
+def test_registry_override_controls_estimate():
+    """An explicitly passed registry supplies the rates used for the estimate."""
+    vendor_rate = ModelRate(
+        "vendor/model",
+        prompt_per_1k=1.0,
+        completion_per_1k=2.0,
+    )
+    custom_rates = ModelRateRegistry({"vendor/model": vendor_rate})
+
+    priced = estimate_cost("vendor/model", 1_000, 500, registry=custom_rates)
+
+    # 1_000 prompt tokens at 1.0 per 1k and 500 completion tokens at 2.0 per 1k
+    # each contribute 1.0, for a total of 2.0.
+    assert priced.cost_usd == pytest.approx(2.0)
+    assert priced.prompt_cost_usd == pytest.approx(1.0)
+    assert priced.completion_cost_usd == pytest.approx(1.0)
+
+
+def test_zero_tokens_are_valid_and_cost_zero_for_known_model():
+    """Zero prompt and completion tokens on a known model give zero for every cost field."""
+    priced = CostModel().estimate_cost("openai/gpt-4o-mini", 0, 0)
+
+    for field in ("cost_usd", "prompt_cost_usd", "completion_cost_usd"):
+        assert getattr(priced, field) == 0
+
+
+def test_negative_tokens_are_rejected():
+    with pytest.raises(ValueError, match="prompt_tokens"):  # error text must mention prompt_tokens
+        estimate_cost("openai/gpt-4o-mini", -1, 0)
--- a/tests/test_package_exports.py
+++ b/tests/test_package_exports.py
@@ -24,3 +24,27 @@ def test_wp_0004_primitives_are_exported_from_package_root():
    for name in expected_names:
        assert hasattr(llm_connect, name)
        assert name in llm_connect.__all__
+
+
+def test_wp_0005_primitives_are_exported_from_package_root():
+    """Every WP-0005 name is an attribute of ``llm_connect`` and listed in ``__all__``."""
+    wp_0005_names = (
+        "ModelRate",
+        "ModelRateRegistry",
+        "CostEstimate",
+        "CostModel",
+        "estimate_cost",
+        "TokenEstimate",
+        "Observation",
+        "ProblemClass",
+        "ProblemClassRegistry",
+        "default_problem_class_registry",
+        "ChunkSummarizationProblemClass",
+        "EntityExtractionProblemClass",
+        "RelationExtractionProblemClass",
+        "JudgeEvalProblemClass",
+        "ReportSynthesisProblemClass",
+    )
+    for exported in wp_0005_names:
+        assert hasattr(llm_connect, exported)
+        assert exported in llm_connect.__all__
--- a/tests/test_problem_classes.py
+++ b/tests/test_problem_classes.py
@@ -0,0 +1,137 @@
+from datetime import datetime, timezone
+
+import pytest
+
+from llm_connect.problem_classes import (
+    EntityExtractionProblemClass,
+    Observation,
+    ProblemClassRegistry,
+    TokenEstimate,
+)
+from llm_connect.quality import QualityObservation
+
+
+DIMENSIONS_BY_CLASS = {  # problem-class name -> three sample dimension dicts for the tests below
+    "chunk-summarization": [
+        {"chunk_words": 900, "template_words": 150},
+        {"chunk_words": 400, "template_words": 125},
+        {"chunk_words": 1200, "template_words": 200},
+    ],
+    "entity-extraction": [
+        {"chunk_words": 900, "template_words": 200, "expected_entities": 4},
+        {"chunk_words": 450, "template_words": 180, "expected_entities": 6},
+        {"chunk_words": 1200, "template_words": 220, "expected_entities": 8},
+    ],
+    "relation-extraction": [
+        {"chunk_words": 900, "template_words": 200, "expected_relations": 3},
+        {"chunk_words": 450, "template_words": 180, "expected_relations": 5},
+        {"chunk_words": 1200, "template_words": 220, "expected_relations": 7},
+    ],
+    "judge-eval": [
+        {"artifact_words": 700, "template_words": 180, "n_criteria": 4},
+        {"artifact_words": 300, "template_words": 160, "n_criteria": 5},
+        {"artifact_words": 1100, "template_words": 200, "n_criteria": 6},
+    ],
+    "report-synthesis": [
+        {"n_chunks": 5, "n_entities": 20, "n_relations": 8, "template_words": 250},
+        {"n_chunks": 8, "n_entities": 30, "n_relations": 12, "template_words": 250},
+        {"n_chunks": 2, "n_entities": 10, "n_relations": 3, "template_words": 180},
+    ],
+}
+
+
+def test_default_registry_exposes_builtin_classes():
+    """The default registry holds exactly the classes keyed in DIMENSIONS_BY_CLASS."""
+    defaults = ProblemClassRegistry.default()
+    assert set(defaults.all()) == set(DIMENSIONS_BY_CLASS)
+    assert defaults.schema_version == 1
+
+
+@pytest.mark.parametrize("name,dimensions_list", DIMENSIONS_BY_CLASS.items())
+def test_builtin_estimators_produce_token_estimates(name, dimensions_list):
+    """Each built-in class turns its first sample into a well-formed TokenEstimate."""
+    estimator = ProblemClassRegistry.default().get(name)
+    result = estimator.estimate(dimensions_list[0])
+
+    assert isinstance(result, TokenEstimate)
+    assert result.prompt_tokens >= 0
+    assert result.completion_tokens >= 0
+    assert 0 <= result.confidence <= 1
+
+
+@pytest.mark.parametrize("name,dimensions_list", DIMENSIONS_BY_CLASS.items())
+def test_fit_recovers_seeded_params_from_synthetic_observations(name, dimensions_list):
+    """fit() recovers the seeded parameter from observations the seed itself generated."""
+    reference = ProblemClassRegistry.default().get(name)
+    tuned = reference.tunable_params[0]
+    # Start from a doubled value so a pass shows fit() actually moved the parameter.
+    perturbed = type(reference)(params={tuned: reference.params[tuned] * 2})
+
+    def observe(dims):
+        predicted = reference.estimate(dims)
+        return Observation(
+            dimensions=dims,
+            prompt_tokens=predicted.prompt_tokens,
+            completion_tokens=predicted.completion_tokens,
+        )
+
+    refit = perturbed.fit([observe(dims) for dims in dimensions_list], min_observations=3)
+
+    assert refit.params[tuned] == pytest.approx(reference.params[tuned], rel=0.1)
+
+
+def test_fit_uses_quality_ledger_observation_shape():
+    """fit() accepts QualityObservation records carrying class and dimensions in tags."""
+
+    def ledger_record():
+        # Built fresh on every call so the three records share no mutable state.
+        dimensions = {"chunk_words": 300, "template_words": 100, "expected_entities": 5}
+        return QualityObservation(
+            task_type="extract",
+            adapter_id="openrouter",
+            model_id="openai/gpt-4o-mini",
+            cost_usd=0.001,
+            quality_score=0.9,
+            latency_ms=100,
+            tokens_in=500,
+            tokens_out=350,
+            recorded_at=datetime(2026, 5, 19, tzinfo=timezone.utc),
+            tags={
+                "problem_class": "entity-extraction",
+                "dimensions": dimensions,
+            },
+        )
+
+    starting_point = EntityExtractionProblemClass(params={"tokens_per_entity": 10})
+    records = [ledger_record() for _ in range(3)]
+
+    refit = starting_point.fit(records)
+
+    assert refit.params["tokens_per_entity"] == pytest.approx(70)
+
+
+def test_fit_keeps_seed_when_sample_is_too_small():
+    """With one observation and min_observations=3, fit() returns the same instance."""
+    dimensions = {"chunk_words": 300, "template_words": 100, "expected_entities": 5}
+    seeded = EntityExtractionProblemClass()
+    # Independent copies for estimate() and the observation, like two separate literals.
+    predicted = seeded.estimate(dict(dimensions))
+    lone_observation = Observation(
+        dimensions=dict(dimensions),
+        prompt_tokens=predicted.prompt_tokens,
+        completion_tokens=predicted.completion_tokens,
+    )
+
+    result = seeded.fit(
+        [lone_observation],
+        min_observations=3,
+    )
+
+    assert result is seeded
+
+
+def test_missing_dimensions_are_rejected():
+    """estimate() raises a "Missing dimensions" ValueError when given only chunk_words."""
+    summarizer = ProblemClassRegistry.default().get("chunk-summarization")
+    with pytest.raises(ValueError, match="Missing dimensions"):
+        summarizer.estimate({"chunk_words": 100})
--- a/tests/test_rates.py
+++ b/tests/test_rates.py
@@ -0,0 +1,65 @@
+import pytest
+
+from llm_connect.rates import ModelRate, ModelRateRegistry
+
+
+def test_default_registry_contains_openrouter_seed_models():
+    """The default registry has at least nine rates, with capture date and source URL."""
+    seed_rates = ModelRateRegistry.default().all()
+    assert len(seed_rates) >= 9
+    mini = seed_rates["openai/gpt-4o-mini"]
+    assert mini.captured_at == "2026-05-17"
+    assert mini.source_url == "https://openrouter.ai/models"
+
+
+def test_from_yaml_loads_package_shape(tmp_path):
+    """from_yaml() applies file-level currency, source URL and date to the loaded rate."""
+    rates_file = tmp_path / "model-rates.yaml"
+    # currency, source_url and captured_at sit at file level beside the per-model rates.
+    document = """
+schema_version: 1
+currency: USD
+source_url: https://example.test/rates
+captured_at: "2026-05-19"
+rates:
+  vendor/model:
+    prompt_per_1k: 0.1
+    completion_per_1k: 0.2
+"""
+    rates_file.write_text(document, encoding="utf-8")
+
+    loaded = ModelRateRegistry.from_yaml(rates_file).get("vendor/model")
+    expected = ModelRate(
+        model_id="vendor/model",
+        prompt_per_1k=0.1,
+        completion_per_1k=0.2,
+        currency="USD",
+        source_url="https://example.test/rates",
+        captured_at="2026-05-19",
+    )
+
+    assert loaded == expected
+
+
+def test_merged_with_overrides_matching_model():
+    """merged_with() lets the other registry's entry replace a same-id default rate."""
+    model_id = "openai/gpt-4o-mini"
+    replacement = ModelRate(
+        model_id,
+        prompt_per_1k=1,
+        completion_per_1k=2,
+        captured_at="override",
+    )
+    defaults = ModelRateRegistry.default()
+    overrides = ModelRateRegistry({model_id: replacement})
+
+    combined = defaults.merged_with(overrides)
+
+    # Two separate lookups, so each read goes through the merged registry's get().
+    assert combined.get(model_id).prompt_per_1k == 1
+    assert combined.get(model_id).captured_at == "override"
+
+
+def test_negative_rates_are_rejected():
+    with pytest.raises(ValueError, match="prompt_per_1k"):  # error text must mention the negative field
+        ModelRate("bad/model", prompt_per_1k=-1, completion_per_1k=0)