Add OpenAI-compatible embedding support (works with both OpenAI and OpenRouter), file-based embedding cache with content-digest invalidation, and pure-Python cosine similarity utilities for downstream redundancy detection. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
236 lines
8.5 KiB
Python
236 lines
8.5 KiB
Python
"""Tests for embedding adapter, cache, similarity, and factory."""
|
|
|
|
from pathlib import Path
|
|
from unittest import mock
|
|
|
|
import pytest
|
|
|
|
from markitect.llm.similarity import (
|
|
cosine_similarity,
|
|
similarity_matrix,
|
|
find_similar_pairs,
|
|
)
|
|
from markitect.llm.embedding_cache import EmbeddingCache
|
|
from markitect.llm.embedding_openai import OpenAICompatibleEmbeddingAdapter
|
|
from markitect.llm.embedding_factory import create_embedding_adapter
|
|
from markitect.llm.exceptions import LLMConfigurationError, LLMRateLimitError
|
|
|
|
|
|
# ── Similarity math ─────────────────────────────────────────────────
|
|
|
|
|
|
class TestCosineSimilarity:
    """Cosine similarity checked against hand-picked vectors."""

    def test_identical_vectors(self):
        """A vector compared with itself scores approximately 1."""
        vec = [1.0, 2.0, 3.0]
        score = cosine_similarity(vec, vec)
        assert score == pytest.approx(1.0)

    def test_orthogonal_vectors(self):
        """Perpendicular axis vectors score approximately 0."""
        x_axis = [1.0, 0.0, 0.0]
        y_axis = [0.0, 1.0, 0.0]
        score = cosine_similarity(x_axis, y_axis)
        assert score == pytest.approx(0.0)

    def test_opposite_vectors(self):
        """Vectors pointing in opposite directions score approximately -1."""
        forward = [1.0, 0.0]
        backward = [-1.0, 0.0]
        score = cosine_similarity(forward, backward)
        assert score == pytest.approx(-1.0)

    def test_zero_vector(self):
        """A zero vector on the left-hand side is expected to yield exactly 0.0."""
        score = cosine_similarity([0.0, 0.0], [1.0, 2.0])
        assert score == 0.0
|
|
|
|
|
|
class TestSimilarityMatrix:
    """Structural properties of the pairwise similarity matrix."""

    def test_diagonal_is_one(self):
        """Each vector's similarity with itself is approximately 1."""
        vecs = [[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]]
        mat = similarity_matrix(vecs)
        # Pin the row count first so a truncated matrix fails on a clear
        # assertion rather than an IndexError inside the loop.
        assert len(mat) == len(vecs)
        for i, _ in enumerate(vecs):
            assert mat[i][i] == pytest.approx(1.0)

    def test_symmetric(self):
        """Entry (i, j) matches entry (j, i) for every index pair."""
        vecs = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]
        mat = similarity_matrix(vecs)
        n = len(vecs)
        # Same guard as above: one row per input vector.
        assert len(mat) == n
        for i in range(n):
            for j in range(n):
                assert mat[i][j] == pytest.approx(mat[j][i])
|
|
|
|
|
|
class TestFindSimilarPairs:
    """Threshold filtering, ordering, and degenerate inputs for pair search."""

    def test_threshold_filters(self):
        """Only pairs at or above the threshold are reported."""
        emb = {
            "a": [1.0, 0.0],
            "b": [0.0, 1.0],
            "c": [1.0, 0.01],  # very similar to "a"
        }
        pairs = find_similar_pairs(emb, threshold=0.90)
        slugs_in_pairs = {(s1, s2) for s1, s2, _ in pairs}
        assert ("a", "c") in slugs_in_pairs
        # a-b are orthogonal, should not appear. Compare order-insensitively:
        # a tuple check alone would pass vacuously if the pair were reported
        # as ("b", "a").
        unordered_pairs = {frozenset(pair) for pair in slugs_in_pairs}
        assert frozenset(("a", "b")) not in unordered_pairs

    def test_sorted_descending(self):
        """Results come back ordered from most to least similar."""
        emb = {
            "x": [1.0, 0.0, 0.0],
            "y": [0.9, 0.1, 0.0],
            "z": [0.95, 0.05, 0.0],
        }
        pairs = find_similar_pairs(emb, threshold=0.0)
        sims = [s for _, _, s in pairs]
        # An empty or single-element result would make the ordering check
        # below trivially true, so require something to order.
        assert len(sims) >= 2
        assert sims == sorted(sims, reverse=True)

    def test_empty_embeddings(self):
        """No embeddings means no pairs."""
        assert find_similar_pairs({}) == []

    def test_single_embedding(self):
        """A single embedding has nothing to be paired with."""
        assert find_similar_pairs({"only": [1.0, 0.0]}) == []
|
|
|
|
|
|
# ── Embedding cache ─────────────────────────────────────────────────
|
|
|
|
|
|
class TestEmbeddingCache:
    """Lookup, digest invalidation, persistence, and counters of the cache."""

    def test_put_get_roundtrip(self, tmp_path: Path):
        """A stored vector is returned for the same slug and digest."""
        store = EmbeddingCache(tmp_path)
        store.put("division-of-labour", "abc123", [0.1, 0.2, 0.3])
        fetched = store.get("division-of-labour", "abc123")
        assert fetched == [0.1, 0.2, 0.3]

    def test_wrong_digest_returns_none(self, tmp_path: Path):
        """A digest mismatch is treated as a miss."""
        store = EmbeddingCache(tmp_path)
        store.put("slug", "digest-v1", [1.0])
        fetched = store.get("slug", "digest-v2")
        assert fetched is None

    def test_missing_slug_returns_none(self, tmp_path: Path):
        """Looking up a slug that was never stored returns None."""
        store = EmbeddingCache(tmp_path)
        fetched = store.get("nonexistent", "any")
        assert fetched is None

    def test_save_load_persists(self, tmp_path: Path):
        """Entries saved by one instance are visible to a new one on the same path."""
        writer = EmbeddingCache(tmp_path)
        writer.put("slug-a", "d1", [0.5, 0.6])
        writer.save()

        reader = EmbeddingCache(tmp_path)
        assert reader.get("slug-a", "d1") == [0.5, 0.6]

    def test_stats_tracks_hits_and_misses(self, tmp_path: Path):
        """stats() reports the entry count plus hit and miss counters."""
        store = EmbeddingCache(tmp_path)
        store.put("s", "d", [1.0])
        store.get("s", "d")  # hit
        store.get("s", "wrong")  # miss
        store.get("missing", "x")  # miss
        counters = store.stats()
        observed = (counters["entries"], counters["hits"], counters["misses"])
        assert observed == (1, 1, 2)
|
|
|
|
|
|
# ── Adapter (mocked HTTP) ──────────────────────────────────────────
|
|
|
|
|
|
def _make_embedding_response(vectors):
|
|
"""Build a mock API response for the /embeddings endpoint."""
|
|
return {
|
|
"data": [
|
|
{"embedding": vec, "index": i}
|
|
for i, vec in enumerate(vectors)
|
|
],
|
|
"usage": {"prompt_tokens": 5, "total_tokens": 5},
|
|
}
|
|
|
|
|
|
class TestOpenAICompatibleEmbeddingAdapter:
    """Adapter behaviour with the HTTP helper ``post_json`` mocked out."""

    def _adapter(self, **kwargs):
        """Build an adapter with test defaults; keyword arguments override them."""
        defaults = {"api_key": "sk-test", "provider": "openai"}
        defaults.update(kwargs)
        return OpenAICompatibleEmbeddingAdapter(**defaults)

    def _keyless_adapter(self):
        """Build an adapter that is guaranteed to hold no API key."""
        adapter = OpenAICompatibleEmbeddingAdapter(api_key=None, provider="openai")
        # Clear the key again after construction.
        # NOTE(review): presumably the constructor can pick a key up from
        # elsewhere (e.g. the environment) — confirm against the adapter.
        adapter._api_key = None
        return adapter

    @mock.patch("markitect.llm.embedding_openai.post_json")
    def test_embed_returns_vectors_in_order(self, mock_post):
        """Vectors are returned ordered by the response's ``index`` field."""
        # Return indices out of order to verify sorting
        mock_post.return_value = {
            "data": [
                {"embedding": [0.2, 0.3], "index": 1},
                {"embedding": [0.1, 0.2], "index": 0},
            ],
            "usage": {},
        }
        adapter = self._adapter()
        result = adapter.embed(["text1", "text2"])
        assert result == [[0.1, 0.2], [0.2, 0.3]]

    @mock.patch("markitect.llm.embedding_openai.post_json")
    def test_embed_payload_structure(self, mock_post):
        """The request targets the embeddings URL with model and input set."""
        mock_post.return_value = _make_embedding_response([[0.1]])
        adapter = self._adapter(model="text-embedding-3-large")
        adapter.embed(["hello"])

        call_args = mock_post.call_args
        url = call_args[0][0]
        payload = call_args[0][1]
        assert url == "https://api.openai.com/v1/embeddings"
        assert payload["model"] == "text-embedding-3-large"
        assert payload["input"] == ["hello"]

    @mock.patch("markitect.llm.embedding_openai.post_json")
    def test_embed_raises_without_api_key(self, mock_post):
        """Embedding without a key raises a configuration error."""
        adapter = self._keyless_adapter()
        with pytest.raises(LLMConfigurationError):
            adapter.embed(["test"])
        # The HTTP helper is patched so a regression can never reach the
        # network; the error must surface before any request is attempted.
        mock_post.assert_not_called()

    def test_validate_true_with_key(self):
        """validate() reports True when a key is configured."""
        adapter = self._adapter()
        assert adapter.validate() is True

    def test_validate_false_without_key(self):
        """validate() reports False when no key is configured."""
        adapter = self._keyless_adapter()
        assert adapter.validate() is False

    @mock.patch("markitect.llm.embedding_openai.post_json")
    @mock.patch("markitect.llm.embedding_openai.time.sleep")
    def test_retry_on_429(self, mock_sleep, mock_post):
        """A rate-limit error is retried after a single sleep."""
        mock_post.side_effect = [
            LLMRateLimitError("rate limited", status_code=429),
            _make_embedding_response([[0.1, 0.2]]),
        ]
        adapter = self._adapter(max_retries=2)
        result = adapter.embed(["test"])
        assert result == [[0.1, 0.2]]
        assert mock_sleep.call_count == 1
        # One failed attempt plus one successful retry.
        assert mock_post.call_count == 2

    def test_openai_provider_base_url(self):
        """The "openai" provider resolves to the OpenAI API base URL."""
        adapter = self._adapter(provider="openai")
        assert adapter._api_base == "https://api.openai.com/v1"

    def test_openrouter_provider_base_url(self):
        """The "openrouter" provider resolves to the OpenRouter API base URL."""
        adapter = self._adapter(provider="openrouter")
        assert adapter._api_base == "https://openrouter.ai/api/v1"

    def test_unknown_provider_raises(self):
        """Constructing with an unrecognised provider is a configuration error."""
        with pytest.raises(LLMConfigurationError):
            OpenAICompatibleEmbeddingAdapter(api_key="sk-test", provider="unknown")
|
|
|
|
|
|
# ── Factory ─────────────────────────────────────────────────────────
|
|
|
|
|
|
class TestCreateEmbeddingAdapter:
    """Factory dispatch by provider name and pass-through of options."""

    def test_openai_provider(self):
        """"openai" yields an OpenAI-compatible adapter tagged with that provider."""
        built = create_embedding_adapter("openai", api_key="sk-test")
        assert isinstance(built, OpenAICompatibleEmbeddingAdapter)
        assert built._provider == "openai"

    def test_openrouter_provider(self):
        """"openrouter" yields the same adapter class tagged with that provider."""
        built = create_embedding_adapter("openrouter", api_key="sk-test")
        assert isinstance(built, OpenAICompatibleEmbeddingAdapter)
        assert built._provider == "openrouter"

    def test_unknown_provider_raises(self):
        """An unrecognised provider raises an error whose text contains "unknown"."""
        with pytest.raises(LLMConfigurationError) as exc_info:
            create_embedding_adapter("unknown")
        # Checked after the ``with`` block so the assertion actually runs.
        message = str(exc_info.value)
        assert "unknown" in message

    def test_model_passed_through(self):
        """The requested model name is stored on the adapter."""
        built = create_embedding_adapter(
            "openai",
            model="text-embedding-3-large",
            api_key="sk-test",
        )
        assert built._model == "text-embedding-3-large"
|