"""Tests for embedding adapter, cache, similarity, and factory."""

from pathlib import Path
from unittest import mock

import pytest

from markitect.llm.similarity import (
    cosine_similarity,
    similarity_matrix,
    find_similar_pairs,
)
from markitect.llm.embedding_cache import EmbeddingCache
from markitect.llm.embedding_openai import OpenAICompatibleEmbeddingAdapter
from markitect.llm.embedding_factory import create_embedding_adapter
from markitect.llm.exceptions import LLMConfigurationError, LLMRateLimitError


# ── Similarity math ─────────────────────────────────────────────────


class TestCosineSimilarity:
    """Expected values of ``cosine_similarity`` for canonical vector pairs."""

    def test_identical_vectors(self):
        """A vector compared with itself scores 1."""
        v = [1.0, 2.0, 3.0]
        assert cosine_similarity(v, v) == pytest.approx(1.0)

    def test_orthogonal_vectors(self):
        """Perpendicular vectors score 0."""
        a = [1.0, 0.0, 0.0]
        b = [0.0, 1.0, 0.0]
        assert cosine_similarity(a, b) == pytest.approx(0.0)

    def test_opposite_vectors(self):
        """Vectors pointing in opposite directions score -1."""
        a = [1.0, 0.0]
        b = [-1.0, 0.0]
        assert cosine_similarity(a, b) == pytest.approx(-1.0)

    def test_zero_vector(self):
        """An all-zero vector is expected to yield exactly 0.0."""
        assert cosine_similarity([0.0, 0.0], [1.0, 2.0]) == 0.0


class TestSimilarityMatrix:
    """Structural properties of the matrix built by ``similarity_matrix``."""

    def test_diagonal_is_one(self):
        """Every vector has similarity 1 with itself."""
        vecs = [[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]]
        mat = similarity_matrix(vecs)
        # Check the row count explicitly so a short matrix fails with a clear
        # assertion instead of an IndexError (or passing vacuously).
        assert len(mat) == len(vecs)
        for i, row in enumerate(mat):
            assert row[i] == pytest.approx(1.0)

    def test_symmetric(self):
        """Entry (i, j) equals entry (j, i) for every index pair."""
        vecs = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]
        mat = similarity_matrix(vecs)
        n = len(vecs)
        for i in range(n):
            for j in range(n):
                assert mat[i][j] == pytest.approx(mat[j][i])


class TestFindSimilarPairs:
    """Filtering, ordering, and degenerate inputs of ``find_similar_pairs``."""

    def test_threshold_filters(self):
        """Only pairs at or above the threshold are reported."""
        emb = {
            "a": [1.0, 0.0],
            "b": [0.0, 1.0],
            "c": [1.0, 0.01],  # very similar to "a"
        }
        pairs = find_similar_pairs(emb, threshold=0.90)
        slugs_in_pairs = {(s1, s2) for s1, s2, _ in pairs}
        assert ("a", "c") in slugs_in_pairs
        # a-b are orthogonal, should not appear in either orientation
        assert ("a", "b") not in slugs_in_pairs
        assert ("b", "a") not in slugs_in_pairs

    def test_sorted_descending(self):
        """Pairs come back ordered from most to least similar."""
        emb = {
            "x": [1.0, 0.0, 0.0],
            "y": [0.9, 0.1, 0.0],
            "z": [0.95, 0.05, 0.0],
        }
        pairs = find_similar_pairs(emb, threshold=0.0)
        sims = [s for _, _, s in pairs]
        assert sims == sorted(sims, reverse=True)

    def test_empty_embeddings(self):
        """No embeddings means no pairs."""
        assert find_similar_pairs({}) == []

    def test_single_embedding(self):
        """A single embedding has nothing to be paired with."""
        assert find_similar_pairs({"only": [1.0, 0.0]}) == []


# ── Embedding cache ─────────────────────────────────────────────────


class TestEmbeddingCache:
    """Lookup, persistence, and statistics of ``EmbeddingCache``."""

    def test_put_get_roundtrip(self, tmp_path: Path):
        """A stored vector is returned for the same slug and digest."""
        cache = EmbeddingCache(tmp_path)
        cache.put("division-of-labour", "abc123", [0.1, 0.2, 0.3])
        assert cache.get("division-of-labour", "abc123") == [0.1, 0.2, 0.3]

    def test_wrong_digest_returns_none(self, tmp_path: Path):
        """A known slug with a different digest is a miss."""
        cache = EmbeddingCache(tmp_path)
        cache.put("slug", "digest-v1", [1.0])
        assert cache.get("slug", "digest-v2") is None

    def test_missing_slug_returns_none(self, tmp_path: Path):
        """An unknown slug is a miss."""
        cache = EmbeddingCache(tmp_path)
        assert cache.get("nonexistent", "any") is None

    def test_save_load_persists(self, tmp_path: Path):
        """Entries saved by one instance are visible to a new one on the same path."""
        cache = EmbeddingCache(tmp_path)
        cache.put("slug-a", "d1", [0.5, 0.6])
        cache.save()

        cache2 = EmbeddingCache(tmp_path)
        assert cache2.get("slug-a", "d1") == [0.5, 0.6]

    def test_stats_tracks_hits_and_misses(self, tmp_path: Path):
        """``stats()`` reports entry, hit, and miss counts."""
        cache = EmbeddingCache(tmp_path)
        cache.put("s", "d", [1.0])
        cache.get("s", "d")  # hit
        cache.get("s", "wrong")  # miss
        cache.get("missing", "x")  # miss
        s = cache.stats()
        assert s["entries"] == 1
        assert s["hits"] == 1
        assert s["misses"] == 2


# ── Adapter (mocked HTTP) ──────────────────────────────────────────


def _make_embedding_response(vectors):
    """Build a mock API response for the /embeddings endpoint."""
    return {
        "data": [
            {"embedding": vec, "index": i}
            for i, vec in enumerate(vectors)
        ],
        "usage": {"prompt_tokens": 5, "total_tokens": 5},
    }


class TestOpenAICompatibleEmbeddingAdapter:
    """Adapter behaviour with ``post_json`` patched out (no real HTTP)."""

    def _adapter(self, **kwargs):
        """Return an adapter with a test key and the openai provider, overridable via kwargs."""
        defaults = {"api_key": "sk-test", "provider": "openai"}
        defaults.update(kwargs)
        return OpenAICompatibleEmbeddingAdapter(**defaults)

    def _keyless_adapter(self):
        """Return an openai adapter whose API key is forced to ``None``."""
        adapter = OpenAICompatibleEmbeddingAdapter(api_key=None, provider="openai")
        # NOTE(review): the explicit reset presumably guards against the
        # constructor picking a key up from elsewhere (e.g. the environment)
        # — confirm against the adapter implementation.
        adapter._api_key = None
        return adapter

    @mock.patch("markitect.llm.embedding_openai.post_json")
    def test_embed_returns_vectors_in_order(self, mock_post):
        """Vectors are returned ordered by the response's ``index`` field."""
        # Return indices out of order to verify sorting
        mock_post.return_value = {
            "data": [
                {"embedding": [0.2, 0.3], "index": 1},
                {"embedding": [0.1, 0.2], "index": 0},
            ],
            "usage": {},
        }
        adapter = self._adapter()
        result = adapter.embed(["text1", "text2"])
        assert result == [[0.1, 0.2], [0.2, 0.3]]

    @mock.patch("markitect.llm.embedding_openai.post_json")
    def test_embed_payload_structure(self, mock_post):
        """The request goes to the embeddings URL with the model and input."""
        mock_post.return_value = _make_embedding_response([[0.1]])
        adapter = self._adapter(model="text-embedding-3-large")
        adapter.embed(["hello"])

        # call_args[0] is the tuple of positional arguments of the last call.
        positional = mock_post.call_args[0]
        url = positional[0]
        payload = positional[1]
        assert url == "https://api.openai.com/v1/embeddings"
        assert payload["model"] == "text-embedding-3-large"
        assert payload["input"] == ["hello"]

    def test_embed_raises_without_api_key(self):
        """Embedding without a key raises ``LLMConfigurationError``."""
        adapter = self._keyless_adapter()
        with pytest.raises(LLMConfigurationError):
            adapter.embed(["test"])

    def test_validate_true_with_key(self):
        """``validate()`` is True when a key is configured."""
        adapter = self._adapter()
        assert adapter.validate() is True

    def test_validate_false_without_key(self):
        """``validate()`` is False when no key is configured."""
        adapter = self._keyless_adapter()
        assert adapter.validate() is False

    @mock.patch("markitect.llm.embedding_openai.post_json")
    @mock.patch("markitect.llm.embedding_openai.time.sleep")
    def test_retry_on_429(self, mock_sleep, mock_post):
        """A rate-limit error is retried once after a single sleep."""
        # First call raises, second call succeeds.
        mock_post.side_effect = [
            LLMRateLimitError("rate limited", status_code=429),
            _make_embedding_response([[0.1, 0.2]]),
        ]
        adapter = self._adapter(max_retries=2)
        result = adapter.embed(["test"])
        assert result == [[0.1, 0.2]]
        assert mock_sleep.call_count == 1

    def test_openai_provider_base_url(self):
        """The openai provider maps to the OpenAI API base URL."""
        adapter = self._adapter(provider="openai")
        assert adapter._api_base == "https://api.openai.com/v1"

    def test_openrouter_provider_base_url(self):
        """The openrouter provider maps to the OpenRouter API base URL."""
        adapter = self._adapter(provider="openrouter")
        assert adapter._api_base == "https://openrouter.ai/api/v1"

    def test_unknown_provider_raises(self):
        """Constructing with an unrecognised provider raises ``LLMConfigurationError``."""
        with pytest.raises(LLMConfigurationError):
            OpenAICompatibleEmbeddingAdapter(api_key="sk-test", provider="unknown")


# ── Factory ─────────────────────────────────────────────────────────


class TestCreateEmbeddingAdapter:
    """Provider dispatch and argument pass-through of ``create_embedding_adapter``."""

    def test_openai_provider(self):
        """The openai provider yields an OpenAI-compatible adapter."""
        adapter = create_embedding_adapter("openai", api_key="sk-test")
        assert isinstance(adapter, OpenAICompatibleEmbeddingAdapter)
        assert adapter._provider == "openai"

    def test_openrouter_provider(self):
        """The openrouter provider yields an OpenAI-compatible adapter."""
        adapter = create_embedding_adapter("openrouter", api_key="sk-test")
        assert isinstance(adapter, OpenAICompatibleEmbeddingAdapter)
        assert adapter._provider == "openrouter"

    def test_unknown_provider_raises(self):
        """An unrecognised provider raises and the message names it."""
        with pytest.raises(LLMConfigurationError) as exc_info:
            create_embedding_adapter("unknown")
        # Must sit outside the ``with`` block: statements after the raising
        # call inside it would never execute.
        assert "unknown" in str(exc_info.value)

    def test_model_passed_through(self):
        """The requested model name reaches the adapter."""
        adapter = create_embedding_adapter(
            "openai", model="text-embedding-3-large", api_key="sk-test"
        )
        assert adapter._model == "text-embedding-3-large"