feat(llm): add embedding adapter with cache and similarity utils (S1.3)
Add OpenAI-compatible embedding support (works with both OpenAI and OpenRouter), file-based embedding cache with content-digest invalidation, and pure-Python cosine similarity utilities for downstream redundancy detection. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
64
markitect/llm/similarity.py
Normal file
64
markitect/llm/similarity.py
Normal file
@@ -0,0 +1,64 @@
|
||||
"""
Pure-Python vector similarity utilities.

No external dependencies — uses :mod:`math` only. Sufficient for the
current entity scale (~100s). numpy can be substituted later if needed.
"""
|
||||
|
||||
import math
|
||||
|
||||
|
||||
def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Cosine similarity between two vectors.

    Args:
        a: First vector.
        b: Second vector; must have the same length as *a*.

    Returns:
        A float in [-1, 1]. Returns 0.0 if either vector has zero
        magnitude (to avoid division by zero); this includes the case
        of two empty vectors.

    Raises:
        ValueError: If *a* and *b* differ in length.
    """
    # zip() would silently drop the trailing components of the longer
    # vector while the magnitudes below still use all of them, yielding a
    # meaningless score. Fail loudly instead.
    if len(a) != len(b):
        raise ValueError(
            f"vector length mismatch: {len(a)} != {len(b)}"
        )

    dot = sum(x * y for x, y in zip(a, b))
    mag_a = math.sqrt(sum(x * x for x in a))
    mag_b = math.sqrt(sum(x * x for x in b))
    if mag_a == 0.0 or mag_b == 0.0:
        return 0.0

    sim = dot / (mag_a * mag_b)
    # Floating-point rounding can push the quotient a hair outside the
    # documented range (e.g. 1.0000000000000002). Clamp with explicit
    # comparisons so that a NaN result still propagates unchanged.
    if sim > 1.0:
        return 1.0
    if sim < -1.0:
        return -1.0
    return sim
|
||||
|
||||
|
||||
def similarity_matrix(embeddings: list[list[float]]) -> list[list[float]]:
    """Compute the symmetric NxN matrix of pairwise cosine similarities.

    Entry ``[i][j]`` (for ``i != j``) holds the cosine similarity of
    ``embeddings[i]`` and ``embeddings[j]``. Diagonal entries are set to
    ``1.0`` directly rather than being computed. Each unordered pair is
    scored once and mirrored across the diagonal.
    """
    size = len(embeddings)

    # Start from the identity matrix: ones on the diagonal, zeros elsewhere.
    grid: list[list[float]] = [
        [1.0 if row == col else 0.0 for col in range(size)]
        for row in range(size)
    ]

    # Fill the upper triangle and mirror each score into the lower one.
    for row, vec in enumerate(embeddings):
        for col in range(row + 1, size):
            score = cosine_similarity(vec, embeddings[col])
            grid[row][col] = score
            grid[col][row] = score

    return grid
|
||||
|
||||
|
||||
def find_similar_pairs(
    embeddings: dict[str, list[float]],
    threshold: float = 0.80,
) -> list[tuple[str, str, float]]:
    """Return every slug pair whose cosine similarity reaches *threshold*.

    Args:
        embeddings: Mapping of slug → embedding vector.
        threshold: Inclusive lower bound on similarity (default 0.80).

    Returns:
        ``(slug_a, slug_b, similarity)`` tuples ordered from most to
        least similar. Within each tuple ``slug_a`` sorts before
        ``slug_b``; each unordered pair appears at most once.
    """
    ordered = sorted(embeddings)

    # Lazily score each unordered pair exactly once, in sorted-slug order.
    scored = (
        (first, second, cosine_similarity(embeddings[first], embeddings[second]))
        for pos, first in enumerate(ordered)
        for second in ordered[pos + 1:]
    )
    matches = [hit for hit in scored if hit[2] >= threshold]

    # Stable sort: pairs with equal scores keep their sorted-slug order.
    return sorted(matches, key=lambda hit: hit[2], reverse=True)
|
||||
Reference in New Issue
Block a user