Add OpenAI-compatible embedding support (works with both OpenAI and OpenRouter), file-based embedding cache with content-digest invalidation, and pure-Python cosine similarity utilities for downstream redundancy detection. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
65 lines
2.0 KiB
Python
65 lines
2.0 KiB
Python
"""
|
|
Pure-Python vector similarity utilities.
|
|
|
|
No external dependencies — uses :mod:`math` only. Sufficient for the
|
|
current entity scale (~100s). numpy can be substituted later if needed.
|
|
"""
|
|
|
|
import math
|
|
|
|
|
|
def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Cosine similarity between two vectors.

    Args:
        a: First vector.
        b: Second vector; must have the same length as *a*.

    Returns:
        A float in [-1, 1]. Returns 0.0 if either vector has zero
        magnitude (to avoid division by zero); this includes the case
        of two empty vectors.

    Raises:
        ValueError: If *a* and *b* differ in length.
    """
    if len(a) != len(b):
        # zip() would silently drop the surplus components from the dot
        # product while the magnitudes still covered the full vectors,
        # yielding a score that is not a cosine of anything.
        raise ValueError(
            f"vectors must have the same length, got {len(a)} and {len(b)}"
        )

    dot = sum(x * y for x, y in zip(a, b))
    mag_a = math.sqrt(sum(x * x for x in a))
    mag_b = math.sqrt(sum(x * x for x in b))
    if mag_a == 0.0 or mag_b == 0.0:
        return 0.0

    cosine = dot / (mag_a * mag_b)
    # Floating-point rounding can push the quotient marginally outside
    # [-1, 1] (e.g. for parallel vectors). Clamp to honour the documented
    # range; explicit comparisons (rather than min/max) let NaN pass
    # through unchanged instead of being masked as a valid score.
    if cosine > 1.0:
        return 1.0
    if cosine < -1.0:
        return -1.0
    return cosine
|
|
|
|
|
|
def similarity_matrix(embeddings: list[list[float]]) -> list[list[float]]:
    """Compute the symmetric NxN table of pairwise cosine similarities.

    Args:
        embeddings: The N vectors to compare.

    Returns:
        A list of N rows of N floats where ``matrix[i][j]`` holds the
        cosine similarity of ``embeddings[i]`` and ``embeddings[j]``.
        Diagonal entries are fixed at 1.0 without being computed, and
        each off-diagonal pair is computed once and mirrored.
    """
    size = len(embeddings)
    # Start from an identity-like grid: 1.0 on the diagonal, 0.0 elsewhere.
    grid: list[list[float]] = [
        [1.0 if row == col else 0.0 for col in range(size)]
        for row in range(size)
    ]
    for row, left in enumerate(embeddings):
        # Only the upper triangle is evaluated; the lower one is mirrored.
        for col in range(row + 1, size):
            score = cosine_similarity(left, embeddings[col])
            grid[row][col] = score
            grid[col][row] = score
    return grid
|
|
|
|
|
|
def find_similar_pairs(
    embeddings: dict[str, list[float]],
    threshold: float = 0.80,
) -> list[tuple[str, str, float]]:
    """Report every pair of slugs whose cosine similarity is >= *threshold*.

    Args:
        embeddings: Mapping of slug → embedding vector.
        threshold: Lowest similarity that still counts as a match
            (default 0.80).

    Returns:
        List of ``(slug_a, slug_b, similarity)`` tuples ordered from
        most to least similar. Within a tuple ``slug_a`` sorts before
        ``slug_b``; each unordered pair appears at most once.
    """
    ordered = sorted(embeddings)

    def _scored_pairs():
        # Walk each unordered pair exactly once, in sorted-slug order.
        for position, first in enumerate(ordered, start=1):
            for second in ordered[position:]:
                score = cosine_similarity(embeddings[first], embeddings[second])
                yield first, second, score

    matches: list[tuple[str, str, float]] = [
        triple for triple in _scored_pairs() if triple[2] >= threshold
    ]
    # sorted() is stable, so equal scores keep their sorted-slug order.
    return sorted(matches, key=lambda triple: triple[2], reverse=True)
|