""" Pure-Python vector similarity utilities. No external dependencies — uses :mod:`math` only. Sufficient for the current entity scale (~100s). numpy can be substituted later if needed. """ import math def cosine_similarity(a: list[float], b: list[float]) -> float: """Cosine similarity between two vectors. Returns a float in [-1, 1]. Returns 0.0 if either vector has zero magnitude (to avoid division by zero). """ dot = sum(x * y for x, y in zip(a, b)) mag_a = math.sqrt(sum(x * x for x in a)) mag_b = math.sqrt(sum(x * x for x in b)) if mag_a == 0.0 or mag_b == 0.0: return 0.0 return dot / (mag_a * mag_b) def similarity_matrix(embeddings: list[list[float]]) -> list[list[float]]: """Build an NxN cosine similarity matrix. ``matrix[i][j]`` is the cosine similarity between ``embeddings[i]`` and ``embeddings[j]``. """ n = len(embeddings) mat: list[list[float]] = [[0.0] * n for _ in range(n)] for i in range(n): mat[i][i] = 1.0 for j in range(i + 1, n): sim = cosine_similarity(embeddings[i], embeddings[j]) mat[i][j] = sim mat[j][i] = sim return mat def find_similar_pairs( embeddings: dict[str, list[float]], threshold: float = 0.80, ) -> list[tuple[str, str, float]]: """Find all pairs with cosine similarity >= *threshold*. Args: embeddings: Mapping of slug → embedding vector. threshold: Minimum similarity to include (default 0.80). Returns: List of ``(slug_a, slug_b, similarity)`` tuples sorted by similarity descending. """ slugs = sorted(embeddings) pairs: list[tuple[str, str, float]] = [] for i, slug_a in enumerate(slugs): for slug_b in slugs[i + 1:]: sim = cosine_similarity(embeddings[slug_a], embeddings[slug_b]) if sim >= threshold: pairs.append((slug_a, slug_b, sim)) pairs.sort(key=lambda t: t[2], reverse=True) return pairs