Files
llm-connect/llm_connect/similarity.py
tegwick e499edba90 feat: initial llm-connect package scaffold
Copy markitect.llm module into standalone llm_connect package.
All markitect.* imports replaced with llm_connect.* equivalents.
LLMError base class inlined (no markitect.exceptions dependency).
Verified: from llm_connect import create_adapter works.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-27 07:54:42 +01:00

65 lines
2.0 KiB
Python

"""
Pure-Python vector similarity utilities.
No external dependencies — uses :mod:`math` only. Sufficient for the
current entity scale (~100s). numpy can be substituted later if needed.
"""
import math
def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Cosine similarity between two vectors.

    Returns a float in [-1, 1]. Returns 0.0 if either vector has
    zero magnitude (to avoid division by zero); this covers empty
    vectors and is checked before the length validation below.

    Args:
        a: First vector.
        b: Second vector.

    Raises:
        ValueError: If both vectors have non-zero magnitude but
            different lengths.
    """
    mag_a = math.sqrt(sum(x * x for x in a))
    mag_b = math.sqrt(sum(x * x for x in b))
    if mag_a == 0.0 or mag_b == 0.0:
        return 0.0

    # zip() stops at the shorter input, so a length mismatch would
    # silently combine a truncated dot product with full-length
    # magnitudes and yield a meaningless score.
    if len(a) != len(b):
        raise ValueError(
            f"vector length mismatch: {len(a)} != {len(b)}"
        )

    dot = sum(x * y for x, y in zip(a, b))
    sim = dot / (mag_a * mag_b)

    # Rounding can leave the quotient a hair outside [-1, 1]; clamp so
    # the documented range holds. Explicit comparisons (rather than
    # min/max) let a NaN result pass through unchanged.
    if sim > 1.0:
        return 1.0
    if sim < -1.0:
        return -1.0
    return sim
def similarity_matrix(embeddings: list[list[float]]) -> list[list[float]]:
    """Compute the symmetric NxN table of pairwise cosine similarities.

    ``result[i][j]`` holds the cosine similarity of ``embeddings[i]``
    and ``embeddings[j]``. Every diagonal entry is set to 1.0 directly,
    without calling :func:`cosine_similarity`. Each unordered pair is
    scored once and the value is mirrored across the diagonal.
    """
    size = len(embeddings)

    # Start from the identity matrix: 1.0 on the diagonal, 0.0 elsewhere.
    grid: list[list[float]] = [
        [1.0 if row == col else 0.0 for col in range(size)]
        for row in range(size)
    ]

    # Fill the upper triangle and copy each score to its mirrored cell.
    for row, left in enumerate(embeddings):
        for col in range(row + 1, size):
            score = cosine_similarity(left, embeddings[col])
            grid[row][col] = score
            grid[col][row] = score

    return grid
def find_similar_pairs(
    embeddings: dict[str, list[float]],
    threshold: float = 0.80,
) -> list[tuple[str, str, float]]:
    """Collect every pair of slugs whose cosine similarity is >= *threshold*.

    Slugs are visited in sorted order and each unordered pair is scored
    once, so in every returned tuple ``slug_a`` sorts before ``slug_b``.

    Args:
        embeddings: Mapping of slug → embedding vector.
        threshold: Minimum similarity to include (default 0.80).

    Returns:
        List of ``(slug_a, slug_b, similarity)`` tuples sorted by
        similarity descending. The sort is stable, so pairs with equal
        similarity stay in the order they were found.
    """
    ordered = sorted(embeddings)
    hits: list[tuple[str, str, float]] = []

    # ``offset`` is one past the current slug's position, so the slice
    # below yields only the slugs that come after it.
    for offset, first in enumerate(ordered, start=1):
        first_vec = embeddings[first]
        for second in ordered[offset:]:
            score = cosine_similarity(first_vec, embeddings[second])
            if score >= threshold:
                hits.append((first, second, score))

    return sorted(hits, key=lambda hit: hit[2], reverse=True)