feat(llm): add embedding adapter with cache and similarity utils (S1.3)

Add OpenAI-compatible embedding support (works with both OpenAI and OpenRouter), a file-based embedding cache with content-digest invalidation, and pure-Python cosine similarity utilities for downstream redundancy detection.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-19 01:22:21 +01:00
parent 9031e1162c
commit 267368eb60
7 changed files with 588 additions and 0 deletions
--- a/markitect/llm/similarity.py
+++ b/markitect/llm/similarity.py
@@ -0,0 +1,64 @@
+"""
+Pure-Python vector similarity utilities.
+
+No external dependencies — uses :mod:`math` only.  Sufficient for the
+current entity scale (~100s).  numpy can be substituted later if needed.
+"""
+
+import math
+
+
+def cosine_similarity(a: list[float], b: list[float]) -> float:
+    """Cosine similarity between two vectors of equal length.
+
+    Args:
+        a: First vector.
+        b: Second vector; must have as many components as *a*.
+
+    Returns:
+        A float in [-1, 1].  Returns 0.0 if either vector has zero
+        magnitude (to avoid division by zero).
+
+    Raises:
+        ValueError: If *a* and *b* differ in length.
+    """
+    # zip() would silently truncate to the shorter vector and yield a
+    # plausible-looking but meaningless score, so reject mismatches.
+    if len(a) != len(b):
+        raise ValueError(
+            f"vector length mismatch: {len(a)} != {len(b)}"
+        )
+    dot = sum(x * y for x, y in zip(a, b))
+    mag_a = math.sqrt(sum(x * x for x in a))
+    mag_b = math.sqrt(sum(x * x for x in b))
+    if mag_a == 0.0 or mag_b == 0.0:
+        return 0.0
+    sim = dot / (mag_a * mag_b)
+    # Floating-point rounding can push the ratio marginally outside the
+    # mathematical range (e.g. 1.0000000000000002 for parallel vectors).
+    # Plain comparisons are used instead of min()/max() so that a NaN
+    # result is passed through unchanged rather than masked.
+    if sim > 1.0:
+        return 1.0
+    if sim < -1.0:
+        return -1.0
+    return sim
+
+
+def similarity_matrix(embeddings: list[list[float]]) -> list[list[float]]:
+    """Build an NxN cosine similarity matrix.
+
+    ``matrix[i][j]`` is the cosine similarity between
+    ``embeddings[i]`` and ``embeddings[j]``.  The matrix is symmetric,
+    so only the upper triangle is computed and then mirrored.
+
+    Args:
+        embeddings: Vectors to compare pairwise.
+
+    Returns:
+        An N-by-N list of lists, where N is ``len(embeddings)``; an
+        empty list when *embeddings* is empty.
+    """
+    n = len(embeddings)
+    mat: list[list[float]] = [[0.0] * n for _ in range(n)]
+    for i, vec in enumerate(embeddings):
+        # cosine_similarity() scores a zero-magnitude vector as 0.0
+        # against everything, itself included; any other vector is
+        # perfectly similar to itself.  Deriving the diagonal from the
+        # magnitude keeps it consistent with the off-diagonal entries.
+        mat[i][i] = 1.0 if sum(x * x for x in vec) > 0.0 else 0.0
+        for j in range(i + 1, n):
+            sim = cosine_similarity(vec, embeddings[j])
+            mat[i][j] = sim
+            mat[j][i] = sim
+    return mat
+
+
+def find_similar_pairs(
+    embeddings: dict[str, list[float]],
+    threshold: float = 0.80,
+) -> list[tuple[str, str, float]]:
+    """Collect slug pairs whose cosine similarity is >= *threshold*.
+
+    Each unordered pair is scored once; within a tuple, the slug that
+    sorts first comes first.
+
+    Args:
+        embeddings: Mapping of slug → embedding vector.
+        threshold: Inclusive lower bound on similarity (default 0.80).
+
+    Returns:
+        ``(slug_a, slug_b, similarity)`` tuples ordered from most to
+        least similar.
+    """
+    ordered = sorted(embeddings)
+    matches: list[tuple[str, str, float]] = [
+        (first, second, score)
+        for pos, first in enumerate(ordered, start=1)
+        for second in ordered[pos:]
+        if (
+            score := cosine_similarity(embeddings[first], embeddings[second])
+        ) >= threshold
+    ]
+    # The sort is stable, so equally similar pairs keep their slug order.
+    return sorted(matches, key=lambda match: match[2], reverse=True)