Files
repo-scoping/src/repo_registry/semantic/embeddings.py
2026-04-26 16:05:27 +02:00

59 lines
1.7 KiB
Python

from __future__ import annotations
import hashlib
import math
import re
from typing import Protocol
class EmbeddingProvider(Protocol):
    """Structural interface for objects that turn text into a vector.

    Any object that exposes a ``name`` string and a compatible ``embed``
    method satisfies this protocol; inheriting from it is not required.
    """

    # Short label identifying the provider implementation
    # (for example "hashing-v1" for the hashing provider in this module).
    name: str

    def embed(self, text: str) -> list[float]:
        """Return a deterministic vector for the supplied text."""
        ...
class HashingEmbeddingProvider:
    """Offline test provider using hashed token buckets.

    This is intentionally simple: it gives tests and local development a stable
    semantic path without depending on an external model service.

    Attributes:
        name: Label identifying this provider implementation.
        dimensions: Length of every vector returned by :meth:`embed`.
    """

    name = "hashing-v1"

    def __init__(self, dimensions: int = 64) -> None:
        """Create a provider that emits vectors of ``dimensions`` floats.

        Args:
            dimensions: Number of hash buckets; must be at least 1.

        Raises:
            ValueError: If ``dimensions`` is smaller than 1. Without this check
                the failure would only surface later inside ``embed`` as a
                ``ZeroDivisionError`` or ``IndexError``.
        """
        if dimensions < 1:
            raise ValueError(
                f"dimensions must be a positive integer, got {dimensions!r}"
            )
        self.dimensions = dimensions

    def embed(self, text: str) -> list[float]:
        """Return the unit-length hashed bucket vector for ``text``.

        Each token yielded by the module's ``_tokens`` helper is hashed with
        SHA-256; the digest selects one bucket and a sign, and that bucket is
        incremented or decremented by one.

        Args:
            text: Input text to embed.

        Returns:
            A list of ``self.dimensions`` floats scaled to Euclidean norm 1.
            The all-zero vector is returned unchanged when ``text`` yields no
            tokens or when all contributions cancel each other out.
        """
        vector = [0.0] * self.dimensions
        for token in _tokens(text):
            digest = hashlib.sha256(token.encode("utf-8")).digest()
            # NOTE: only the first two digest bytes (16 bits) pick the bucket,
            # so buckets at index 65536 and above are never populated.
            index = int.from_bytes(digest[:2], "big") % self.dimensions
            # The parity of the third byte decides the sign, so tokens that
            # collide in one bucket can cancel instead of always adding up.
            sign = 1.0 if digest[2] % 2 == 0 else -1.0
            vector[index] += sign
        norm = math.sqrt(sum(value * value for value in vector))
        if norm == 0:
            # Nothing to scale; avoid dividing by zero.
            return vector
        return [value / norm for value in vector]
def cosine_similarity(left: list[float], right: list[float]) -> float:
if not left or not right or len(left) != len(right):
return 0.0
return sum(a * b for a, b in zip(left, right, strict=True))
def _tokens(text: str) -> list[str]:
    """Split ``text`` into lower-cased, stemmed ASCII alphanumeric tokens.

    The text is lower-cased first, every maximal run of ASCII letters and
    digits becomes one token, and each token is passed through ``_stem``.
    Tokens are returned in order of appearance, duplicates included.
    """
    words = re.findall(r"[A-Za-z0-9]+", text.lower())
    return [_stem(word) for word in words]
def _stem(token: str) -> str:
    """Strip one common English suffix from ``token`` when enough stem remains.

    The suffixes "ing", "ed", "es" and "s" are tried in that order and only
    the first match is removed. A suffix is removed only if more than three
    characters would be left; otherwise the token is returned unchanged.
    """
    size = len(token)
    for ending in ("ing", "ed", "es", "s"):
        remaining = size - len(ending)
        if token.endswith(ending) and remaining > 3:
            return token[:remaining]
    return token