generated from coulomb/repo-seed
59 lines
1.7 KiB
Python
59 lines
1.7 KiB
Python
from __future__ import annotations
|
|
|
|
import hashlib
|
|
import math
|
|
import re
|
|
from typing import Protocol
|
|
|
|
|
|
class EmbeddingProvider(Protocol):
    """Structural interface for objects that map text to a float vector."""

    # Identifier string exposed by the provider.
    name: str

    def embed(self, text: str) -> list[float]:
        """Produce a deterministic vector for the given text."""
        ...
|
|
|
|
|
|
class HashingEmbeddingProvider:
    """Offline test provider using hashed token buckets.

    This is intentionally simple: it gives tests and local development a stable
    semantic path without depending on an external model service.

    Each token is hashed with SHA-256. The first two digest bytes choose a
    bucket and the parity of the third byte chooses whether +1.0 or -1.0 is
    added to that bucket. The accumulated vector is then scaled to unit
    length.
    """

    name = "hashing-v1"

    def __init__(self, dimensions: int = 64) -> None:
        """Create a provider that emits vectors with ``dimensions`` components.

        Args:
            dimensions: Number of buckets in every vector; must be positive.

        Raises:
            ValueError: If ``dimensions`` is zero or negative. Such a value
                would otherwise only fail later, inside ``embed``, with a
                ``ZeroDivisionError`` or ``IndexError``.
        """
        if dimensions <= 0:
            raise ValueError(
                f"dimensions must be a positive integer, got {dimensions!r}"
            )
        self.dimensions = dimensions

    def embed(self, text: str) -> list[float]:
        """Return the hashed bag-of-tokens vector for ``text``.

        Args:
            text: Input string; it is split into tokens by ``_tokens``.

        Returns:
            A list of ``self.dimensions`` floats scaled to unit Euclidean
            length. When every bucket sums to zero (for example when the text
            yields no tokens) the all-zero vector is returned unscaled.
        """
        vector = [0.0] * self.dimensions
        for token in _tokens(text):
            digest = hashlib.sha256(token.encode("utf-8")).digest()
            # Two digest bytes give 65,536 distinct values, so bucket indices
            # at or above 65,536 are never selected when dimensions is larger.
            index = int.from_bytes(digest[:2], "big") % self.dimensions
            # Signed updates let colliding tokens partially cancel instead of
            # always inflating the shared bucket.
            sign = 1.0 if digest[2] % 2 == 0 else -1.0
            vector[index] += sign
        norm = math.sqrt(sum(value * value for value in vector))
        if norm == 0:
            # Nothing to scale; dividing would raise ZeroDivisionError.
            return vector
        return [value / norm for value in vector]
|
|
|
|
|
|
def cosine_similarity(left: list[float], right: list[float]) -> float:
|
|
if not left or not right or len(left) != len(right):
|
|
return 0.0
|
|
return sum(a * b for a, b in zip(left, right, strict=True))
|
|
|
|
|
|
def _tokens(text: str) -> list[str]:
    """Split text into lowercased ASCII alphanumeric runs and stem each one."""
    lowered = text.lower()
    return [_stem(word) for word in re.findall(r"[A-Za-z0-9]+", lowered)]
|
|
|
|
|
|
def _stem(token: str) -> str:
    """Strip the first matching suffix among "ing", "ed", "es", "s".

    A suffix is removed only when more than three characters would remain;
    otherwise the token is returned unchanged.
    """
    length = len(token)
    for ending in ("ing", "ed", "es", "s"):
        keep = length - len(ending)
        # Skip endings that do not match or would leave too short a stem.
        if keep <= 3 or not token.endswith(ending):
            continue
        return token[:keep]
    return token
|