generated from coulomb/repo-seed
chore(consistency): sync task status from DB [auto]
Updated by fix-consistency on 2026-05-15: - update .custodian-brief.md for repo-scoping
This commit is contained in:
11
src/repo_scoping/semantic/__init__.py
Normal file
11
src/repo_scoping/semantic/__init__.py
Normal file
@@ -0,0 +1,11 @@
|
||||
# Re-export the embedding helpers so callers can import them directly from
# ``repo_scoping.semantic``.
#
# NOTE(review): the original import pointed at ``repo_registry.semantic``,
# which looks like a leftover from the repository this package was seeded
# from; the names below are defined in this package's own ``embeddings``
# module. Confirm no external ``repo_registry`` dependency was intended.
from repo_scoping.semantic.embeddings import (
    EmbeddingProvider,
    HashingEmbeddingProvider,
    cosine_similarity,
)

__all__ = [
    "EmbeddingProvider",
    "HashingEmbeddingProvider",
    "cosine_similarity",
]
|
||||
58
src/repo_scoping/semantic/embeddings.py
Normal file
58
src/repo_scoping/semantic/embeddings.py
Normal file
@@ -0,0 +1,58 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import math
|
||||
import re
|
||||
from typing import Protocol
|
||||
|
||||
|
||||
class EmbeddingProvider(Protocol):
    """Structural interface for objects that map text to a float vector.

    Any object exposing a ``name`` attribute and a compatible ``embed``
    method satisfies this protocol; no inheritance is required.
    """

    # Short label for the provider implementation (for example, the hashing
    # provider in this module uses "hashing-v1").
    name: str

    def embed(self, text: str) -> list[float]:
        """Return a deterministic vector for the supplied text."""
        ...
|
||||
|
||||
|
||||
class HashingEmbeddingProvider:
    """Offline test provider using hashed token buckets.

    This is intentionally simple: it gives tests and local development a stable
    semantic path without depending on an external model service.

    Each token is hashed with SHA-256. The first two digest bytes select a
    bucket (modulo ``dimensions``) and the parity of the third byte decides
    whether that bucket is incremented or decremented. The accumulated vector
    is then scaled to unit Euclidean length.
    """

    name = "hashing-v1"

    def __init__(self, dimensions: int = 64) -> None:
        """Create a provider that emits vectors of length ``dimensions``.

        Args:
            dimensions: Number of hash buckets, i.e. the length of every
                vector returned by :meth:`embed`. Must be at least 1.

        Raises:
            ValueError: If ``dimensions`` is smaller than 1. Without this
                check the failure would only surface later inside ``embed``
                as a ``ZeroDivisionError`` or ``IndexError``.
        """
        if dimensions < 1:
            raise ValueError(
                f"dimensions must be a positive integer, got {dimensions!r}"
            )
        self.dimensions = dimensions

    def embed(self, text: str) -> list[float]:
        """Return the unit-length hashed bucket vector for ``text``.

        Args:
            text: Input text; it is split into tokens by ``_tokens``.

        Returns:
            A list of ``self.dimensions`` floats. If the accumulated vector
            has zero magnitude (no tokens, or all contributions cancelled
            out) the all-zero vector is returned unscaled.
        """
        vector = [0.0] * self.dimensions
        for token in _tokens(text):
            digest = hashlib.sha256(token.encode("utf-8")).digest()
            # Only two digest bytes pick the bucket, so at most 65536 distinct
            # buckets are ever addressed regardless of ``dimensions``.
            index = int.from_bytes(digest[:2], "big") % self.dimensions
            # A signed update lets colliding tokens partially cancel instead
            # of always piling up in the same direction.
            sign = 1.0 if digest[2] % 2 == 0 else -1.0
            vector[index] += sign
        norm = math.sqrt(sum(value * value for value in vector))
        if norm == 0:
            # Avoid dividing by zero; the vector is already all zeros.
            return vector
        return [value / norm for value in vector]
|
||||
|
||||
|
||||
def cosine_similarity(left: list[float], right: list[float]) -> float:
|
||||
if not left or not right or len(left) != len(right):
|
||||
return 0.0
|
||||
return sum(a * b for a, b in zip(left, right, strict=True))
|
||||
|
||||
|
||||
def _tokens(text: str) -> list[str]:
|
||||
tokens = []
|
||||
for token in re.findall(r"[A-Za-z0-9]+", text.lower()):
|
||||
tokens.append(_stem(token))
|
||||
return tokens
|
||||
|
||||
|
||||
def _stem(token: str) -> str:
|
||||
for suffix in ("ing", "ed", "es", "s"):
|
||||
if len(token) > len(suffix) + 3 and token.endswith(suffix):
|
||||
return token[: -len(suffix)]
|
||||
return token
|
||||
Reference in New Issue
Block a user