optional semantic retrieval

This commit is contained in:
2026-04-26 16:05:27 +02:00
parent 7c3cd2ab63
commit 1bac1832f0
11 changed files with 453 additions and 3 deletions

View File

@@ -0,0 +1,44 @@
# Semantic Retrieval Notes
T02 introduces semantic retrieval as an optional layer above the existing SQLite
text search. The default service path remains text-only so existing callers keep
stable result sets and ordering.
## Local provider
`HashingEmbeddingProvider` is the offline provider used for tests and local
development. It produces deterministic token-bucket vectors without any network
dependency. Configure it with:
```bash
REPO_REGISTRY_EMBEDDING_PROVIDER=hashing
```
When enabled, search combines three signals into a hybrid score:
- text match score from the existing SQLite search path (weight 0.55)
- vector score from approved ability/capability entries and content chunks (weight 0.35)
- approved confidence as a small ranking prior (weight 0.10)
## PostgreSQL / pgvector path
SQLite dev mode should remain the lowest-friction path. A production PostgreSQL
deployment can add pgvector without changing the registry API by introducing an
embedding table keyed by source entity:
```sql
CREATE TABLE registry_embeddings (
id bigserial PRIMARY KEY,
repository_id bigint NOT NULL,
source_table text NOT NULL,
source_id bigint NOT NULL,
provider text NOT NULL,
vector vector(768) NOT NULL,
updated_at timestamptz NOT NULL DEFAULT now(),
UNIQUE (source_table, source_id, provider)
);
```
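The search service can then replace runtime embedding of stored text with indexed
nearest-neighbor lookup, while retaining the current hybrid rank formula and the
same response schema. The `vector(768)` column width must match the dimension of
whichever provider populates the table; the offline hashing provider emits
64-dimensional vectors by default, so it would need a different width.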

View File

@@ -245,6 +245,9 @@ class SearchResult:
capability_name: str | None = None
evidence_level: str | None = None
source_reference: str | None = None
text_score: float = 0.0
vector_score: float = 0.0
hybrid_score: float = 0.0
@dataclass(frozen=True)

View File

@@ -1,7 +1,7 @@
from __future__ import annotations
from collections.abc import Sequence
from dataclasses import asdict
from dataclasses import asdict, replace
from repo_registry.core.models import (
AbilitySummary,
@@ -30,6 +30,7 @@ from repo_registry.llm_extraction.mapper import LLMExtractionMapper
from repo_registry.repo_ingestion.git import GitIngestionService
from repo_registry.repo_ingestion.metadata import RepositoryMetadataExtractor
from repo_registry.repo_scanning.scanner import DeterministicScanner
from repo_registry.semantic import EmbeddingProvider, cosine_similarity
from repo_registry.storage.sqlite import RegistryStore
@@ -41,6 +42,7 @@ class RegistryService:
store: RegistryStore,
ingestion: GitIngestionService | None = None,
llm_extractor: LLMCandidateExtractor | None = None,
embedding_provider: EmbeddingProvider | None = None,
) -> None:
self.store = store
self.scanner = DeterministicScanner()
@@ -50,6 +52,7 @@ class RegistryService:
self.content_extractor = ContentExtractor()
self.llm_extractor = llm_extractor
self.llm_mapper = LLMExtractionMapper()
self.embedding_provider = embedding_provider
def register_repository(
self,
@@ -1319,7 +1322,7 @@ class RegistryService:
ability: str | None = None,
capability: str | None = None,
) -> list[SearchResult]:
return self.store.search(
text_results = self.store.search(
query,
status=status,
language=language,
@@ -1327,3 +1330,241 @@ class RegistryService:
ability=ability,
capability=capability,
)
if self.embedding_provider is None:
return text_results
return self._hybrid_search(
query,
text_results,
status=status,
language=language,
framework=framework,
ability=ability,
capability=capability,
)
def _hybrid_search(
    self,
    query: str,
    text_results: list[SearchResult],
    *,
    status: str | None,
    language: str | None,
    framework: str | None,
    ability: str | None,
    capability: str | None,
) -> list[SearchResult]:
    """Blend text hits with embedding-similarity candidates into one ranked list.

    Args:
        query: Search text; embedded once with ``self.embedding_provider``.
        text_results: Hits already produced by the text search path.
        status, language, framework, ability, capability: Filters forwarded
            unchanged to ``_semantic_candidates``.

    Returns:
        The union of text hits and sufficiently similar semantic candidates,
        ordered by descending hybrid score, vector score and confidence, then
        by repository name, match type and match name.
    """
    provider = self.embedding_provider
    query_vector = provider.embed(query)
    semantic_pairs = self._semantic_candidates(
        status=status,
        language=language,
        framework=framework,
        ability=ability,
        capability=capability,
    )

    # Seed the merge table with the text hits: each gets a text score of at
    # least 1.0 and a hybrid score no lower than its own confidence.
    merged = {}
    for hit in text_results:
        merged[self._search_result_key(hit)] = replace(
            hit,
            text_score=max(hit.text_score, 1.0),
            hybrid_score=max(hit.hybrid_score, hit.confidence),
        )

    for text, candidate in semantic_pairs:
        # Negative similarities are clamped to zero before thresholding.
        vector_score = max(
            0.0,
            cosine_similarity(query_vector, provider.embed(text)),
        )
        if vector_score < 0.18:
            continue

        text_match = merged.get(self._search_result_key(candidate))
        # Prefer the entry already in the table so its fields are preserved.
        base = text_match or candidate
        text_score = 0.0 if text_match is None else text_match.text_score
        blended = (
            0.55 * text_score
            + 0.35 * vector_score
            + 0.10 * candidate.confidence
        )
        ranked = replace(
            base,
            vector_score=max(vector_score, base.vector_score),
            text_score=text_score,
            # Never lower a score the entry already carried.
            hybrid_score=max(blended, base.hybrid_score),
            matched_field=base.matched_field or "semantic",
        )
        merged[self._search_result_key(ranked)] = ranked

    def rank_key(entry):
        # Scores are negated so that an ascending sort puts the best first.
        return (
            -entry.hybrid_score,
            -entry.vector_score,
            -entry.confidence,
            entry.repository_name.lower(),
            entry.match_type,
            entry.match_name.lower(),
        )

    return sorted(merged.values(), key=rank_key)
def _semantic_candidates(
    self,
    *,
    status: str | None,
    language: str | None,
    framework: str | None,
    ability: str | None,
    capability: str | None,
) -> list[tuple[str, SearchResult]]:
    """Gather ``(text, result)`` pairs for every repository passing the filters.

    A repository is skipped when ``status`` is given and differs from its own
    status, when its observed facts fail the language or framework filter, or
    when its ability map fails the ability/capability filter. Surviving
    repositories contribute their ability/capability entries followed by their
    content chunks.
    """
    observed_filters = (("language", language), ("framework", framework))
    collected: list[tuple[str, SearchResult]] = []

    for repo in self.store.list_repositories():
        if status and repo.status != status:
            continue

        observed = self.store.list_observed_facts(repo.id)
        # all() short-circuits, so the framework check only runs when the
        # language check has passed.
        if not all(
            self._repository_matches_observed_filter(observed, kind, wanted)
            for kind, wanted in observed_filters
        ):
            continue

        approved_map = self.store.get_ability_map(repo.id)
        if not self._ability_map_matches_filter(
            approved_map,
            ability=ability,
            capability=capability,
        ):
            continue

        collected += self._approved_entry_candidates(approved_map)
        collected += self._content_chunk_candidates(repo, approved_map)

    return collected
def _approved_entry_candidates(
    self,
    ability_map: RepositoryAbilityMap,
) -> list[tuple[str, SearchResult]]:
    """Turn each ability and capability in ``ability_map`` into a candidate.

    Every candidate is a ``(text, result)`` pair: ``text`` is the string to
    embed and ``result`` is a ``SearchResult`` with ``matched_field`` set to
    ``"semantic"``. Each ability is emitted first, followed by its capabilities.
    """
    repo = ability_map.repository
    pairs: list[tuple[str, SearchResult]] = []

    for ability in ability_map.abilities:
        ability_text = f"{ability.name} {ability.description}"
        ability_result = SearchResult(
            repository_id=repo.id,
            repository_name=repo.name,
            match_type="ability",
            match_name=ability.name,
            confidence=ability.confidence,
            confidence_label=ability.confidence_label,
            match_description=ability.description,
            matched_field="semantic",
            ability_id=ability.id,
            ability_name=ability.name,
        )
        pairs.append((ability_text, ability_result))

        for capability in ability.capabilities:
            # The capability text is prefixed with the parent ability name and
            # includes its inputs and outputs.
            text_parts = (
                ability.name,
                capability.name,
                capability.description,
                " ".join(capability.inputs),
                " ".join(capability.outputs),
            )
            capability_text = " ".join(text_parts)
            capability_result = SearchResult(
                repository_id=repo.id,
                repository_name=repo.name,
                match_type="capability",
                match_name=capability.name,
                confidence=capability.confidence,
                confidence_label=capability.confidence_label,
                match_description=capability.description,
                matched_field="semantic",
                ability_id=ability.id,
                ability_name=ability.name,
                capability_id=capability.id,
                capability_name=capability.name,
            )
            pairs.append((capability_text, capability_result))

    return pairs
def _content_chunk_candidates(
    self,
    repository: Repository,
    ability_map: RepositoryAbilityMap,
) -> list[tuple[str, SearchResult]]:
    """Build one ``(text, result)`` candidate per stored content chunk.

    Returns an empty list, without querying the store, when the ability map
    has no abilities. Every chunk result carries a fixed confidence of 0.5
    labelled ``"medium"`` and a description cut to the first 240 characters
    of the chunk text.
    """
    if not ability_map.abilities:
        return []

    return [
        (
            chunk.text,
            SearchResult(
                repository_id=repository.id,
                repository_name=repository.name,
                match_type="content_chunk",
                match_name=f"{chunk.path}:{chunk.start_line}-{chunk.end_line}",
                confidence=0.5,
                confidence_label="medium",
                match_description=chunk.text[:240],
                matched_field="semantic",
                source_reference=f"{chunk.path}:{chunk.start_line}",
            ),
        )
        for chunk in self.store.list_content_chunks(repository.id)
    ]
def _repository_matches_observed_filter(
    self,
    facts: Sequence[ObservedFact],
    kind: str,
    expected: str | None,
) -> bool:
    """Report whether any fact of ``kind`` has a name containing ``expected``.

    The comparison is a case-insensitive substring match. A missing or empty
    ``expected`` means no filter was requested, so the result is ``True``.
    """
    if not expected:
        return True

    needle = expected.lower()
    for fact in facts:
        if fact.kind == kind and needle in fact.name.lower():
            return True
    return False
def _ability_map_matches_filter(
    self,
    ability_map: RepositoryAbilityMap,
    *,
    ability: str | None,
    capability: str | None,
) -> bool:
    """Report whether the map holds an ability/capability matching the filters.

    Both filters are case-insensitive substring matches against the name or the
    description. When both are given, the capability must belong to an ability
    that itself matches the ability filter. With no filter at all the result is
    ``True``.
    """
    if not ability and not capability:
        return True

    ability_needle = ability.lower() if ability else None
    capability_needle = capability.lower() if capability else None

    for candidate in ability_map.abilities:
        if ability_needle is not None and not (
            ability_needle in candidate.name.lower()
            or ability_needle in candidate.description.lower()
        ):
            continue

        # Only the ability filter was requested and this ability satisfied it.
        if capability_needle is None:
            return True

        if any(
            capability_needle in entry.name.lower()
            or capability_needle in entry.description.lower()
            for entry in candidate.capabilities
        ):
            return True

    return False
def _search_result_key(self, result: SearchResult) -> tuple[object, ...]:
    """Return the tuple used to de-duplicate search results.

    Two results share a key when they agree on repository, match type,
    ability, capability, match name and source reference.
    """
    identity_fields = (
        "repository_id",
        "match_type",
        "ability_id",
        "capability_id",
        "match_name",
        "source_reference",
    )
    return tuple(getattr(result, field) for field in identity_fields)

View File

@@ -0,0 +1,11 @@
from repo_registry.semantic.embeddings import (
EmbeddingProvider,
HashingEmbeddingProvider,
cosine_similarity,
)
__all__ = [
"EmbeddingProvider",
"HashingEmbeddingProvider",
"cosine_similarity",
]

View File

@@ -0,0 +1,58 @@
from __future__ import annotations
import hashlib
import math
import re
from typing import Protocol
class EmbeddingProvider(Protocol):
    """Structural interface for anything that can turn text into a vector."""

    # Identifier of the provider implementation.
    name: str

    def embed(self, text: str) -> list[float]:
        """Return a deterministic vector for the supplied text."""
        ...
class HashingEmbeddingProvider:
    """Offline test provider using hashed token buckets.

    This is intentionally simple: it gives tests and local development a stable
    semantic path without depending on an external model service.
    """

    name = "hashing-v1"

    def __init__(self, dimensions: int = 64) -> None:
        """Create a provider that emits vectors of length ``dimensions``.

        Raises:
            ValueError: If ``dimensions`` is not positive. A zero or negative
                size would otherwise only fail later, inside ``embed``.
        """
        if dimensions < 1:
            raise ValueError(
                f"dimensions must be a positive integer, got {dimensions!r}"
            )
        self.dimensions = dimensions

    def embed(self, text: str) -> list[float]:
        """Return the L2-normalized signed bucket-count vector for ``text``.

        Each token is hashed with SHA-256. The first two digest bytes select
        the bucket and the parity of the third byte selects the sign. Text
        with no tokens yields the all-zero vector.
        """
        vector = [0.0] * self.dimensions
        for token in _tokens(text):
            digest = hashlib.sha256(token.encode("utf-8")).digest()
            # Only 16 bits pick the bucket, so buckets at index 65536 and
            # above are never hit.
            index = int.from_bytes(digest[:2], "big") % self.dimensions
            sign = 1.0 if digest[2] % 2 == 0 else -1.0
            vector[index] += sign
        norm = math.sqrt(sum(value * value for value in vector))
        if norm == 0:
            # Nothing to normalize: no tokens, or the signed counts cancelled.
            return vector
        return [value / norm for value in vector]
def cosine_similarity(left: list[float], right: list[float]) -> float:
    """Return the cosine of the angle between ``left`` and ``right``.

    The dot product is divided by both vector norms, so the result is correct
    for inputs that are not unit-length; the provider protocol does not promise
    normalized vectors.

    Returns:
        A value in ``[-1.0, 1.0]``, or ``0.0`` when either vector is empty,
        the lengths differ, or either vector has zero magnitude.
    """
    if not left or not right or len(left) != len(right):
        return 0.0
    left_norm = math.sqrt(sum(a * a for a in left))
    right_norm = math.sqrt(sum(b * b for b in right))
    if left_norm == 0.0 or right_norm == 0.0:
        return 0.0
    dot = sum(a * b for a, b in zip(left, right))
    # Clamp so float rounding cannot push the result just outside [-1, 1].
    return max(-1.0, min(1.0, dot / (left_norm * right_norm)))
def _tokens(text: str) -> list[str]:
    """Lowercase ``text``, split it into ASCII alphanumeric runs, and stem each."""
    return [_stem(word) for word in re.findall(r"[A-Za-z0-9]+", text.lower())]
def _stem(token: str) -> str:
    """Strip the first matching suffix among ``ing``, ``ed``, ``es``, ``s``.

    A suffix is removed only when more than three characters would remain;
    otherwise the token is returned unchanged.
    """
    stripped = (
        token[: len(token) - len(ending)]
        for ending in ("ing", "ed", "es", "s")
        if token.endswith(ending) and len(token) - len(ending) > 3
    )
    # The generator is lazy, so only the first qualifying suffix is applied.
    return next(stripped, token)

View File

@@ -1812,6 +1812,8 @@ class RegistryStore:
confidence_label=confidence_label(1.0),
match_description=row["description"] or "",
matched_field=matched_field,
text_score=1.0,
hybrid_score=1.0,
)
)
for row in ability_rows:
@@ -1830,6 +1832,8 @@ class RegistryStore:
matched_field=matched_field,
ability_id=row["ability_id"],
ability_name=row["ability_name"],
text_score=1.0,
hybrid_score=row["confidence"],
)
)
for row in capability_rows:
@@ -1852,6 +1856,8 @@ class RegistryStore:
ability_name=row["ability_name"],
capability_id=row["capability_id"],
capability_name=row["capability_name"],
text_score=1.0,
hybrid_score=row["confidence"],
)
)
for row in feature_rows:
@@ -1878,6 +1884,8 @@ class RegistryStore:
capability_id=row["capability_id"],
capability_name=row["capability_name"],
source_reference=row["location"],
text_score=1.0,
hybrid_score=row["confidence"],
)
)
for row in evidence_rows:
@@ -1907,11 +1915,14 @@ class RegistryStore:
capability_name=row["capability_name"],
evidence_level=row["strength"],
source_reference=row["reference"],
text_score=1.0,
hybrid_score=self._evidence_confidence(row["strength"]),
)
)
return sorted(
results,
key=lambda result: (
-result.hybrid_score,
-result.confidence,
result.repository_name.lower(),
result.match_type,

View File

@@ -11,6 +11,7 @@ from pydantic_settings import BaseSettings, SettingsConfigDict
from repo_registry.core.service import RegistryService
from repo_registry.llm_extraction import LLMCandidateExtractor, create_llm_connect_adapter
from repo_registry.repo_ingestion.git import GitIngestionService
from repo_registry.semantic import HashingEmbeddingProvider
from repo_registry.storage.sqlite import NotFoundError, RegistryStore
from repo_registry.web_api.schemas import (
AbilityCreate,
@@ -60,6 +61,7 @@ class Settings(BaseSettings):
checkout_root: str = Field(default="var/checkouts")
llm_provider: str | None = Field(default=None)
llm_model: str | None = Field(default=None)
embedding_provider: str | None = Field(default=None)
def get_settings() -> Settings:
@@ -78,10 +80,14 @@ def get_service(settings: Settings = Depends(get_settings)) -> RegistryService:
model=settings.llm_model,
)
llm_extractor = LLMCandidateExtractor(adapter)
embedding_provider = None
if settings.embedding_provider == "hashing":
embedding_provider = HashingEmbeddingProvider()
return RegistryService(
store,
ingestion=GitIngestionService(settings.checkout_root),
llm_extractor=llm_extractor,
embedding_provider=embedding_provider,
)

View File

@@ -654,6 +654,9 @@ class SearchResultResponse(BaseModel):
capability_name: str | None = None
evidence_level: str | None = None
source_reference: str | None = None
text_score: float = 0.0
vector_score: float = 0.0
hybrid_score: float = 0.0
model_config = {
"json_schema_extra": {
@@ -673,6 +676,9 @@ class SearchResultResponse(BaseModel):
"capability_name": "Classify Incoming Email",
"evidence_level": None,
"source_reference": None,
"text_score": 1.0,
"vector_score": 0.0,
"hybrid_score": 0.88,
}
]
}

View File

@@ -3,6 +3,7 @@ import subprocess
from repo_registry.core.service import RegistryService
from repo_registry.llm_extraction import ExtractedAbility, ExtractedCapability
from repo_registry.repo_ingestion.git import GitIngestionService
from repo_registry.semantic import HashingEmbeddingProvider
from repo_registry.storage.sqlite import NotFoundError, RegistryStore
@@ -290,6 +291,60 @@ def test_search_filters_by_status_language_and_framework(tmp_path):
assert wrong_capability_results == []
def test_semantic_search_adds_hybrid_matches_without_changing_text_default(tmp_path):
    """Text-only search misses the query; an embedding provider adds semantic hits."""
    repo_dir = tmp_path / "repo"
    repo_dir.mkdir()
    readme_body = "# Queue Worker\n\nHandles postponed customer jobs.\n"
    (repo_dir / "README.md").write_text(readme_body, encoding="utf-8")

    baseline = make_service(tmp_path)
    registered = baseline.register_repository(
        name="Queue Worker",
        url=str(repo_dir),
        description="Processes deferred customer work.",
    )
    job_ability_id = baseline.add_ability(
        registered.id,
        name="Background Job Processing",
        description="Run deferred work outside request handling.",
        confidence=0.8,
    )
    task_capability_id = baseline.add_capability(
        registered.id,
        job_ability_id,
        name="Process Customer Tasks",
        description="Execute queued customer tasks asynchronously.",
        confidence=0.7,
    )
    baseline.add_feature(
        registered.id,
        task_capability_id,
        name="worker task loop",
        type="background worker",
        location="worker.py",
        confidence=0.6,
    )
    baseline.analyze_repository(registered.id)

    # Without an embedding provider the text path finds nothing for this query.
    assert baseline.search("customer queued") == []

    # Same store, now with the offline hashing provider enabled.
    hybrid = RegistryService(
        baseline.store,
        ingestion=GitIngestionService(tmp_path / "checkouts"),
        embedding_provider=HashingEmbeddingProvider(),
    )
    results = hybrid.search("customer queued")

    assert results
    best = results[0]
    assert best.match_type in {"capability", "content_chunk"}
    assert best.matched_field == "semantic"
    assert best.vector_score > 0
    assert best.hybrid_score >= best.vector_score * 0.35
    assert "content_chunk" in {hit.match_type for hit in results}
def test_register_repository_imports_metadata_when_name_is_omitted(tmp_path):
source = tmp_path / "metadata-source"
source.mkdir()

View File

@@ -410,11 +410,25 @@ def test_api_service_settings_can_enable_llm_extractor(monkeypatch, tmp_path):
assert service.llm_extractor is not None
def test_api_service_settings_can_enable_hashing_embedding_provider(tmp_path):
    """Setting ``embedding_provider="hashing"`` wires in the hashing provider."""
    settings = Settings(
        database_path=str(tmp_path / "embedding-settings.sqlite3"),
        checkout_root=str(tmp_path / "checkouts"),
        embedding_provider="hashing",
    )

    provider = get_service(settings).embedding_provider

    assert provider is not None
    assert provider.name == "hashing-v1"
def test_settings_can_load_from_environment(monkeypatch):
monkeypatch.setenv("REPO_REGISTRY_DATABASE_PATH", "var/env.sqlite3")
monkeypatch.setenv("REPO_REGISTRY_CHECKOUT_ROOT", "var/env-checkouts")
monkeypatch.setenv("REPO_REGISTRY_LLM_PROVIDER", "mock")
monkeypatch.setenv("REPO_REGISTRY_LLM_MODEL", "demo-model")
monkeypatch.setenv("REPO_REGISTRY_EMBEDDING_PROVIDER", "hashing")
settings = Settings()
@@ -422,6 +436,7 @@ def test_settings_can_load_from_environment(monkeypatch):
assert settings.checkout_root == "var/env-checkouts"
assert settings.llm_provider == "mock"
assert settings.llm_model == "demo-model"
assert settings.embedding_provider == "hashing"
def test_api_analysis_run_loop(tmp_path):

View File

@@ -42,7 +42,7 @@ Candidate endpoints:
```task
id: RREG-WP-0002-T02
status: todo
status: done
priority: medium
state_hub_task_id: "0e7cce78-13ab-4aa2-8d25-ae50ff8ccd74"
```