From 1bac1832f06e6ac52a17144f9c37161b8aad5df6 Mon Sep 17 00:00:00 2001 From: tegwick Date: Sun, 26 Apr 2026 16:05:27 +0200 Subject: [PATCH] optional semantic retrieval --- docs/semantic-retrieval.md | 44 ++++ src/repo_registry/core/models.py | 3 + src/repo_registry/core/service.py | 245 +++++++++++++++++- src/repo_registry/semantic/__init__.py | 11 + src/repo_registry/semantic/embeddings.py | 58 +++++ src/repo_registry/storage/sqlite.py | 11 + src/repo_registry/web_api/app.py | 6 + src/repo_registry/web_api/schemas.py | 6 + tests/test_registry_service.py | 55 ++++ tests/test_web_api.py | 15 ++ .../RREG-WP-0002-production-hardening.md | 2 +- 11 files changed, 453 insertions(+), 3 deletions(-) create mode 100644 docs/semantic-retrieval.md create mode 100644 src/repo_registry/semantic/__init__.py create mode 100644 src/repo_registry/semantic/embeddings.py diff --git a/docs/semantic-retrieval.md b/docs/semantic-retrieval.md new file mode 100644 index 0000000..df439d3 --- /dev/null +++ b/docs/semantic-retrieval.md @@ -0,0 +1,44 @@ +# Semantic Retrieval Notes + +T02 introduces semantic retrieval as an optional layer above the existing SQLite +text search. The default service path remains text-only so existing callers keep +stable result sets and ordering. + +## Local provider + +`HashingEmbeddingProvider` is the offline provider used for tests and local +development. It produces deterministic token-bucket vectors without any network +dependency. Configure it with: + +```bash +REPO_REGISTRY_EMBEDDING_PROVIDER=hashing +``` + +When enabled, search combines: + +- text match score from the existing SQLite search path +- vector score from approved ability/capability entries and content chunks +- approved confidence as a small ranking prior + +## PostgreSQL / pgvector path + +SQLite dev mode should remain the lowest-friction path. 
A production PostgreSQL +deployment can add pgvector without changing the registry API by introducing an +embedding table keyed by source entity: + +```sql +CREATE TABLE registry_embeddings ( + id bigserial PRIMARY KEY, + repository_id bigint NOT NULL, + source_table text NOT NULL, + source_id bigint NOT NULL, + provider text NOT NULL, + vector vector(768) NOT NULL, + updated_at timestamptz NOT NULL DEFAULT now(), + UNIQUE (source_table, source_id, provider) +); +``` + +The search service can then replace runtime embedding of stored text with indexed +nearest-neighbor lookup, while retaining the current hybrid rank formula and the +same response schema. diff --git a/src/repo_registry/core/models.py b/src/repo_registry/core/models.py index 275f40a..8e0b5f9 100644 --- a/src/repo_registry/core/models.py +++ b/src/repo_registry/core/models.py @@ -245,6 +245,9 @@ class SearchResult: capability_name: str | None = None evidence_level: str | None = None source_reference: str | None = None + text_score: float = 0.0 + vector_score: float = 0.0 + hybrid_score: float = 0.0 @dataclass(frozen=True) diff --git a/src/repo_registry/core/service.py b/src/repo_registry/core/service.py index 2f250b0..14b7f00 100644 --- a/src/repo_registry/core/service.py +++ b/src/repo_registry/core/service.py @@ -1,7 +1,7 @@ from __future__ import annotations from collections.abc import Sequence -from dataclasses import asdict +from dataclasses import asdict, replace from repo_registry.core.models import ( AbilitySummary, @@ -30,6 +30,7 @@ from repo_registry.llm_extraction.mapper import LLMExtractionMapper from repo_registry.repo_ingestion.git import GitIngestionService from repo_registry.repo_ingestion.metadata import RepositoryMetadataExtractor from repo_registry.repo_scanning.scanner import DeterministicScanner +from repo_registry.semantic import EmbeddingProvider, cosine_similarity from repo_registry.storage.sqlite import RegistryStore @@ -41,6 +42,7 @@ class RegistryService: store: 
def _hybrid_search(
    self,
    query: str,
    text_results: list[SearchResult],
    *,
    status: str | None,
    language: str | None,
    framework: str | None,
    ability: str | None,
    capability: str | None,
) -> list[SearchResult]:
    """Merge text-search hits with embedding-similarity hits into one ranking.

    Args:
        query: Free-text query; embedded once with ``self.embedding_provider``.
        text_results: Hits already produced by the text search path.
        status, language, framework, ability, capability: Optional filters,
            forwarded unchanged to ``_semantic_candidates``.

    Returns:
        One ``SearchResult`` per distinct ``_search_result_key``, ordered by
        descending hybrid score, then vector score, then confidence, then
        repository name, match type and match name.
    """
    provider = self.embedding_provider
    query_vector = provider.embed(query)
    semantic_pool = self._semantic_candidates(
        status=status,
        language=language,
        framework=framework,
        ability=ability,
        capability=capability,
    )

    # Seed the table with the text hits: every text hit carries a text score
    # of at least 1.0 and a hybrid score of at least its own confidence.
    merged: dict[tuple[object, ...], SearchResult] = {}
    for hit in text_results:
        merged[self._search_result_key(hit)] = replace(
            hit,
            text_score=max(hit.text_score, 1.0),
            hybrid_score=max(hit.hybrid_score, hit.confidence),
        )

    for text, candidate in semantic_pool:
        similarity = cosine_similarity(query_vector, provider.embed(text))
        vector_score = max(0.0, similarity)
        if vector_score < 0.18:
            # Below the similarity floor: leave any existing text hit as is.
            continue

        text_match = merged.get(self._search_result_key(candidate))
        if text_match is None:
            text_score = 0.0
        else:
            text_score = text_match.text_score
        blended = (
            0.55 * text_score
            + 0.35 * vector_score
            + 0.10 * candidate.confidence
        )

        # Prefer the text hit as the base record so its fields survive; the
        # scores only ever move upwards relative to that base.
        base = text_match or candidate
        ranked = replace(
            base,
            vector_score=max(vector_score, base.vector_score),
            text_score=text_score,
            hybrid_score=max(blended, base.hybrid_score),
            matched_field=base.matched_field or "semantic",
        )
        merged[self._search_result_key(ranked)] = ranked

    def ranking_key(entry: SearchResult) -> tuple[object, ...]:
        return (
            -entry.hybrid_score,
            -entry.vector_score,
            -entry.confidence,
            entry.repository_name.lower(),
            entry.match_type,
            entry.match_name.lower(),
        )

    return sorted(merged.values(), key=ranking_key)
def _content_chunk_candidates(
    self,
    repository: Repository,
    ability_map: RepositoryAbilityMap,
) -> list[tuple[str, SearchResult]]:
    """Build one semantic candidate per stored content chunk of a repository.

    Returns an empty list when ``ability_map`` has no abilities, so such
    repositories contribute no chunk candidates. Otherwise each chunk from
    ``self.store.list_content_chunks`` becomes a ``(text, SearchResult)`` pair
    with a fixed confidence of 0.5 ("medium") and the first 240 characters of
    the chunk as its description.
    """
    if not ability_map.abilities:
        return []
    return [
        (
            chunk.text,
            SearchResult(
                repository_id=repository.id,
                repository_name=repository.name,
                match_type="content_chunk",
                match_name=f"{chunk.path}:{chunk.start_line}-{chunk.end_line}",
                confidence=0.5,
                confidence_label="medium",
                match_description=chunk.text[:240],
                matched_field="semantic",
                source_reference=f"{chunk.path}:{chunk.start_line}",
            ),
        )
        for chunk in self.store.list_content_chunks(repository.id)
    ]
+ ) + + def _ability_map_matches_filter( + self, + ability_map: RepositoryAbilityMap, + *, + ability: str | None, + capability: str | None, + ) -> bool: + if not ability and not capability: + return True + ability_lower = ability.lower() if ability else None + capability_lower = capability.lower() if capability else None + for approved_ability in ability_map.abilities: + ability_matches = ( + ability_lower is None + or ability_lower in approved_ability.name.lower() + or ability_lower in approved_ability.description.lower() + ) + if not ability_matches: + continue + if capability_lower is None: + return True + for approved_capability in approved_ability.capabilities: + if ( + capability_lower in approved_capability.name.lower() + or capability_lower in approved_capability.description.lower() + ): + return True + return False + + def _search_result_key(self, result: SearchResult) -> tuple[object, ...]: + return ( + result.repository_id, + result.match_type, + result.ability_id, + result.capability_id, + result.match_name, + result.source_reference, + ) diff --git a/src/repo_registry/semantic/__init__.py b/src/repo_registry/semantic/__init__.py new file mode 100644 index 0000000..9622ce4 --- /dev/null +++ b/src/repo_registry/semantic/__init__.py @@ -0,0 +1,11 @@ +from repo_registry.semantic.embeddings import ( + EmbeddingProvider, + HashingEmbeddingProvider, + cosine_similarity, +) + +__all__ = [ + "EmbeddingProvider", + "HashingEmbeddingProvider", + "cosine_similarity", +] diff --git a/src/repo_registry/semantic/embeddings.py b/src/repo_registry/semantic/embeddings.py new file mode 100644 index 0000000..8a4214d --- /dev/null +++ b/src/repo_registry/semantic/embeddings.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +import hashlib +import math +import re +from typing import Protocol + + +class EmbeddingProvider(Protocol): + name: str + + def embed(self, text: str) -> list[float]: + """Return a deterministic vector for the supplied text.""" + + +class 
HashingEmbeddingProvider: + """Offline test provider using hashed token buckets. + + This is intentionally simple: it gives tests and local development a stable + semantic path without depending on an external model service. + """ + + name = "hashing-v1" + + def __init__(self, dimensions: int = 64) -> None: + self.dimensions = dimensions + + def embed(self, text: str) -> list[float]: + vector = [0.0] * self.dimensions + for token in _tokens(text): + digest = hashlib.sha256(token.encode("utf-8")).digest() + index = int.from_bytes(digest[:2], "big") % self.dimensions + sign = 1.0 if digest[2] % 2 == 0 else -1.0 + vector[index] += sign + norm = math.sqrt(sum(value * value for value in vector)) + if norm == 0: + return vector + return [value / norm for value in vector] + + +def cosine_similarity(left: list[float], right: list[float]) -> float: + if not left or not right or len(left) != len(right): + return 0.0 + return sum(a * b for a, b in zip(left, right, strict=True)) + + +def _tokens(text: str) -> list[str]: + tokens = [] + for token in re.findall(r"[A-Za-z0-9]+", text.lower()): + tokens.append(_stem(token)) + return tokens + + +def _stem(token: str) -> str: + for suffix in ("ing", "ed", "es", "s"): + if len(token) > len(suffix) + 3 and token.endswith(suffix): + return token[: -len(suffix)] + return token diff --git a/src/repo_registry/storage/sqlite.py b/src/repo_registry/storage/sqlite.py index 9d0bca9..5fdf6e8 100644 --- a/src/repo_registry/storage/sqlite.py +++ b/src/repo_registry/storage/sqlite.py @@ -1812,6 +1812,8 @@ class RegistryStore: confidence_label=confidence_label(1.0), match_description=row["description"] or "", matched_field=matched_field, + text_score=1.0, + hybrid_score=1.0, ) ) for row in ability_rows: @@ -1830,6 +1832,8 @@ class RegistryStore: matched_field=matched_field, ability_id=row["ability_id"], ability_name=row["ability_name"], + text_score=1.0, + hybrid_score=row["confidence"], ) ) for row in capability_rows: @@ -1852,6 +1856,8 @@ class 
RegistryStore: ability_name=row["ability_name"], capability_id=row["capability_id"], capability_name=row["capability_name"], + text_score=1.0, + hybrid_score=row["confidence"], ) ) for row in feature_rows: @@ -1878,6 +1884,8 @@ class RegistryStore: capability_id=row["capability_id"], capability_name=row["capability_name"], source_reference=row["location"], + text_score=1.0, + hybrid_score=row["confidence"], ) ) for row in evidence_rows: @@ -1907,11 +1915,14 @@ class RegistryStore: capability_name=row["capability_name"], evidence_level=row["strength"], source_reference=row["reference"], + text_score=1.0, + hybrid_score=self._evidence_confidence(row["strength"]), ) ) return sorted( results, key=lambda result: ( + -result.hybrid_score, -result.confidence, result.repository_name.lower(), result.match_type, diff --git a/src/repo_registry/web_api/app.py b/src/repo_registry/web_api/app.py index ecae577..2772552 100644 --- a/src/repo_registry/web_api/app.py +++ b/src/repo_registry/web_api/app.py @@ -11,6 +11,7 @@ from pydantic_settings import BaseSettings, SettingsConfigDict from repo_registry.core.service import RegistryService from repo_registry.llm_extraction import LLMCandidateExtractor, create_llm_connect_adapter from repo_registry.repo_ingestion.git import GitIngestionService +from repo_registry.semantic import HashingEmbeddingProvider from repo_registry.storage.sqlite import NotFoundError, RegistryStore from repo_registry.web_api.schemas import ( AbilityCreate, @@ -60,6 +61,7 @@ class Settings(BaseSettings): checkout_root: str = Field(default="var/checkouts") llm_provider: str | None = Field(default=None) llm_model: str | None = Field(default=None) + embedding_provider: str | None = Field(default=None) def get_settings() -> Settings: @@ -78,10 +80,14 @@ def get_service(settings: Settings = Depends(get_settings)) -> RegistryService: model=settings.llm_model, ) llm_extractor = LLMCandidateExtractor(adapter) + embedding_provider = None + if settings.embedding_provider 
def test_semantic_search_adds_hybrid_matches_without_changing_text_default(tmp_path):
    """Text-only search finds nothing for the query; the hybrid service does."""
    query = "customer queued"

    # Fixture repository on disk with a single README.
    source = tmp_path / "repo"
    source.mkdir()
    readme = source / "README.md"
    readme.write_text(
        "# Queue Worker\n\nHandles postponed customer jobs.\n",
        encoding="utf-8",
    )

    # Register the repository and its ability/capability/feature entries
    # through a service that has no embedding provider.
    text_service = make_service(tmp_path)
    repository = text_service.register_repository(
        name="Queue Worker",
        url=str(source),
        description="Processes deferred customer work.",
    )
    repo_id = repository.id
    ability_id = text_service.add_ability(
        repo_id,
        name="Background Job Processing",
        description="Run deferred work outside request handling.",
        confidence=0.8,
    )
    capability_id = text_service.add_capability(
        repo_id,
        ability_id,
        name="Process Customer Tasks",
        description="Execute queued customer tasks asynchronously.",
        confidence=0.7,
    )
    text_service.add_feature(
        repo_id,
        capability_id,
        name="worker task loop",
        type="background worker",
        location="worker.py",
        confidence=0.6,
    )
    text_service.analyze_repository(repo_id)

    # Default path: no embedding provider, so the text search result is returned as is.
    assert text_service.search(query) == []

    # Same store, now with the hashing provider enabled.
    semantic_service = RegistryService(
        text_service.store,
        ingestion=GitIngestionService(tmp_path / "checkouts"),
        embedding_provider=HashingEmbeddingProvider(),
    )
    results = semantic_service.search(query)

    assert results
    best = results[0]
    assert best.match_type in {"capability", "content_chunk"}
    assert best.matched_field == "semantic"
    assert best.vector_score > 0
    assert best.hybrid_score >= best.vector_score * 0.35
    assert any(entry.match_type == "content_chunk" for entry in results)
service.embedding_provider.name == "hashing-v1" + + def test_settings_can_load_from_environment(monkeypatch): monkeypatch.setenv("REPO_REGISTRY_DATABASE_PATH", "var/env.sqlite3") monkeypatch.setenv("REPO_REGISTRY_CHECKOUT_ROOT", "var/env-checkouts") monkeypatch.setenv("REPO_REGISTRY_LLM_PROVIDER", "mock") monkeypatch.setenv("REPO_REGISTRY_LLM_MODEL", "demo-model") + monkeypatch.setenv("REPO_REGISTRY_EMBEDDING_PROVIDER", "hashing") settings = Settings() @@ -422,6 +436,7 @@ def test_settings_can_load_from_environment(monkeypatch): assert settings.checkout_root == "var/env-checkouts" assert settings.llm_provider == "mock" assert settings.llm_model == "demo-model" + assert settings.embedding_provider == "hashing" def test_api_analysis_run_loop(tmp_path): diff --git a/workplans/RREG-WP-0002-production-hardening.md b/workplans/RREG-WP-0002-production-hardening.md index 99bd953..d034fd2 100644 --- a/workplans/RREG-WP-0002-production-hardening.md +++ b/workplans/RREG-WP-0002-production-hardening.md @@ -42,7 +42,7 @@ Candidate endpoints: ```task id: RREG-WP-0002-T02 -status: todo +status: done priority: medium state_hub_task_id: "0e7cce78-13ab-4aa2-8d25-ae50ff8ccd74" ```