from __future__ import annotations

import re
from dataclasses import replace
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Every draft is rebuilt with ``dataclasses.replace``, so these names are
    # only referenced from (lazy) annotations and need no runtime import.
    from repo_registry.candidate_graph.generator import (
        CandidateAbilityDraft,
        CandidateCapabilityDraft,
        CandidateEvidenceDraft,
        CandidateFeatureDraft,
    )
    from repo_registry.core.models import SourceReference


# Words that carry no distinguishing meaning when comparing claim names.
STOP_WORDS = {
    "a",
    "an",
    "and",
    "capability",
    "feature",
    "for",
    "models",
    "model",
    "of",
    "support",
    "supports",
    "the",
    "to",
    "use",
    "uses",
    "using",
}

# Tokens that, when shared, let two names match on containment alone.
DISTINCTIVE_TOKENS = {
    "anthropic",
    "claude",
    "gemini",
    "openai",
    "openrouter",
}

_WORD_RE = re.compile(r"[a-z0-9]+")
_EVIDENCE_RANK = {"weak": 0, "medium": 1, "strong": 2}


def normalize_candidate_drafts(
    abilities: list[CandidateAbilityDraft],
) -> list[CandidateAbilityDraft]:
    """Merge abilities with overlapping names, including their nested claims.

    Args:
        abilities: Ability drafts in the order they were produced.

    Returns:
        A new list in which overlapping abilities, capabilities, features and
        evidence items have each been collapsed into a single entry. The
        position of a merged entry is that of its first occurrence.
    """
    return _merge_abilities(abilities)


def _merge_abilities(
    abilities: list[CandidateAbilityDraft],
) -> list[CandidateAbilityDraft]:
    """Fold each ability into the first already-kept ability it overlaps."""
    merged: list[CandidateAbilityDraft] = []
    for ability in abilities:
        index = _find_overlap(merged, ability.name)
        if index is None:
            merged.append(
                replace(
                    ability,
                    capabilities=_merge_capabilities(ability.capabilities),
                )
            )
        else:
            merged[index] = _combine_abilities(merged[index], ability)
    return merged


def _combine_abilities(
    left: CandidateAbilityDraft,
    right: CandidateAbilityDraft,
) -> CandidateAbilityDraft:
    """Return ``left`` updated with the merged content of both abilities."""
    # ``replace`` (rather than listing every constructor field) keeps any
    # field that is not explicitly merged here, matching the non-merge path.
    return replace(
        left,
        name=_preferred_name(left.name, right.name),
        description=_preferred_description(left.description, right.description),
        confidence=max(left.confidence, right.confidence),
        source_refs=_merge_source_refs(left.source_refs, right.source_refs),
        capabilities=_merge_capabilities(left.capabilities + right.capabilities),
    )


def _merge_capabilities(
    capabilities: list[CandidateCapabilityDraft],
) -> list[CandidateCapabilityDraft]:
    """Fold each capability into the first kept capability it overlaps."""
    merged: list[CandidateCapabilityDraft] = []
    for capability in capabilities:
        index = _find_overlap(merged, capability.name)
        if index is None:
            merged.append(
                replace(
                    capability,
                    features=_merge_features(capability.features),
                    evidence=_merge_evidence(capability.evidence),
                )
            )
        else:
            merged[index] = _combine_capabilities(merged[index], capability)
    return merged


def _combine_capabilities(
    left: CandidateCapabilityDraft,
    right: CandidateCapabilityDraft,
) -> CandidateCapabilityDraft:
    """Return ``left`` updated with the merged content of both capabilities."""
    return replace(
        left,
        name=_preferred_name(left.name, right.name),
        description=_preferred_description(left.description, right.description),
        inputs=_merge_strings(left.inputs, right.inputs),
        outputs=_merge_strings(left.outputs, right.outputs),
        confidence=max(left.confidence, right.confidence),
        source_refs=_merge_source_refs(left.source_refs, right.source_refs),
        features=_merge_features(left.features + right.features),
        evidence=_merge_evidence(left.evidence + right.evidence),
    )


def _merge_features(
    features: list[CandidateFeatureDraft],
) -> list[CandidateFeatureDraft]:
    """Collapse features whose names overlap, keeping first-seen order."""
    merged: list[CandidateFeatureDraft] = []
    for feature in features:
        index = _find_overlap(merged, feature.name)
        if index is None:
            merged.append(feature)
            continue
        existing = merged[index]
        # NOTE(review): assumes CandidateFeatureDraft is a dataclass like the
        # ability/capability drafts already passed to ``replace`` - confirm.
        merged[index] = replace(
            existing,
            name=_preferred_name(existing.name, feature.name),
            type=_preferred_text(existing.type, feature.type),
            location=_preferred_text(existing.location, feature.location),
            confidence=max(existing.confidence, feature.confidence),
            source_refs=_merge_source_refs(existing.source_refs, feature.source_refs),
        )
    return merged


def _merge_evidence(
    evidence_items: list[CandidateEvidenceDraft],
) -> list[CandidateEvidenceDraft]:
    """Collapse evidence sharing the same normalized (type, reference) key."""
    merged: list[CandidateEvidenceDraft] = []
    # Maps each key to its position in ``merged`` so a repeat is updated in
    # place without rescanning the list.
    index_by_key: dict[tuple[str, str], int] = {}
    for evidence in evidence_items:
        key = (_normalize_text(evidence.type), _normalize_path(evidence.reference))
        index = index_by_key.get(key)
        if index is None:
            index_by_key[key] = len(merged)
            merged.append(evidence)
            continue
        existing = merged[index]
        # NOTE(review): assumes CandidateEvidenceDraft is a dataclass - confirm.
        merged[index] = replace(
            existing,
            type=_preferred_text(existing.type, evidence.type),
            reference=_preferred_text(existing.reference, evidence.reference),
            strength=_stronger_evidence(existing.strength, evidence.strength),
            source_refs=_merge_source_refs(existing.source_refs, evidence.source_refs),
        )
    return merged


def _find_overlap(items: list, name: str) -> int | None:
    """Return the index of the first item whose ``name`` overlaps ``name``."""
    for index, item in enumerate(items):
        if _names_overlap(item.name, name):
            return index
    return None


def _names_overlap(left: str, right: str) -> bool:
    """Decide whether two claim names describe the same thing.

    Names match when their significant tokens are equal, when their Jaccard
    overlap is at least 0.6, or when at least 80% of the smaller token set is
    contained in the other and the shared tokens either number two or more or
    include a distinctive token.
    """
    left_tokens = _tokens(left)
    right_tokens = _tokens(right)
    if not left_tokens or not right_tokens:
        # A name made only of stop words has no significant tokens. Comparing
        # the empty token sets would make every such name "equal", so compare
        # the literal stemmed words instead.
        return _stemmed_words(left) == _stemmed_words(right)
    if left_tokens == right_tokens:
        return True
    intersection = left_tokens & right_tokens
    union = left_tokens | right_tokens
    overlap = len(intersection) / len(union)
    containment = len(intersection) / min(len(left_tokens), len(right_tokens))
    if intersection & DISTINCTIVE_TOKENS and containment >= 0.8:
        return True
    return overlap >= 0.6 or (containment >= 0.8 and len(intersection) >= 2)


def _stemmed_words(value: str) -> set[str]:
    """Return every stemmed lowercase word of ``value``, stop words included."""
    return {_stem(word) for word in _WORD_RE.findall(value.lower())}


def _tokens(value: str) -> set[str]:
    """Return the stemmed, significant words of ``value``."""
    tokens: set[str] = set()
    for word in _WORD_RE.findall(value.lower()):
        stem = _stem(word)
        # Check the stem as well, so plurals of stop words ("features",
        # "capabilities") are dropped just like their singular forms.
        if word in STOP_WORDS or stem in STOP_WORDS:
            continue
        tokens.add(stem)
    return tokens


def _stem(token: str) -> str:
    """Strip a simple plural suffix: ``-ies`` becomes ``-y``, ``-s`` is dropped."""
    if token.endswith("ies") and len(token) > 4:
        return f"{token[:-3]}y"
    if token.endswith("s") and len(token) > 3:
        return token[:-1]
    return token


def _normalize_text(value: str) -> str:
    """Return the significant tokens of ``value`` sorted and space-joined."""
    return " ".join(sorted(_tokens(value)))


def _normalize_path(value: str) -> str:
    """Return ``value`` stripped of surrounding whitespace and lowercased."""
    return value.strip().lower()


def _preferred_name(left: str, right: str) -> str:
    """Pick the more informative of two names."""
    return _preferred_text(left, right)


def _preferred_description(left: str, right: str) -> str:
    """Return the longer stripped description; ``left`` wins a length tie."""
    left_text = left.strip()
    right_text = right.strip()
    if not left_text:
        return right_text
    if not right_text:
        return left_text
    return max((left_text, right_text), key=len)


def _normalize_sentence(value: str) -> str:
    """Lowercase ``value`` and collapse whitespace runs (currently unused)."""
    return re.sub(r"\s+", " ", value.strip().lower())


def _preferred_text(left: str, right: str) -> str:
    """Prefer the text with more significant tokens, then the longer one."""
    if not left:
        return right
    if not right:
        return left
    return max((left, right), key=lambda item: (len(_tokens(item)), len(item)))


def _merge_strings(left: list[str], right: list[str]) -> list[str]:
    """Concatenate two string lists, dropping case/punctuation duplicates."""
    merged: list[str] = []
    seen: set[str] = set()
    for value in left + right:
        key = _normalize_value(value)
        if key in seen:
            continue
        seen.add(key)
        merged.append(value)
    return merged


def _normalize_value(value: str) -> str:
    """Return the lowercase alphanumeric words of ``value``, space-joined."""
    return " ".join(_WORD_RE.findall(value.lower()))


def _merge_source_refs(
    left: list[SourceReference],
    right: list[SourceReference],
) -> list[SourceReference]:
    """Concatenate source references, dropping exact duplicates."""
    merged: list[SourceReference] = []
    seen: set[tuple[int | None, str, str, str, int | None]] = set()
    for ref in left + right:
        key = (ref.fact_id, ref.path, ref.kind, ref.name, ref.line)
        if key in seen:
            continue
        seen.add(key)
        merged.append(ref)
    return merged


def _stronger_evidence(left: str, right: str) -> str:
    """Return the stronger strength label; unknown labels rank as medium."""
    left_rank = _EVIDENCE_RANK.get(left, 1)
    right_rank = _EVIDENCE_RANK.get(right, 1)
    return left if left_rank >= right_rank else right
normalize_candidate_drafts from repo_registry.content_indexing.extractor import ContentExtractor from repo_registry.core.logging import log_operation from repo_registry.llm_extraction.extractor import LLMCandidateExtractor @@ -211,6 +212,7 @@ class RegistryService: stored_chunks, ) candidate_source = "deterministic" + candidates = normalize_candidate_drafts(candidates) self.store.replace_candidate_graph(repository_id, completed_run.id, candidates) if candidate_source == "llm": log_operation( diff --git a/tests/test_candidate_normalization.py b/tests/test_candidate_normalization.py new file mode 100644 index 0000000..e951f95 --- /dev/null +++ b/tests/test_candidate_normalization.py @@ -0,0 +1,89 @@ +from repo_registry.candidate_graph.generator import ( + CandidateAbilityDraft, + CandidateCapabilityDraft, + CandidateFeatureDraft, +) +from repo_registry.candidate_graph.normalization import normalize_candidate_drafts +from repo_registry.core.models import SourceReference + + +def ref(fact_id, path): + return SourceReference( + fact_id=fact_id, + path=path, + kind="documentation", + name=path, + ) + + +def test_normalizer_merges_duplicate_abilities_and_nested_claims(): + candidates = [ + CandidateAbilityDraft( + name="LLM Provider Integration", + description="Connects to model providers.", + confidence=0.55, + source_refs=[ref(1, "README.md")], + capabilities=[ + CandidateCapabilityDraft( + name="Use OpenRouter Models", + description="Calls OpenRouter.", + inputs=["prompt"], + outputs=["response"], + confidence=0.6, + source_refs=[ref(2, "providers.py")], + features=[ + CandidateFeatureDraft( + name="OpenRouter Model Support", + type="integration", + location="providers.py", + confidence=0.6, + source_refs=[ref(2, "providers.py")], + ) + ], + ) + ], + ), + CandidateAbilityDraft( + name="LLM Provider Integrations", + description="Connects prompts to OpenRouter and Claude providers.", + confidence=0.7, + source_refs=[ref(3, "providers.py")], + capabilities=[ + 
CandidateCapabilityDraft( + name="OpenRouter Model Support", + description="Supports OpenRouter model calls.", + inputs=["LLM request"], + outputs=["model response"], + confidence=0.75, + source_refs=[ref(3, "providers.py")], + features=[ + CandidateFeatureDraft( + name="Use OpenRouter Models", + type="backend integration", + location="src/providers.py", + confidence=0.75, + source_refs=[ref(3, "providers.py")], + ) + ], + ) + ], + ), + ] + + normalized = normalize_candidate_drafts(candidates) + + assert len(normalized) == 1 + ability = normalized[0] + assert ability.name == "LLM Provider Integrations" + assert ability.description == ( + "Connects prompts to OpenRouter and Claude providers." + ) + assert ability.confidence == 0.7 + assert {ref.fact_id for ref in ability.source_refs} == {1, 3} + assert len(ability.capabilities) == 1 + capability = ability.capabilities[0] + assert capability.confidence == 0.75 + assert capability.inputs == ["prompt", "LLM request"] + assert capability.outputs == ["response", "model response"] + assert len(capability.features) == 1 + assert capability.features[0].confidence == 0.75 diff --git a/tests/test_registry_service.py b/tests/test_registry_service.py index 5e8ee26..2ac5cac 100644 --- a/tests/test_registry_service.py +++ b/tests/test_registry_service.py @@ -641,6 +641,43 @@ def test_analyze_repository_can_disable_optional_llm_extractor(tmp_path): assert all(decision.action != "llm_extraction_used" for decision in decisions) +def test_analyze_repository_normalizes_duplicate_llm_candidates(tmp_path): + source = tmp_path / "repo" + source.mkdir() + (source / "README.md").write_text( + "# LLM Connect\nSupports OpenRouter providers.\n", + encoding="utf-8", + ) + store = RegistryStore(tmp_path / "registry.sqlite3") + store.initialize() + extractor = FakeLLMExtractor( + [ + ExtractedAbility( + name="LLM Provider Integration", + description="Connects to model providers.", + source_paths=["README.md"], + ), + ExtractedAbility( + 
name="LLM Provider Integrations", + description="Connects prompts to OpenRouter providers.", + source_paths=["README.md"], + ), + ] + ) + service = RegistryService( + store, + ingestion=GitIngestionService(tmp_path / "checkouts"), + llm_extractor=extractor, + ) + repository = service.register_repository(name="LLM Connect", url=str(source)) + + summary = service.analyze_repository(repository.id) + graph = service.candidate_graph(repository.id, summary.analysis_run.id) + + assert len(graph.abilities) == 1 + assert graph.abilities[0].name == "LLM Provider Integrations" + + def test_analyze_repository_falls_back_when_optional_llm_extractor_returns_no_candidates(tmp_path): source = tmp_path / "repo" source.mkdir() diff --git a/workplans/RREG-WP-0003-automatic-repository-exploration.md b/workplans/RREG-WP-0003-automatic-repository-exploration.md index 3424971..4b1f637 100644 --- a/workplans/RREG-WP-0003-automatic-repository-exploration.md +++ b/workplans/RREG-WP-0003-automatic-repository-exploration.md @@ -196,3 +196,8 @@ Acceptance: the repository has at least one expectation fixture for an LLM infrastructure repo and a test that fails if deterministic analysis stops surfacing expected provider concepts. The workflow remains LLM-optional: LLMs may suggest expectations, but deterministic tests encode the accepted learning. + +Follow-up hardening completed 2026-04-29: candidate graphs are normalized before +storage so duplicate or overlapping LLM/deterministic claims merge into one +review item while preserving stronger descriptions, confidence, source refs, and +nested capabilities/features/evidence.