Deduplication

This commit is contained in:
2026-04-29 01:35:17 +02:00
parent 991c34ce52
commit 8bd22dab1b
5 changed files with 400 additions and 0 deletions

View File

@@ -0,0 +1,267 @@
from __future__ import annotations
import re
from dataclasses import replace
from repo_registry.candidate_graph.generator import (
CandidateAbilityDraft,
CandidateCapabilityDraft,
CandidateEvidenceDraft,
CandidateFeatureDraft,
)
from repo_registry.core.models import SourceReference
# Generic filler words that _tokens() discards before candidate names are
# compared, so they never contribute to the token overlap between two names.
# Membership is checked on the raw lower-cased token, i.e. before stemming.
STOP_WORDS = {
    "a",
    "an",
    "and",
    "capability",
    "feature",
    "for",
    "models",
    "model",
    "of",
    "support",
    "supports",
    "the",
    "to",
    "use",
    "uses",
    "using",
}
# Tokens that _names_overlap() treats as a strong identity signal: when two
# names share at least one of these and containment is >= 0.8, they are
# considered overlapping even if that is the only token they have in common.
DISTINCTIVE_TOKENS = {
    "anthropic",
    "claude",
    "gemini",
    "openai",
    "openrouter",
}
def normalize_candidate_drafts(
    abilities: list[CandidateAbilityDraft],
) -> list[CandidateAbilityDraft]:
    """Return *abilities* with overlapping drafts merged together.

    Thin public wrapper around ``_merge_abilities``, which performs the
    actual de-duplication of abilities and their nested capabilities.
    """
    deduplicated = _merge_abilities(abilities)
    return deduplicated
def _merge_abilities(
    abilities: list[CandidateAbilityDraft],
) -> list[CandidateAbilityDraft]:
    """Collapse abilities whose names overlap, preserving first-seen order.

    Each incoming ability is compared (by name) against the abilities kept so
    far. A match is folded into the existing entry via ``_combine_abilities``;
    otherwise the ability is kept as a copy whose capabilities have been
    de-duplicated on their own.
    """
    result: list[CandidateAbilityDraft] = []
    for candidate in abilities:
        position = _find_overlap(result, candidate.name)
        if position is not None:
            result[position] = _combine_abilities(result[position], candidate)
        else:
            cleaned_capabilities = _merge_capabilities(candidate.capabilities)
            result.append(replace(candidate, capabilities=cleaned_capabilities))
    return result
def _combine_abilities(
    left: CandidateAbilityDraft,
    right: CandidateAbilityDraft,
) -> CandidateAbilityDraft:
    """Build one ability draft out of two overlapping ones.

    The result takes the preferred name and description of the pair, the
    higher of the two confidences, the de-duplicated union of their source
    refs, and the merged list of both capability lists.
    """
    chosen_name = _preferred_name(left.name, right.name)
    chosen_description = _preferred_description(left.description, right.description)
    best_confidence = max(left.confidence, right.confidence)
    all_refs = _merge_source_refs(left.source_refs, right.source_refs)
    all_capabilities = _merge_capabilities(left.capabilities + right.capabilities)
    return CandidateAbilityDraft(
        name=chosen_name,
        description=chosen_description,
        confidence=best_confidence,
        source_refs=all_refs,
        capabilities=all_capabilities,
    )
def _merge_capabilities(
    capabilities: list[CandidateCapabilityDraft],
) -> list[CandidateCapabilityDraft]:
    """Collapse capabilities whose names overlap, preserving first-seen order.

    A capability that matches an already-kept one is folded into it with
    ``_combine_capabilities``. A new capability is stored as a copy whose
    features and evidence have each been de-duplicated.
    """
    unique: list[CandidateCapabilityDraft] = []
    for draft in capabilities:
        slot = _find_overlap(unique, draft.name)
        if slot is not None:
            unique[slot] = _combine_capabilities(unique[slot], draft)
            continue
        cleaned = replace(
            draft,
            features=_merge_features(draft.features),
            evidence=_merge_evidence(draft.evidence),
        )
        unique.append(cleaned)
    return unique
def _combine_capabilities(
    left: CandidateCapabilityDraft,
    right: CandidateCapabilityDraft,
) -> CandidateCapabilityDraft:
    """Build one capability draft out of two overlapping ones.

    Name and description are the preferred ones of the pair; inputs, outputs
    and source refs are de-duplicated unions (left entries first); confidence
    is the higher value; features and evidence are merged across both drafts.
    """
    chosen_name = _preferred_name(left.name, right.name)
    chosen_description = _preferred_description(left.description, right.description)
    all_inputs = _merge_strings(left.inputs, right.inputs)
    all_outputs = _merge_strings(left.outputs, right.outputs)
    best_confidence = max(left.confidence, right.confidence)
    all_refs = _merge_source_refs(left.source_refs, right.source_refs)
    all_features = _merge_features(left.features + right.features)
    all_evidence = _merge_evidence(left.evidence + right.evidence)
    return CandidateCapabilityDraft(
        name=chosen_name,
        description=chosen_description,
        inputs=all_inputs,
        outputs=all_outputs,
        confidence=best_confidence,
        source_refs=all_refs,
        features=all_features,
        evidence=all_evidence,
    )
def _merge_features(
    features: list[CandidateFeatureDraft],
) -> list[CandidateFeatureDraft]:
    """Collapse features whose names overlap, preserving first-seen order.

    When a feature matches one already kept, the stored entry is replaced by
    a new draft carrying the preferred name, type and location, the higher
    confidence, and the de-duplicated union of both source-ref lists.
    """
    unique: list[CandidateFeatureDraft] = []
    for incoming in features:
        slot = _find_overlap(unique, incoming.name)
        if slot is None:
            unique.append(incoming)
        else:
            current = unique[slot]
            unique[slot] = CandidateFeatureDraft(
                name=_preferred_name(current.name, incoming.name),
                type=_preferred_text(current.type, incoming.type),
                location=_preferred_text(current.location, incoming.location),
                confidence=max(current.confidence, incoming.confidence),
                source_refs=_merge_source_refs(current.source_refs, incoming.source_refs),
            )
    return unique
def _merge_evidence(
    evidence_items: list[CandidateEvidenceDraft],
) -> list[CandidateEvidenceDraft]:
    """Collapse evidence items that share a normalized (type, reference) key.

    Two items are duplicates when their types are equal after
    ``_normalize_text`` and their references are equal after
    ``_normalize_path``. Duplicates are folded into the first occurrence,
    keeping the preferred type/reference text, the stronger strength and the
    de-duplicated union of source refs. First-seen order is preserved.
    """
    merged: list[CandidateEvidenceDraft] = []
    # Maps each key to the position of its entry in ``merged`` so a duplicate
    # is located with one dict lookup instead of re-normalizing and rescanning
    # every stored item. A merged entry always keeps the same key because its
    # type/reference are taken from one of the two equal-keyed inputs.
    slot_by_key: dict[tuple[str, str], int] = {}
    for evidence in evidence_items:
        key = (_normalize_text(evidence.type), _normalize_path(evidence.reference))
        slot = slot_by_key.get(key)
        if slot is None:
            slot_by_key[key] = len(merged)
            merged.append(evidence)
            continue
        existing = merged[slot]
        merged[slot] = CandidateEvidenceDraft(
            type=_preferred_text(existing.type, evidence.type),
            reference=_preferred_text(existing.reference, evidence.reference),
            strength=_stronger_evidence(existing.strength, evidence.strength),
            source_refs=_merge_source_refs(existing.source_refs, evidence.source_refs),
        )
    return merged
def _find_overlap(items: list, name: str) -> int | None:
    """Return the index of the first item whose ``name`` overlaps *name*.

    Overlap is decided by ``_names_overlap``. Returns ``None`` when no item
    matches (including when *items* is empty).
    """
    return next(
        (
            position
            for position, entry in enumerate(items)
            if _names_overlap(entry.name, name)
        ),
        None,
    )
def _names_overlap(left: str, right: str) -> bool:
    """Decide whether two candidate names should be treated as duplicates.

    Names are compared on their stop-word-filtered, stemmed token sets:

    * identical token sets always overlap;
    * if exactly one name has no tokens left after filtering, they do not;
    * if both names have no tokens left (they consist only of stop words or
      are blank), they overlap only when their unfiltered stems are equal,
      so unrelated stop-word-only names such as "Feature Use" and
      "Model Support" are no longer merged just because both filter down to
      nothing;
    * otherwise they overlap when they share a distinctive token and
      containment is >= 0.8, or when the Jaccard overlap is >= 0.6, or when
      containment is >= 0.8 with at least two shared tokens.

    "Containment" is the shared-token count divided by the size of the
    smaller token set.
    """
    left_tokens = _tokens(left)
    right_tokens = _tokens(right)
    if not left_tokens and not right_tokens:
        # Nothing survived stop-word filtering on either side; fall back to
        # the raw words so only genuinely equivalent names are merged.
        return _unfiltered_stems(left) == _unfiltered_stems(right)
    if not left_tokens or not right_tokens:
        return False
    if left_tokens == right_tokens:
        return True
    intersection = left_tokens & right_tokens
    union = left_tokens | right_tokens
    overlap = len(intersection) / len(union)
    containment = len(intersection) / min(len(left_tokens), len(right_tokens))
    if intersection & DISTINCTIVE_TOKENS and containment >= 0.8:
        return True
    return overlap >= 0.6 or (containment >= 0.8 and len(intersection) >= 2)


def _unfiltered_stems(value: str) -> set[str]:
    """Return the stemmed alphanumeric words of *value*, stop words included."""
    return {_stem(token) for token in re.findall(r"[a-z0-9]+", value.lower())}
def _tokens(value: str) -> set[str]:
    """Return the set of stemmed, non-stop-word tokens found in *value*.

    The text is lower-cased and split into runs of ``[a-z0-9]``; words listed
    in ``STOP_WORDS`` are dropped and the remaining ones are passed through
    ``_stem``.
    """
    stems: set[str] = set()
    for word in re.findall(r"[a-z0-9]+", value.lower()):
        if word in STOP_WORDS:
            continue
        stems.add(_stem(word))
    return stems
def _stem(token: str) -> str:
if token.endswith("ies") and len(token) > 4:
return f"{token[:-3]}y"
if token.endswith("s") and len(token) > 3:
return token[:-1]
return token
def _normalize_text(value: str) -> str:
    """Return the tokens of *value* sorted and joined by single spaces.

    Because the tokens come from ``_tokens``, the result ignores case, word
    order, stop words and simple plural endings.
    """
    ordered = sorted(_tokens(value))
    return " ".join(ordered)
def _normalize_path(value: str) -> str:
return value.strip().lower()
def _preferred_name(left: str, right: str) -> str:
    """Pick which of two names to keep; delegates to ``_preferred_text``."""
    chosen = _preferred_text(left, right)
    return chosen
def _preferred_description(left: str, right: str) -> str:
if not left.strip():
return right.strip()
if not right.strip():
return left.strip()
if _normalize_sentence(left) == _normalize_sentence(right):
return max((left.strip(), right.strip()), key=len)
return max((left.strip(), right.strip()), key=len)
def _normalize_sentence(value: str) -> str:
return re.sub(r"\s+", " ", value.strip().lower())
def _preferred_text(left: str, right: str) -> str:
    """Pick the more informative of two strings.

    If either string is empty the other one is returned. Otherwise the string
    with more tokens (per ``_tokens``) wins, then the longer string; on a
    full tie *left* is kept.
    """
    if left and right:
        return max(
            (left, right),
            key=lambda text: (len(_tokens(text)), len(text)),
        )
    return left or right
def _merge_strings(left: list[str], right: list[str]) -> list[str]:
    """Concatenate two string lists, dropping later duplicates.

    Values are considered duplicates when ``_normalize_value`` maps them to
    the same key; the first spelling encountered is the one kept, and the
    original ordering (left entries, then right entries) is preserved.
    """
    first_by_key: dict[str, str] = {}
    for text in left + right:
        # setdefault only stores the value the first time a key is seen.
        first_by_key.setdefault(_normalize_value(text), text)
    return list(first_by_key.values())
def _normalize_value(value: str) -> str:
return " ".join(re.findall(r"[a-z0-9]+", value.lower()))
def _merge_source_refs(
left: list[SourceReference],
right: list[SourceReference],
) -> list[SourceReference]:
merged: list[SourceReference] = []
seen: set[tuple[int | None, str, str, str, int | None]] = set()
for ref in left + right:
key = (ref.fact_id, ref.path, ref.kind, ref.name, ref.line)
if key in seen:
continue
seen.add(key)
merged.append(ref)
return merged
def _stronger_evidence(left: str, right: str) -> str:
ranking = {"weak": 0, "medium": 1, "strong": 2}
return left if ranking.get(left, 1) >= ranking.get(right, 1) else right

View File

@@ -25,6 +25,7 @@ from repo_registry.core.models import (
SearchResult,
)
from repo_registry.candidate_graph.generator import CandidateGraphGenerator
from repo_registry.candidate_graph.normalization import normalize_candidate_drafts
from repo_registry.content_indexing.extractor import ContentExtractor
from repo_registry.core.logging import log_operation
from repo_registry.llm_extraction.extractor import LLMCandidateExtractor
@@ -211,6 +212,7 @@ class RegistryService:
stored_chunks,
)
candidate_source = "deterministic"
candidates = normalize_candidate_drafts(candidates)
self.store.replace_candidate_graph(repository_id, completed_run.id, candidates)
if candidate_source == "llm":
log_operation(

View File

@@ -0,0 +1,89 @@
from repo_registry.candidate_graph.generator import (
CandidateAbilityDraft,
CandidateCapabilityDraft,
CandidateFeatureDraft,
)
from repo_registry.candidate_graph.normalization import normalize_candidate_drafts
from repo_registry.core.models import SourceReference
def ref(fact_id, path):
    """Build a ``SourceReference`` of kind "documentation" named after *path*."""
    fields = {
        "fact_id": fact_id,
        "path": path,
        "kind": "documentation",
        "name": path,
    }
    return SourceReference(**fields)
def test_normalizer_merges_duplicate_abilities_and_nested_claims():
    """Two near-identical abilities collapse into one, including nested items.

    The second ability differs from the first only by a plural name, and its
    capability/feature names are the first one's swapped, so every level is
    expected to merge down to a single entry.
    """
    # First ability: lower confidences, shorter description.
    first_feature = CandidateFeatureDraft(
        name="OpenRouter Model Support",
        type="integration",
        location="providers.py",
        confidence=0.6,
        source_refs=[ref(2, "providers.py")],
    )
    first_capability = CandidateCapabilityDraft(
        name="Use OpenRouter Models",
        description="Calls OpenRouter.",
        inputs=["prompt"],
        outputs=["response"],
        confidence=0.6,
        source_refs=[ref(2, "providers.py")],
        features=[first_feature],
    )
    first_ability = CandidateAbilityDraft(
        name="LLM Provider Integration",
        description="Connects to model providers.",
        confidence=0.55,
        source_refs=[ref(1, "README.md")],
        capabilities=[first_capability],
    )

    # Second ability: higher confidences, longer description.
    second_feature = CandidateFeatureDraft(
        name="Use OpenRouter Models",
        type="backend integration",
        location="src/providers.py",
        confidence=0.75,
        source_refs=[ref(3, "providers.py")],
    )
    second_capability = CandidateCapabilityDraft(
        name="OpenRouter Model Support",
        description="Supports OpenRouter model calls.",
        inputs=["LLM request"],
        outputs=["model response"],
        confidence=0.75,
        source_refs=[ref(3, "providers.py")],
        features=[second_feature],
    )
    second_ability = CandidateAbilityDraft(
        name="LLM Provider Integrations",
        description="Connects prompts to OpenRouter and Claude providers.",
        confidence=0.7,
        source_refs=[ref(3, "providers.py")],
        capabilities=[second_capability],
    )

    normalized = normalize_candidate_drafts([first_ability, second_ability])

    assert len(normalized) == 1
    ability = normalized[0]
    assert ability.name == "LLM Provider Integrations"
    assert ability.description == (
        "Connects prompts to OpenRouter and Claude providers."
    )
    assert ability.confidence == 0.7
    assert {source.fact_id for source in ability.source_refs} == {1, 3}

    assert len(ability.capabilities) == 1
    capability = ability.capabilities[0]
    assert capability.confidence == 0.75
    assert capability.inputs == ["prompt", "LLM request"]
    assert capability.outputs == ["response", "model response"]

    assert len(capability.features) == 1
    assert capability.features[0].confidence == 0.75

View File

@@ -641,6 +641,43 @@ def test_analyze_repository_can_disable_optional_llm_extractor(tmp_path):
assert all(decision.action != "llm_extraction_used" for decision in decisions)
def test_analyze_repository_normalizes_duplicate_llm_candidates(tmp_path):
    """Near-duplicate abilities from the LLM extractor are stored as one."""
    # Minimal on-disk repository with a single README.
    repo_dir = tmp_path / "repo"
    repo_dir.mkdir()
    readme = repo_dir / "README.md"
    readme.write_text(
        "# LLM Connect\nSupports OpenRouter providers.\n",
        encoding="utf-8",
    )

    registry_store = RegistryStore(tmp_path / "registry.sqlite3")
    registry_store.initialize()

    # Two abilities whose names differ only by a plural "s".
    duplicate_abilities = [
        ExtractedAbility(
            name="LLM Provider Integration",
            description="Connects to model providers.",
            source_paths=["README.md"],
        ),
        ExtractedAbility(
            name="LLM Provider Integrations",
            description="Connects prompts to OpenRouter providers.",
            source_paths=["README.md"],
        ),
    ]
    fake_extractor = FakeLLMExtractor(duplicate_abilities)
    service = RegistryService(
        registry_store,
        ingestion=GitIngestionService(tmp_path / "checkouts"),
        llm_extractor=fake_extractor,
    )

    repository = service.register_repository(name="LLM Connect", url=str(repo_dir))
    summary = service.analyze_repository(repository.id)
    graph = service.candidate_graph(repository.id, summary.analysis_run.id)

    stored_abilities = graph.abilities
    assert len(stored_abilities) == 1
    assert stored_abilities[0].name == "LLM Provider Integrations"
def test_analyze_repository_falls_back_when_optional_llm_extractor_returns_no_candidates(tmp_path):
source = tmp_path / "repo"
source.mkdir()

View File

@@ -196,3 +196,8 @@ Acceptance: the repository has at least one expectation fixture for an LLM
infrastructure repo and a test that fails if deterministic analysis stops
surfacing expected provider concepts. The workflow remains LLM-optional: LLMs may
suggest expectations, but deterministic tests encode the accepted learning.
Follow-up hardening completed 2026-04-29: candidate graphs are normalized before
storage so that duplicate or overlapping LLM/deterministic claims merge into one
review item. Each merged item keeps the longer description, the highest
confidence, the union of source refs, and the merged nested
capabilities/features/evidence.