generated from coulomb/repo-seed
Deduplication
This commit is contained in:
267
src/repo_registry/candidate_graph/normalization.py
Normal file
267
src/repo_registry/candidate_graph/normalization.py
Normal file
@@ -0,0 +1,267 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from dataclasses import replace
|
||||
|
||||
from repo_registry.candidate_graph.generator import (
|
||||
CandidateAbilityDraft,
|
||||
CandidateCapabilityDraft,
|
||||
CandidateEvidenceDraft,
|
||||
CandidateFeatureDraft,
|
||||
)
|
||||
from repo_registry.core.models import SourceReference
|
||||
|
||||
|
||||
# Filler words that are dropped when names are tokenized for comparison.
STOP_WORDS = {
    "a", "an", "and",
    "capability", "feature",
    "for",
    "models", "model",
    "of",
    "support", "supports",
    "the", "to",
    "use", "uses", "using",
}
|
||||
|
||||
# Provider names: sharing one of these is treated as a strong signal that two
# names refer to the same thing (see the containment rule in _names_overlap).
DISTINCTIVE_TOKENS = {"anthropic", "claude", "gemini", "openai", "openrouter"}
|
||||
|
||||
|
||||
def normalize_candidate_drafts(
    abilities: list[CandidateAbilityDraft],
) -> list[CandidateAbilityDraft]:
    """Return the ability drafts with overlapping entries merged.

    This is the public entry point of the module; the actual work is done by
    ``_merge_abilities``.
    """
    normalized = _merge_abilities(abilities)
    return normalized
|
||||
|
||||
|
||||
def _merge_abilities(
    abilities: list[CandidateAbilityDraft],
) -> list[CandidateAbilityDraft]:
    """Fold abilities whose names overlap into a single draft each.

    Abilities are processed in input order. The first ability of a group keeps
    its slot in the result (with its own capabilities de-duplicated); each
    later overlapping ability is combined into that slot.
    """
    result: list[CandidateAbilityDraft] = []
    for draft in abilities:
        position = _find_overlap(result, draft.name)
        if position is not None:
            result[position] = _combine_abilities(result[position], draft)
        else:
            # First sighting: still normalize the nested capabilities.
            deduped = _merge_capabilities(draft.capabilities)
            result.append(replace(draft, capabilities=deduped))
    return result
|
||||
|
||||
|
||||
def _combine_abilities(
    left: CandidateAbilityDraft,
    right: CandidateAbilityDraft,
) -> CandidateAbilityDraft:
    """Build one ability draft out of two overlapping ones.

    Name and description are chosen by the ``_preferred_*`` helpers, the
    higher confidence wins, source references are unioned, and the
    concatenated capabilities are merged again.
    """
    name = _preferred_name(left.name, right.name)
    description = _preferred_description(left.description, right.description)
    confidence = max(left.confidence, right.confidence)
    source_refs = _merge_source_refs(left.source_refs, right.source_refs)
    capabilities = _merge_capabilities(left.capabilities + right.capabilities)
    return CandidateAbilityDraft(
        name=name,
        description=description,
        confidence=confidence,
        source_refs=source_refs,
        capabilities=capabilities,
    )
|
||||
|
||||
|
||||
def _merge_capabilities(
    capabilities: list[CandidateCapabilityDraft],
) -> list[CandidateCapabilityDraft]:
    """Fold capabilities whose names overlap into a single draft each.

    The first capability of a group keeps its slot, with its features and
    evidence de-duplicated; later overlapping capabilities are combined into
    that slot.
    """
    result: list[CandidateCapabilityDraft] = []
    for draft in capabilities:
        position = _find_overlap(result, draft.name)
        if position is not None:
            result[position] = _combine_capabilities(result[position], draft)
        else:
            cleaned = replace(
                draft,
                features=_merge_features(draft.features),
                evidence=_merge_evidence(draft.evidence),
            )
            result.append(cleaned)
    return result
|
||||
|
||||
|
||||
def _combine_capabilities(
    left: CandidateCapabilityDraft,
    right: CandidateCapabilityDraft,
) -> CandidateCapabilityDraft:
    """Build one capability draft out of two overlapping ones.

    Inputs/outputs and source references are unioned (left entries first),
    the higher confidence wins, and the concatenated features and evidence
    are merged again.
    """
    name = _preferred_name(left.name, right.name)
    description = _preferred_description(left.description, right.description)
    inputs = _merge_strings(left.inputs, right.inputs)
    outputs = _merge_strings(left.outputs, right.outputs)
    confidence = max(left.confidence, right.confidence)
    source_refs = _merge_source_refs(left.source_refs, right.source_refs)
    features = _merge_features(left.features + right.features)
    evidence = _merge_evidence(left.evidence + right.evidence)
    return CandidateCapabilityDraft(
        name=name,
        description=description,
        inputs=inputs,
        outputs=outputs,
        confidence=confidence,
        source_refs=source_refs,
        features=features,
        evidence=evidence,
    )
|
||||
|
||||
|
||||
def _merge_features(
    features: list[CandidateFeatureDraft],
) -> list[CandidateFeatureDraft]:
    """Fold features whose names overlap into a single draft each.

    A feature seen for the first time is kept as-is. A later overlapping
    feature is merged into the existing slot: preferred name/type/location,
    the higher confidence, and the union of source references.
    """
    result: list[CandidateFeatureDraft] = []
    for incoming in features:
        position = _find_overlap(result, incoming.name)
        if position is None:
            result.append(incoming)
        else:
            current = result[position]
            result[position] = CandidateFeatureDraft(
                name=_preferred_name(current.name, incoming.name),
                type=_preferred_text(current.type, incoming.type),
                location=_preferred_text(current.location, incoming.location),
                confidence=max(current.confidence, incoming.confidence),
                source_refs=_merge_source_refs(
                    current.source_refs, incoming.source_refs
                ),
            )
    return result
|
||||
|
||||
|
||||
def _merge_evidence(
    evidence_items: list[CandidateEvidenceDraft],
) -> list[CandidateEvidenceDraft]:
    """Collapse evidence entries that share a normalized (type, reference) key.

    The first entry for a key keeps its position in the result. Every later
    entry with the same key is merged into it: preferred type/reference text,
    the stronger strength label, and the union of source references.
    """
    merged: list[CandidateEvidenceDraft] = []
    # Maps each key to the index of its entry in ``merged`` so duplicates are
    # located in O(1) instead of rescanning (and re-normalizing) the list.
    # A merged entry keeps the same key because its type and reference are
    # always taken from one of two values that already share that key.
    positions: dict[tuple[str, str], int] = {}
    for evidence in evidence_items:
        key = (_normalize_text(evidence.type), _normalize_path(evidence.reference))
        index = positions.get(key)
        if index is None:
            positions[key] = len(merged)
            merged.append(evidence)
            continue
        existing = merged[index]
        merged[index] = CandidateEvidenceDraft(
            type=_preferred_text(existing.type, evidence.type),
            reference=_preferred_text(existing.reference, evidence.reference),
            strength=_stronger_evidence(existing.strength, evidence.strength),
            source_refs=_merge_source_refs(existing.source_refs, evidence.source_refs),
        )
    return merged
|
||||
|
||||
|
||||
def _find_overlap(items: list, name: str) -> int | None:
    """Return the index of the first item whose ``name`` overlaps ``name``.

    Returns ``None`` when no item in ``items`` overlaps.
    """
    matches = (
        position
        for position, candidate in enumerate(items)
        if _names_overlap(candidate.name, name)
    )
    return next(matches, None)
|
||||
|
||||
|
||||
def _names_overlap(left: str, right: str) -> bool:
    """Decide whether two names are similar enough to be merged.

    Names overlap when any of the following holds:

    * their token sets are equal (this includes two names that both reduce
      to no tokens at all);
    * at least 80% of the smaller token set is shared and the shared tokens
      either include a distinctive provider token or number at least two;
    * the Jaccard similarity of the token sets is at least 0.6.

    If exactly one name has no tokens the names never overlap.
    """
    left_tokens = _tokens(left)
    right_tokens = _tokens(right)
    # Equal token sets are the same thing as equal normalized text.
    if left_tokens == right_tokens:
        return True
    if not (left_tokens and right_tokens):
        return False

    shared = left_tokens & right_tokens
    smaller_size = min(len(left_tokens), len(right_tokens))
    containment = len(shared) / smaller_size
    if containment >= 0.8:
        if shared & DISTINCTIVE_TOKENS:
            return True
        if len(shared) >= 2:
            return True

    jaccard = len(shared) / len(left_tokens | right_tokens)
    return jaccard >= 0.6
|
||||
|
||||
|
||||
def _tokens(value: str) -> set[str]:
    """Return the set of stemmed, lower-cased word tokens in ``value``.

    Tokens are runs of ASCII letters and digits. Stop words are removed both
    before and after stemming, so a plural such as "capabilities" or
    "features" is dropped just like its singular stop-word form. Without the
    second check, "Search Capability" and "Search Capabilities" would
    tokenize differently and fail to be recognized as the same name.
    """
    stems: set[str] = set()
    for raw in re.findall(r"[a-z0-9]+", value.lower()):
        if raw in STOP_WORDS:
            continue
        stem = _stem(raw)
        # Catch inflected stop words that only match once stemmed.
        if stem in STOP_WORDS:
            continue
        stems.add(stem)
    return stems
|
||||
|
||||
|
||||
def _stem(token: str) -> str:
|
||||
if token.endswith("ies") and len(token) > 4:
|
||||
return f"{token[:-3]}y"
|
||||
if token.endswith("s") and len(token) > 3:
|
||||
return token[:-1]
|
||||
return token
|
||||
|
||||
|
||||
def _normalize_text(value: str) -> str:
    """Return the tokens of ``value`` sorted and joined by single spaces."""
    ordered = sorted(_tokens(value))
    return " ".join(ordered)
|
||||
|
||||
|
||||
def _normalize_path(value: str) -> str:
|
||||
return value.strip().lower()
|
||||
|
||||
|
||||
def _preferred_name(left: str, right: str) -> str:
    """Pick the name to keep; uses the same rule as ``_preferred_text``."""
    chosen = _preferred_text(left, right)
    return chosen
|
||||
|
||||
|
||||
def _preferred_description(left: str, right: str) -> str:
|
||||
if not left.strip():
|
||||
return right.strip()
|
||||
if not right.strip():
|
||||
return left.strip()
|
||||
if _normalize_sentence(left) == _normalize_sentence(right):
|
||||
return max((left.strip(), right.strip()), key=len)
|
||||
return max((left.strip(), right.strip()), key=len)
|
||||
|
||||
|
||||
def _normalize_sentence(value: str) -> str:
|
||||
return re.sub(r"\s+", " ", value.strip().lower())
|
||||
|
||||
|
||||
def _preferred_text(left: str, right: str) -> str:
    """Return the richer of two strings.

    An empty string always loses to the other argument. Otherwise the string
    with more tokens wins, then the longer string; a full tie keeps ``left``.
    """
    if not left:
        return right
    if not right:
        return left

    def richness(text: str) -> tuple[int, int]:
        return (len(_tokens(text)), len(text))

    # Only a strictly richer right-hand value displaces the left one.
    return right if richness(right) > richness(left) else left
|
||||
|
||||
|
||||
def _merge_strings(left: list[str], right: list[str]) -> list[str]:
    """Concatenate two string lists, dropping values that normalize alike.

    The first occurrence of each normalized value is kept, in its original
    spelling and in first-seen order.
    """
    first_seen: dict[str, str] = {}
    for value in left + right:
        # setdefault keeps the earliest spelling for each normalized key.
        first_seen.setdefault(_normalize_value(value), value)
    return list(first_seen.values())
|
||||
|
||||
|
||||
def _normalize_value(value: str) -> str:
|
||||
return " ".join(re.findall(r"[a-z0-9]+", value.lower()))
|
||||
|
||||
|
||||
def _merge_source_refs(
    left: list[SourceReference],
    right: list[SourceReference],
) -> list[SourceReference]:
    """Concatenate two reference lists, dropping exact duplicates.

    Two references are duplicates when their ``fact_id``, ``path``, ``kind``,
    ``name`` and ``line`` all match. The first occurrence is kept, in
    first-seen order.
    """
    unique: dict[tuple[int | None, str, str, str, int | None], SourceReference] = {}
    for ref in left + right:
        identity = (ref.fact_id, ref.path, ref.kind, ref.name, ref.line)
        unique.setdefault(identity, ref)
    return list(unique.values())
|
||||
|
||||
|
||||
def _stronger_evidence(left: str, right: str) -> str:
|
||||
ranking = {"weak": 0, "medium": 1, "strong": 2}
|
||||
return left if ranking.get(left, 1) >= ranking.get(right, 1) else right
|
||||
@@ -25,6 +25,7 @@ from repo_registry.core.models import (
|
||||
SearchResult,
|
||||
)
|
||||
from repo_registry.candidate_graph.generator import CandidateGraphGenerator
|
||||
from repo_registry.candidate_graph.normalization import normalize_candidate_drafts
|
||||
from repo_registry.content_indexing.extractor import ContentExtractor
|
||||
from repo_registry.core.logging import log_operation
|
||||
from repo_registry.llm_extraction.extractor import LLMCandidateExtractor
|
||||
@@ -211,6 +212,7 @@ class RegistryService:
|
||||
stored_chunks,
|
||||
)
|
||||
candidate_source = "deterministic"
|
||||
candidates = normalize_candidate_drafts(candidates)
|
||||
self.store.replace_candidate_graph(repository_id, completed_run.id, candidates)
|
||||
if candidate_source == "llm":
|
||||
log_operation(
|
||||
|
||||
89
tests/test_candidate_normalization.py
Normal file
89
tests/test_candidate_normalization.py
Normal file
@@ -0,0 +1,89 @@
|
||||
from repo_registry.candidate_graph.generator import (
|
||||
CandidateAbilityDraft,
|
||||
CandidateCapabilityDraft,
|
||||
CandidateFeatureDraft,
|
||||
)
|
||||
from repo_registry.candidate_graph.normalization import normalize_candidate_drafts
|
||||
from repo_registry.core.models import SourceReference
|
||||
|
||||
|
||||
def ref(fact_id, path):
    """Build a documentation-kind SourceReference whose name equals its path."""
    fields = {
        "fact_id": fact_id,
        "path": path,
        "kind": "documentation",
        "name": path,
    }
    return SourceReference(**fields)
|
||||
|
||||
|
||||
def test_normalizer_merges_duplicate_abilities_and_nested_claims():
    """Two near-duplicate abilities merge into one, along with their nested
    capability and feature drafts."""
    first_feature = CandidateFeatureDraft(
        name="OpenRouter Model Support",
        type="integration",
        location="providers.py",
        confidence=0.6,
        source_refs=[ref(2, "providers.py")],
    )
    first_capability = CandidateCapabilityDraft(
        name="Use OpenRouter Models",
        description="Calls OpenRouter.",
        inputs=["prompt"],
        outputs=["response"],
        confidence=0.6,
        source_refs=[ref(2, "providers.py")],
        features=[first_feature],
    )
    first_ability = CandidateAbilityDraft(
        name="LLM Provider Integration",
        description="Connects to model providers.",
        confidence=0.55,
        source_refs=[ref(1, "README.md")],
        capabilities=[first_capability],
    )

    second_feature = CandidateFeatureDraft(
        name="Use OpenRouter Models",
        type="backend integration",
        location="src/providers.py",
        confidence=0.75,
        source_refs=[ref(3, "providers.py")],
    )
    second_capability = CandidateCapabilityDraft(
        name="OpenRouter Model Support",
        description="Supports OpenRouter model calls.",
        inputs=["LLM request"],
        outputs=["model response"],
        confidence=0.75,
        source_refs=[ref(3, "providers.py")],
        features=[second_feature],
    )
    second_ability = CandidateAbilityDraft(
        name="LLM Provider Integrations",
        description="Connects prompts to OpenRouter and Claude providers.",
        confidence=0.7,
        source_refs=[ref(3, "providers.py")],
        capabilities=[second_capability],
    )

    normalized = normalize_candidate_drafts([first_ability, second_ability])

    # Ability level: one survivor carrying the merged fields.
    assert len(normalized) == 1
    ability = normalized[0]
    assert ability.name == "LLM Provider Integrations"
    assert ability.description == (
        "Connects prompts to OpenRouter and Claude providers."
    )
    assert ability.confidence == 0.7
    assert {source_ref.fact_id for source_ref in ability.source_refs} == {1, 3}

    # Capability level: the two differently named capabilities collapse too.
    assert len(ability.capabilities) == 1
    capability = ability.capabilities[0]
    assert capability.confidence == 0.75
    assert capability.inputs == ["prompt", "LLM request"]
    assert capability.outputs == ["response", "model response"]

    # Feature level.
    assert len(capability.features) == 1
    assert capability.features[0].confidence == 0.75
|
||||
@@ -641,6 +641,43 @@ def test_analyze_repository_can_disable_optional_llm_extractor(tmp_path):
|
||||
assert all(decision.action != "llm_extraction_used" for decision in decisions)
|
||||
|
||||
|
||||
def test_analyze_repository_normalizes_duplicate_llm_candidates(tmp_path):
    """Duplicate abilities returned by the LLM extractor are stored as a
    single ability in the candidate graph."""
    # A minimal repository on disk for the service to analyze.
    source = tmp_path / "repo"
    source.mkdir()
    readme = source / "README.md"
    readme.write_text(
        "# LLM Connect\nSupports OpenRouter providers.\n",
        encoding="utf-8",
    )

    store = RegistryStore(tmp_path / "registry.sqlite3")
    store.initialize()

    # The fake extractor yields the same ability twice (singular and plural).
    duplicate_abilities = [
        ExtractedAbility(
            name="LLM Provider Integration",
            description="Connects to model providers.",
            source_paths=["README.md"],
        ),
        ExtractedAbility(
            name="LLM Provider Integrations",
            description="Connects prompts to OpenRouter providers.",
            source_paths=["README.md"],
        ),
    ]
    extractor = FakeLLMExtractor(duplicate_abilities)
    ingestion = GitIngestionService(tmp_path / "checkouts")
    service = RegistryService(
        store,
        ingestion=ingestion,
        llm_extractor=extractor,
    )
    repository = service.register_repository(name="LLM Connect", url=str(source))

    summary = service.analyze_repository(repository.id)
    run_id = summary.analysis_run.id
    graph = service.candidate_graph(repository.id, run_id)

    assert len(graph.abilities) == 1
    assert graph.abilities[0].name == "LLM Provider Integrations"
|
||||
|
||||
|
||||
def test_analyze_repository_falls_back_when_optional_llm_extractor_returns_no_candidates(tmp_path):
|
||||
source = tmp_path / "repo"
|
||||
source.mkdir()
|
||||
|
||||
@@ -196,3 +196,8 @@ Acceptance: the repository has at least one expectation fixture for an LLM
|
||||
infrastructure repo and a test that fails if deterministic analysis stops
|
||||
surfacing expected provider concepts. The workflow remains LLM-optional: LLMs may
|
||||
suggest expectations, but deterministic tests encode the accepted learning.
|
||||
|
||||
Follow-up hardening completed 2026-04-29: candidate graphs are normalized before
|
||||
storage so duplicate or overlapping LLM/deterministic claims merge into one
|
||||
review item while preserving stronger descriptions, confidence, source refs, and
|
||||
nested capabilities/features/evidence.
|
||||
|
||||
Reference in New Issue
Block a user