generated from coulomb/repo-seed
chore(consistency): sync task status from DB [auto]
Updated by fix-consistency on 2026-05-15: - update .custodian-brief.md for repo-scoping
This commit is contained in:
1
src/repo_scoping/candidate_graph/__init__.py
Normal file
1
src/repo_scoping/candidate_graph/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""Candidate ability graph generation."""
|
||||
1411
src/repo_scoping/candidate_graph/generator.py
Normal file
1411
src/repo_scoping/candidate_graph/generator.py
Normal file
File diff suppressed because it is too large
Load Diff
273
src/repo_scoping/candidate_graph/normalization.py
Normal file
273
src/repo_scoping/candidate_graph/normalization.py
Normal file
@@ -0,0 +1,273 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from dataclasses import replace
|
||||
|
||||
from repo_registry.candidate_graph.generator import (
|
||||
CandidateAbilityDraft,
|
||||
CandidateCapabilityDraft,
|
||||
CandidateEvidenceDraft,
|
||||
CandidateFeatureDraft,
|
||||
)
|
||||
from repo_registry.core.models import SourceReference
|
||||
|
||||
|
||||
# Filler words: tokenisation discards any raw token found here before names
# are compared.
STOP_WORDS = {
    "a", "an", "and",
    "capability", "feature", "for",
    "models", "model",
    "of",
    "support", "supports",
    "the", "to",
    "use", "uses", "using",
}
|
||||
|
||||
# Tokens treated as distinctive: when two names share one of these, a high
# containment ratio alone is enough for the names to count as overlapping.
DISTINCTIVE_TOKENS = {"anthropic", "claude", "gemini", "openai", "openrouter"}
|
||||
|
||||
|
||||
def normalize_candidate_drafts(
    abilities: list[CandidateAbilityDraft],
) -> list[CandidateAbilityDraft]:
    """Return *abilities* with overlapping drafts merged together.

    This is the module's entry point; all of the work is delegated to
    ``_merge_abilities``.
    """
    normalized = _merge_abilities(abilities)
    return normalized
|
||||
|
||||
|
||||
def _merge_abilities(
    abilities: list[CandidateAbilityDraft],
) -> list[CandidateAbilityDraft]:
    """Fold abilities whose names overlap into single drafts.

    Abilities are visited in order. One that overlaps an already-kept ability
    is combined into it; otherwise it is kept as a copy whose own capability
    list has been de-duplicated.
    """
    result: list[CandidateAbilityDraft] = []
    for draft in abilities:
        position = _find_overlap(result, draft.name)
        if position is not None:
            result[position] = _combine_abilities(result[position], draft)
        else:
            deduped_capabilities = _merge_capabilities(draft.capabilities)
            result.append(replace(draft, capabilities=deduped_capabilities))
    return result
|
||||
|
||||
|
||||
def _combine_abilities(
    left: CandidateAbilityDraft,
    right: CandidateAbilityDraft,
) -> CandidateAbilityDraft:
    """Build one ability draft out of two overlapping ones.

    Text fields go through the ``_preferred_*`` helpers, confidence is the
    higher of the two, list fields are unioned without duplicates, and the
    concatenated capability lists are merged again.
    """
    name = _preferred_name(left.name, right.name)
    description = _preferred_description(left.description, right.description)
    confidence = max(left.confidence, right.confidence)
    source_refs = _merge_source_refs(left.source_refs, right.source_refs)
    primary_class = _preferred_text(left.primary_class, right.primary_class)
    attributes = _merge_strings(left.attributes, right.attributes)
    capabilities = _merge_capabilities(left.capabilities + right.capabilities)
    return CandidateAbilityDraft(
        name=name,
        description=description,
        confidence=confidence,
        source_refs=source_refs,
        primary_class=primary_class,
        attributes=attributes,
        capabilities=capabilities,
    )
|
||||
|
||||
|
||||
def _merge_capabilities(
    capabilities: list[CandidateCapabilityDraft],
) -> list[CandidateCapabilityDraft]:
    """Fold capabilities whose names overlap into single drafts.

    A capability that overlaps an already-kept one is combined into it.
    Otherwise it is kept as a copy whose feature and evidence lists have been
    de-duplicated.
    """
    result: list[CandidateCapabilityDraft] = []
    for draft in capabilities:
        position = _find_overlap(result, draft.name)
        if position is not None:
            result[position] = _combine_capabilities(result[position], draft)
        else:
            deduped_features = _merge_features(draft.features)
            deduped_evidence = _merge_evidence(draft.evidence)
            result.append(
                replace(draft, features=deduped_features, evidence=deduped_evidence)
            )
    return result
|
||||
|
||||
|
||||
def _combine_capabilities(
    left: CandidateCapabilityDraft,
    right: CandidateCapabilityDraft,
) -> CandidateCapabilityDraft:
    """Build one capability draft out of two overlapping ones.

    Text fields go through the ``_preferred_*`` helpers, confidence is the
    higher of the two, string and source-reference lists are unioned without
    duplicates, and the concatenated feature and evidence lists are merged
    again.
    """
    name = _preferred_name(left.name, right.name)
    description = _preferred_description(left.description, right.description)
    inputs = _merge_strings(left.inputs, right.inputs)
    outputs = _merge_strings(left.outputs, right.outputs)
    confidence = max(left.confidence, right.confidence)
    source_refs = _merge_source_refs(left.source_refs, right.source_refs)
    primary_class = _preferred_text(left.primary_class, right.primary_class)
    attributes = _merge_strings(left.attributes, right.attributes)
    features = _merge_features(left.features + right.features)
    evidence = _merge_evidence(left.evidence + right.evidence)
    return CandidateCapabilityDraft(
        name=name,
        description=description,
        inputs=inputs,
        outputs=outputs,
        confidence=confidence,
        source_refs=source_refs,
        primary_class=primary_class,
        attributes=attributes,
        features=features,
        evidence=evidence,
    )
|
||||
|
||||
|
||||
def _merge_features(
    features: list[CandidateFeatureDraft],
) -> list[CandidateFeatureDraft]:
    """Fold features whose names overlap into single drafts.

    The first feature seen under a given name is kept unchanged. Each later
    overlapping feature is folded into it field by field: preferred text for
    name/type/location/primary_class, the higher confidence, and
    duplicate-free unions of source references and attributes.
    """
    result: list[CandidateFeatureDraft] = []
    for candidate in features:
        position = _find_overlap(result, candidate.name)
        if position is None:
            result.append(candidate)
            continue
        kept = result[position]
        combined = CandidateFeatureDraft(
            name=_preferred_name(kept.name, candidate.name),
            type=_preferred_text(kept.type, candidate.type),
            location=_preferred_text(kept.location, candidate.location),
            confidence=max(kept.confidence, candidate.confidence),
            source_refs=_merge_source_refs(kept.source_refs, candidate.source_refs),
            primary_class=_preferred_text(kept.primary_class, candidate.primary_class),
            attributes=_merge_strings(kept.attributes, candidate.attributes),
        )
        result[position] = combined
    return result
|
||||
|
||||
|
||||
def _merge_evidence(
    evidence_items: list[CandidateEvidenceDraft],
) -> list[CandidateEvidenceDraft]:
    """De-duplicate evidence on (normalised type, normalised reference).

    The first item seen under a key fixes its position in the result. Every
    later item with the same key is folded into that slot: preferred text for
    ``type`` and ``reference``, the stronger ``strength``, and a
    duplicate-free union of the source references.

    Args:
        evidence_items: Evidence drafts, possibly containing duplicates.

    Returns:
        A new list with one draft per distinct key, in first-seen order.
    """
    merged: list[CandidateEvidenceDraft] = []
    # Maps each key to its slot in ``merged`` so a repeated key is a direct
    # lookup rather than a rescan of everything merged so far.
    slot_by_key: dict[tuple[str, str], int] = {}
    for evidence in evidence_items:
        key = (_normalize_text(evidence.type), _normalize_path(evidence.reference))
        slot = slot_by_key.get(key)
        if slot is None:
            slot_by_key[key] = len(merged)
            merged.append(evidence)
            continue
        existing = merged[slot]
        # The combined draft takes its type/reference from one of two items
        # that share this key, so the key -> slot mapping stays valid.
        merged[slot] = CandidateEvidenceDraft(
            type=_preferred_text(existing.type, evidence.type),
            reference=_preferred_text(existing.reference, evidence.reference),
            strength=_stronger_evidence(existing.strength, evidence.strength),
            source_refs=_merge_source_refs(existing.source_refs, evidence.source_refs),
        )
    return merged
|
||||
|
||||
|
||||
def _find_overlap(items: list, name: str) -> int | None:
    """Return the index of the first item whose ``name`` overlaps *name*.

    Returns ``None`` when no item in *items* overlaps.
    """
    matches = (
        position
        for position, candidate in enumerate(items)
        if _names_overlap(candidate.name, name)
    )
    return next(matches, None)
|
||||
|
||||
|
||||
def _names_overlap(left: str, right: str) -> bool:
    """Decide whether two names are similar enough to be merged.

    Names are compared as sets of stemmed, stop-word-free tokens:

    * identical token sets overlap;
    * if only one name has tokens, they do not overlap;
    * if neither has tokens (both consist solely of stop words or
      punctuation), they overlap only when their stemmed raw words match, so
      that e.g. "Models" and "Features" are not treated as the same name;
    * otherwise they overlap when they share a distinctive token and
      containment is at least 0.8, when the Jaccard overlap is at least 0.6,
      or when containment is at least 0.8 with two or more shared tokens.

    Args:
        left: First name.
        right: Second name.

    Returns:
        ``True`` if the names should be treated as the same item.
    """
    left_tokens = _tokens(left)
    right_tokens = _tokens(right)

    if not left_tokens and not right_tokens:
        # Both names reduced to nothing once stop words were dropped; an empty
        # token set says nothing about similarity, so compare the stemmed raw
        # words (stop words included) instead.
        left_words = {_stem(word) for word in re.findall(r"[a-z0-9]+", left.lower())}
        right_words = {_stem(word) for word in re.findall(r"[a-z0-9]+", right.lower())}
        return left_words == right_words
    if not left_tokens or not right_tokens:
        return False
    if left_tokens == right_tokens:
        return True

    intersection = left_tokens & right_tokens
    union = left_tokens | right_tokens
    overlap = len(intersection) / len(union)
    containment = len(intersection) / min(len(left_tokens), len(right_tokens))

    # A shared distinctive token lets high containment suffice on its own.
    if intersection & DISTINCTIVE_TOKENS and containment >= 0.8:
        return True
    return overlap >= 0.6 or (containment >= 0.8 and len(intersection) >= 2)
|
||||
|
||||
|
||||
def _tokens(value: str) -> set[str]:
    """Return the set of stemmed tokens in *value*.

    The text is lower-cased and split into runs of ``[a-z0-9]``; words listed
    in ``STOP_WORDS`` are dropped before stemming.
    """
    stems: set[str] = set()
    for word in re.findall(r"[a-z0-9]+", value.lower()):
        if word in STOP_WORDS:
            continue
        stems.add(_stem(word))
    return stems
|
||||
|
||||
|
||||
def _stem(token: str) -> str:
|
||||
if token.endswith("ies") and len(token) > 4:
|
||||
return f"{token[:-3]}y"
|
||||
if token.endswith("s") and len(token) > 3:
|
||||
return token[:-1]
|
||||
return token
|
||||
|
||||
|
||||
def _normalize_text(value: str) -> str:
    """Return the tokens of *value*, sorted and joined by single spaces."""
    ordered_tokens = sorted(_tokens(value))
    return " ".join(ordered_tokens)
|
||||
|
||||
|
||||
def _normalize_path(value: str) -> str:
|
||||
return value.strip().lower()
|
||||
|
||||
|
||||
def _preferred_name(left: str, right: str) -> str:
    """Pick the name to keep; uses the same rule as ``_preferred_text``."""
    chosen = _preferred_text(left, right)
    return chosen
|
||||
|
||||
|
||||
def _preferred_description(left: str, right: str) -> str:
|
||||
if not left.strip():
|
||||
return right.strip()
|
||||
if not right.strip():
|
||||
return left.strip()
|
||||
if _normalize_sentence(left) == _normalize_sentence(right):
|
||||
return max((left.strip(), right.strip()), key=len)
|
||||
return max((left.strip(), right.strip()), key=len)
|
||||
|
||||
|
||||
def _normalize_sentence(value: str) -> str:
|
||||
return re.sub(r"\s+", " ", value.strip().lower())
|
||||
|
||||
|
||||
def _preferred_text(left: str, right: str) -> str:
    """Pick the more informative of two strings.

    If either string is empty the other is returned. Otherwise the one with
    more tokens wins, then the longer one; *left* wins a full tie.
    """
    if left and right:

        def richness(text: str) -> tuple[int, int]:
            return (len(_tokens(text)), len(text))

        return max((left, right), key=richness)
    # At least one side is empty: hand back whichever is not (or right if both are).
    return left or right
|
||||
|
||||
|
||||
def _merge_strings(left: list[str], right: list[str]) -> list[str]:
    """Concatenate two string lists, dropping duplicates.

    Strings are compared by their ``_normalize_value`` form; the first
    spelling seen for each form is kept, in first-seen order.
    """
    first_by_key: dict[str, str] = {}
    for text in left + right:
        # setdefault only stores when the key is new, so the first spelling wins.
        first_by_key.setdefault(_normalize_value(text), text)
    return list(first_by_key.values())
|
||||
|
||||
|
||||
def _normalize_value(value: str) -> str:
|
||||
return " ".join(re.findall(r"[a-z0-9]+", value.lower()))
|
||||
|
||||
|
||||
def _merge_source_refs(
    left: list[SourceReference],
    right: list[SourceReference],
) -> list[SourceReference]:
    """Concatenate two source-reference lists, dropping duplicates.

    Two references are duplicates when their ``fact_id``, ``path``, ``kind``,
    ``name`` and ``line`` all match; the first occurrence is kept, in
    first-seen order.
    """
    first_by_identity: dict[
        tuple[int | None, str, str, str, int | None], SourceReference
    ] = {}
    for reference in left + right:
        identity = (
            reference.fact_id,
            reference.path,
            reference.kind,
            reference.name,
            reference.line,
        )
        # setdefault only stores when the identity is new, so the first one wins.
        first_by_identity.setdefault(identity, reference)
    return list(first_by_identity.values())
|
||||
|
||||
|
||||
def _stronger_evidence(left: str, right: str) -> str:
|
||||
ranking = {"weak": 0, "medium": 1, "strong": 2}
|
||||
return left if ranking.get(left, 1) >= ranking.get(right, 1) else right
|
||||
Reference in New Issue
Block a user