Improved datamodel and deterministic generation

This commit is contained in:
2026-04-30 01:29:29 +02:00
parent 973d4bbe7c
commit 26e87ab52c
14 changed files with 848 additions and 39 deletions

View File

@@ -21,6 +21,8 @@ class CandidateFeatureDraft:
location: str
confidence: float
source_refs: list[SourceReference]
primary_class: str = ""
attributes: list[str] = field(default_factory=list)
@dataclass(frozen=True)
@@ -31,6 +33,8 @@ class CandidateCapabilityDraft:
outputs: list[str]
confidence: float
source_refs: list[SourceReference]
primary_class: str = "capability"
attributes: list[str] = field(default_factory=list)
features: list[CandidateFeatureDraft] = field(default_factory=list)
evidence: list[CandidateEvidenceDraft] = field(default_factory=list)
@@ -41,6 +45,8 @@ class CandidateAbilityDraft:
description: str
confidence: float
source_refs: list[SourceReference]
primary_class: str = "ability"
attributes: list[str] = field(default_factory=list)
capabilities: list[CandidateCapabilityDraft] = field(default_factory=list)
@@ -68,6 +74,11 @@ class CandidateGraphGenerator:
credential_configs = self._facts(facts, "credential_config")
provider_registries = self._facts(facts, "provider_registry")
fallback_policies = self._facts(facts, "fallback_policy")
ability_primary_class, ability_attributes = self._ability_classification(
repository,
facts,
chunks,
)
ability_sources = docs or manifests or languages
ability = CandidateAbilityDraft(
@@ -82,6 +93,8 @@ class CandidateGraphGenerator:
languages=languages,
),
source_refs=self._source_refs(ability_sources),
primary_class=ability_primary_class,
attributes=ability_attributes,
capabilities=[],
)
@@ -119,6 +132,12 @@ class CandidateGraphGenerator:
docs=docs,
),
source_refs=self._source_refs(manifests + frameworks + languages),
primary_class="repository-structure",
attributes=self._structure_attributes(
manifests,
frameworks,
languages,
),
evidence=self._evidence(tests, examples, docs),
)
)
@@ -129,6 +148,8 @@ class CandidateGraphGenerator:
description=ability.description,
confidence=ability.confidence,
source_refs=ability.source_refs,
primary_class=ability.primary_class,
attributes=ability.attributes,
capabilities=capabilities,
)
]
@@ -154,6 +175,8 @@ class CandidateGraphGenerator:
docs=docs,
),
source_refs=self._source_refs(interfaces),
primary_class="interface",
attributes=self._interface_attributes(interfaces),
features=features,
evidence=self._evidence(tests, examples, docs),
)
@@ -181,6 +204,8 @@ class CandidateGraphGenerator:
source_refs=self._source_refs(
[fact for fact in providers if fact.name == provider]
),
primary_class="integration",
attributes=["llm-provider", provider.lower()],
)
for provider in provider_names
]
@@ -192,6 +217,8 @@ class CandidateGraphGenerator:
location=self._grouped_location(credentials),
confidence=0.7,
source_refs=self._source_refs(credentials),
primary_class="configuration",
attributes=["credential", "llm-provider"],
)
)
if registries:
@@ -202,6 +229,8 @@ class CandidateGraphGenerator:
location=self._grouped_location(registries),
confidence=0.65,
source_refs=self._source_refs(registries),
primary_class="backend",
attributes=["provider-registry", "llm-provider"],
)
)
if fallback_policies:
@@ -212,6 +241,8 @@ class CandidateGraphGenerator:
location=self._grouped_location(fallback_policies),
confidence=0.6,
source_refs=self._source_refs(fallback_policies),
primary_class="backend",
attributes=["fallback-policy", "llm-provider"],
)
)
return CandidateCapabilityDraft(
@@ -232,6 +263,13 @@ class CandidateGraphGenerator:
source_refs=self._source_refs(
providers + credentials + registries + fallback_policies
),
primary_class="llm-integration",
attributes=self._llm_provider_attributes(
providers,
credentials,
registries,
fallback_policies,
),
features=features,
evidence=self._evidence(tests, examples, docs),
)
@@ -256,6 +294,8 @@ class CandidateGraphGenerator:
location=fact.path,
confidence=0.65 if fact.value else 0.45,
source_refs=self._source_refs([fact]),
primary_class=feature_type,
attributes=self._feature_attributes(feature_type, [fact]),
)
)
continue
@@ -271,6 +311,8 @@ class CandidateGraphGenerator:
location=self._grouped_location(facts),
confidence=self._grouped_interface_confidence(facts),
source_refs=self._source_refs(facts),
primary_class=feature_type,
attributes=self._feature_attributes(feature_type, facts),
)
)
return features
@@ -357,6 +399,96 @@ class CandidateGraphGenerator:
return "API"
return "interface"
def _ability_classification(
self,
repository: Repository,
facts: list[ObservedFact],
chunks: list[ContentChunk],
) -> tuple[str, list[str]]:
text = " ".join(
[
repository.name,
repository.description or "",
" ".join(chunk.text[:600] for chunk in chunks if chunk.kind == "documentation"),
" ".join(f"{fact.kind} {fact.name} {fact.value}" for fact in facts),
]
).lower()
attributes: list[str] = []
if any(token in text for token in ("repository", "repo", "registry")):
attributes.append("repository")
if any(token in text for token in ("ability", "capability", "feature")):
return "repository-intelligence", self._unique(attributes + ["capability-mapping"])
if any(token in text for token in ("llm", "openrouter", "claude", "model provider")):
return "ai-integration", self._unique(attributes + ["llm-provider"])
if any(fact.kind == "interface" for fact in facts):
attributes.append("interface")
return "developer-tooling", self._unique(attributes)
def _interface_attributes(self, interfaces: list[ObservedFact]) -> list[str]:
feature_types = {self._feature_type(fact) for fact in interfaces}
attributes = ["api" if item == "API" else "cli" if item == "CLI" else "callable" for item in feature_types]
return self._unique(["surface", *attributes])
def _feature_attributes(
self,
feature_type: str,
facts: list[ObservedFact],
) -> list[str]:
attributes = [feature_type]
if feature_type == "API":
attributes.extend(["surface", "http"])
elif feature_type == "CLI":
attributes.extend(["surface", "command"])
else:
attributes.append("surface")
paths = " ".join(fact.path.lower() for fact in facts)
if "test" in paths:
attributes.append("test-linked")
return self._unique(attributes)
def _structure_attributes(
self,
manifests: list[ObservedFact],
frameworks: list[ObservedFact],
languages: list[ObservedFact],
) -> list[str]:
return self._unique(
[
"manifest" if manifests else "",
*[fact.name for fact in frameworks],
*[fact.name for fact in languages],
]
)
def _llm_provider_attributes(
self,
providers: list[ObservedFact],
credentials: list[ObservedFact],
registries: list[ObservedFact],
fallback_policies: list[ObservedFact],
) -> list[str]:
return self._unique(
[
"llm-provider",
*[fact.name.lower() for fact in providers],
"credential" if credentials else "",
"provider-registry" if registries else "",
"fallback-policy" if fallback_policies else "",
]
)
def _unique(self, values: list[str]) -> list[str]:
result: list[str] = []
seen: set[str] = set()
for value in values:
item = value.strip()
key = item.lower()
if not item or key in seen:
continue
seen.add(key)
result.append(item)
return result
def _interface_inputs(self, interfaces: list[ObservedFact]) -> list[str]:
feature_types = {self._feature_type(fact) for fact in interfaces}
inputs: list[str] = []

View File

@@ -73,6 +73,8 @@ def _combine_abilities(
description=_preferred_description(left.description, right.description),
confidence=max(left.confidence, right.confidence),
source_refs=_merge_source_refs(left.source_refs, right.source_refs),
primary_class=_preferred_text(left.primary_class, right.primary_class),
attributes=_merge_strings(left.attributes, right.attributes),
capabilities=_merge_capabilities(left.capabilities + right.capabilities),
)
@@ -107,6 +109,8 @@ def _combine_capabilities(
outputs=_merge_strings(left.outputs, right.outputs),
confidence=max(left.confidence, right.confidence),
source_refs=_merge_source_refs(left.source_refs, right.source_refs),
primary_class=_preferred_text(left.primary_class, right.primary_class),
attributes=_merge_strings(left.attributes, right.attributes),
features=_merge_features(left.features + right.features),
evidence=_merge_evidence(left.evidence + right.evidence),
)
@@ -128,6 +132,8 @@ def _merge_features(
location=_preferred_text(existing.location, feature.location),
confidence=max(existing.confidence, feature.confidence),
source_refs=_merge_source_refs(existing.source_refs, feature.source_refs),
primary_class=_preferred_text(existing.primary_class, feature.primary_class),
attributes=_merge_strings(existing.attributes, feature.attributes),
)
return merged