generated from coulomb/repo-seed
Ability naming builds on INTENT.md
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from dataclasses import dataclass, field, replace
|
||||
|
||||
from repo_registry.core.models import ContentChunk, ObservedFact, Repository, SourceReference
|
||||
|
||||
@@ -123,7 +123,13 @@ class CandidateGraphGenerator:
|
||||
docs,
|
||||
)
|
||||
)
|
||||
if interfaces and not capabilities:
|
||||
if interfaces and capabilities:
|
||||
capabilities = self._attach_interface_features(
|
||||
capabilities,
|
||||
interfaces,
|
||||
chunks,
|
||||
)
|
||||
elif interfaces:
|
||||
capabilities.append(
|
||||
self._interface_capability(interfaces, tests, examples, docs, chunks)
|
||||
)
|
||||
@@ -362,6 +368,71 @@ class CandidateGraphGenerator:
|
||||
words.pop()
|
||||
return self._title_from_words(words[:10])
|
||||
|
||||
def _attach_interface_features(
|
||||
self,
|
||||
capabilities: list[CandidateCapabilityDraft],
|
||||
interfaces: list[ObservedFact],
|
||||
chunks: list[ContentChunk],
|
||||
) -> list[CandidateCapabilityDraft]:
|
||||
features = self._interface_features(interfaces, chunks)
|
||||
if not features:
|
||||
return capabilities
|
||||
capability_features: dict[int, list[CandidateFeatureDraft]] = {
|
||||
index: [] for index, _ in enumerate(capabilities)
|
||||
}
|
||||
for feature in features:
|
||||
index = self._best_feature_capability_index(feature, capabilities)
|
||||
capability_features[index].append(feature)
|
||||
|
||||
updated: list[CandidateCapabilityDraft] = []
|
||||
for index, capability in enumerate(capabilities):
|
||||
attached = capability_features[index]
|
||||
if not attached:
|
||||
updated.append(capability)
|
||||
continue
|
||||
updated.append(
|
||||
replace(
|
||||
capability,
|
||||
inputs=capability.inputs or self._feature_inputs(attached),
|
||||
outputs=capability.outputs or self._feature_outputs(attached),
|
||||
features=[*capability.features, *attached],
|
||||
)
|
||||
)
|
||||
return updated
|
||||
|
||||
def _best_feature_capability_index(
|
||||
self,
|
||||
feature: CandidateFeatureDraft,
|
||||
capabilities: list[CandidateCapabilityDraft],
|
||||
) -> int:
|
||||
feature_text = f"{feature.name} {feature.type} {feature.location}".lower()
|
||||
feature_terms = self._significant_terms(feature_text)
|
||||
best_index = 0
|
||||
best_score = -1
|
||||
for index, capability in enumerate(capabilities):
|
||||
capability_text = " ".join(
|
||||
[
|
||||
capability.name,
|
||||
capability.description,
|
||||
" ".join(capability.outputs),
|
||||
" ".join(capability.attributes),
|
||||
]
|
||||
).lower()
|
||||
capability_terms = self._significant_terms(capability_text)
|
||||
score = len(feature_terms & capability_terms)
|
||||
if feature.type == "CLI" and any(
|
||||
token in capability_text for token in ("cli", "command", "mcp")
|
||||
):
|
||||
score += 3
|
||||
if feature.type == "API" and any(
|
||||
token in capability_text for token in ("api", "http", "service")
|
||||
):
|
||||
score += 3
|
||||
if score > best_score:
|
||||
best_index = index
|
||||
best_score = score
|
||||
return best_index
|
||||
|
||||
def _interface_features(
|
||||
self,
|
||||
interfaces: list[ObservedFact],
|
||||
@@ -424,7 +495,7 @@ class CandidateGraphGenerator:
|
||||
chunks: list[ContentChunk],
|
||||
) -> str:
|
||||
names = [self._feature_name(fact, chunks) for fact in facts]
|
||||
compact_names = [name for name in names if name]
|
||||
compact_names = self._unique([name for name in names if name])
|
||||
if not compact_names:
|
||||
return f"{len(facts)} entry points"
|
||||
visible = compact_names[:3]
|
||||
@@ -623,6 +694,26 @@ class CandidateGraphGenerator:
|
||||
result.append(item)
|
||||
return result
|
||||
|
||||
def _significant_terms(self, text: str) -> set[str]:
|
||||
stop_words = {
|
||||
"and",
|
||||
"the",
|
||||
"this",
|
||||
"that",
|
||||
"with",
|
||||
"from",
|
||||
"into",
|
||||
"for",
|
||||
"capability",
|
||||
"repository",
|
||||
"service",
|
||||
}
|
||||
return {
|
||||
term
|
||||
for term in re.findall(r"[a-z0-9]+", text.lower())
|
||||
if len(term) > 2 and term not in stop_words
|
||||
}
|
||||
|
||||
def _interface_inputs(self, interfaces: list[ObservedFact]) -> list[str]:
|
||||
feature_types = {self._feature_type(fact) for fact in interfaces}
|
||||
inputs: list[str] = []
|
||||
@@ -645,6 +736,28 @@ class CandidateGraphGenerator:
|
||||
outputs.append("callable interface result")
|
||||
return outputs
|
||||
|
||||
def _feature_inputs(self, features: list[CandidateFeatureDraft]) -> list[str]:
|
||||
feature_types = {feature.type for feature in features}
|
||||
inputs: list[str] = []
|
||||
if "API" in feature_types:
|
||||
inputs.append("HTTP request")
|
||||
if "CLI" in feature_types:
|
||||
inputs.append("CLI arguments")
|
||||
if not inputs:
|
||||
inputs.append("caller input")
|
||||
return inputs
|
||||
|
||||
def _feature_outputs(self, features: list[CandidateFeatureDraft]) -> list[str]:
|
||||
feature_types = {feature.type for feature in features}
|
||||
outputs: list[str] = []
|
||||
if "API" in feature_types:
|
||||
outputs.append("HTTP response")
|
||||
if "CLI" in feature_types:
|
||||
outputs.append("command output")
|
||||
if not outputs:
|
||||
outputs.append("callable interface result")
|
||||
return outputs
|
||||
|
||||
def _feature_name(self, fact: ObservedFact, chunks: list[ContentChunk]) -> str:
|
||||
route_name = self._route_feature_name(fact.value)
|
||||
if route_name:
|
||||
@@ -797,7 +910,7 @@ class CandidateGraphGenerator:
|
||||
return f"Support {self._humanize_identifier(repository.name)}"
|
||||
|
||||
def _document_purpose_sentence(self, chunks: list[ContentChunk]) -> str:
|
||||
for chunk in self._documentation_chunks(chunks):
|
||||
for chunk in self._purpose_chunks(chunks):
|
||||
if chunk.kind not in {"intent", "documentation"}:
|
||||
continue
|
||||
lines = [line.strip() for line in chunk.text.splitlines() if line.strip()]
|
||||
@@ -806,6 +919,28 @@ class CandidateGraphGenerator:
|
||||
return paragraph
|
||||
return ""
|
||||
|
||||
def _purpose_chunks(self, chunks: list[ContentChunk]) -> list[ContentChunk]:
|
||||
def priority(chunk: ContentChunk) -> tuple[int, str, int]:
|
||||
role = chunk.metadata.get("source_role")
|
||||
path = chunk.path.lower()
|
||||
if role == "intent_summary" or path.endswith("intent.md"):
|
||||
return (0, path, chunk.start_line)
|
||||
if role == "product_documentation" or path.startswith("readme"):
|
||||
return (1, path, chunk.start_line)
|
||||
if role == "derived_scope" or path.endswith("scope.md"):
|
||||
return (3, path, chunk.start_line)
|
||||
return (2, path, chunk.start_line)
|
||||
|
||||
return sorted(
|
||||
[
|
||||
chunk
|
||||
for chunk in chunks
|
||||
if chunk.kind in {"intent", "documentation"}
|
||||
and chunk.metadata.get("source_role") != "agent_guidance"
|
||||
],
|
||||
key=priority,
|
||||
)
|
||||
|
||||
def _operations_ability_name(self, chunks: list[ContentChunk]) -> str:
|
||||
text = " ".join(
|
||||
chunk.text
|
||||
|
||||
@@ -268,9 +268,50 @@ class RegistryService:
|
||||
extracted = self.llm_extractor.extract(repository, chunks)
|
||||
if extracted:
|
||||
llm_candidates = self.llm_mapper.map(extracted, facts, chunks)
|
||||
return llm_candidates + deterministic, "llm+deterministic"
|
||||
return (
|
||||
self._merge_llm_candidates(llm_candidates, deterministic),
|
||||
"llm+deterministic",
|
||||
)
|
||||
return deterministic, "deterministic"
|
||||
|
||||
def _merge_llm_candidates(
|
||||
self,
|
||||
llm_candidates: list,
|
||||
deterministic: list,
|
||||
) -> list:
|
||||
if not deterministic:
|
||||
return [
|
||||
ability
|
||||
for ability in llm_candidates
|
||||
if self._candidate_ability_has_trusted_sources(ability)
|
||||
]
|
||||
|
||||
merged_deterministic = list(deterministic)
|
||||
trusted_llm = []
|
||||
folded_capabilities = []
|
||||
for ability in llm_candidates:
|
||||
if self._candidate_ability_has_trusted_sources(ability):
|
||||
trusted_llm.append(ability)
|
||||
else:
|
||||
folded_capabilities.extend(ability.capabilities)
|
||||
|
||||
if folded_capabilities:
|
||||
target = merged_deterministic[0]
|
||||
merged_deterministic[0] = replace(
|
||||
target,
|
||||
capabilities=[*target.capabilities, *folded_capabilities],
|
||||
)
|
||||
return [*trusted_llm, *merged_deterministic]
|
||||
|
||||
def _candidate_ability_has_trusted_sources(self, ability) -> bool:
|
||||
if not ability.source_refs:
|
||||
return False
|
||||
return any(
|
||||
ref.kind in {"intent", "documentation", "interface", "test", "example"}
|
||||
and not ref.path.lower().endswith("scope.md")
|
||||
for ref in ability.source_refs
|
||||
)
|
||||
|
||||
def list_analysis_runs(self, repository_id: int) -> list[AnalysisRun]:
    """Return what the backing store lists as analysis runs for ``repository_id``."""
    runs = self.store.list_analysis_runs(repository_id)
    return runs
|
||||
|
||||
|
||||
@@ -135,6 +135,53 @@ def test_candidate_generator_extracts_intended_capability_blocks_from_intent_chu
|
||||
assert [ref.path for ref in intent_capability.source_refs] == ["INTENT.md"]
|
||||
|
||||
|
||||
def test_candidate_generator_prefers_intent_over_derived_scope_for_ability_name():
    """The ability name comes from INTENT.md even when a SCOPE.md chunk is listed first."""
    repository = Repository(
        id=1,
        name="LLMConnect",
        url="/tmp/llm-connect",
        description=None,
        branch="main",
        status="analyzed",
    )
    intent_fact = fact(
        1,
        "intent",
        "INTENT",
        "INTENT.md",
        metadata={"source_role": "intent_summary"},
    )
    scope_fact = fact(
        2,
        "documentation",
        "SCOPE",
        "SCOPE.md",
        metadata={"source_role": "derived_scope"},
    )
    scope_chunk = chunk(
        1,
        "documentation",
        "SCOPE.md",
        "# SCOPE\n\nA stale first paragraph copied from another repository.",
    )
    intent_chunk = chunk(
        2,
        "intent",
        "INTENT.md",
        "# INTENT\n\nProvide a provider-agnostic LLM connector.",
    )
    scope_chunk.metadata["source_role"] = "derived_scope"
    intent_chunk.metadata["source_role"] = "intent_summary"

    # The scope chunk is deliberately passed first: input order must not decide the name.
    graph = CandidateGraphGenerator().generate(
        repository,
        [intent_fact, scope_fact],
        [scope_chunk, intent_chunk],
    )

    assert graph[0].name == "Provide A Provider-agnostic LLM Connector"
|
||||
|
||||
|
||||
def test_candidate_generator_enriches_descriptions_from_content_chunks():
|
||||
repository = Repository(
|
||||
id=1,
|
||||
|
||||
@@ -4,7 +4,11 @@ import subprocess
|
||||
|
||||
from repo_registry.core.logging import LOGGER_NAME
|
||||
from repo_registry.core.service import RegistryService
|
||||
from repo_registry.llm_extraction import ExtractedAbility, ExtractedCapability
|
||||
from repo_registry.llm_extraction import (
|
||||
ExtractedAbility,
|
||||
ExtractedCapability,
|
||||
ExtractedFeature,
|
||||
)
|
||||
from repo_registry.repo_ingestion.git import GitIngestionService
|
||||
from repo_registry.semantic import HashingEmbeddingProvider
|
||||
from repo_registry.storage.sqlite import NotFoundError, RegistryStore
|
||||
@@ -522,6 +526,15 @@ def test_regression_ops_bridge_like_repo_is_it_operations_not_llm_provider(tmp_p
|
||||
assert "Maintain Continuous Connectivity Between Remote Systems And Central Hub" in capability_names
|
||||
assert "Make Connectivity Observable Auditable And Controllable" in capability_names
|
||||
assert "Expose CLI And MCP Accessible Service" in capability_names
|
||||
cli_capability = next(
|
||||
capability
|
||||
for candidate_ability in graph.abilities
|
||||
for capability in candidate_ability.capabilities
|
||||
if capability.name == "Expose CLI And MCP Accessible Service"
|
||||
)
|
||||
assert {feature.name for feature in cli_capability.features} == {
|
||||
"CLI command surface: CLI command up"
|
||||
}
|
||||
assert ("llm_provider", "Claude", "scripts/register_mcp.py") not in facts
|
||||
assert ("llm_provider", "Claude", "workplans/BRIDGE-WP-0003.md") not in facts
|
||||
|
||||
@@ -759,7 +772,7 @@ def test_analyze_repository_can_use_optional_llm_extractor(tmp_path):
|
||||
}
|
||||
|
||||
|
||||
def test_analyze_repository_keeps_deterministic_candidates_when_llm_returns_stale_entries(tmp_path):
|
||||
def test_analyze_repository_folds_llm_capabilities_when_ability_comes_from_scope(tmp_path):
|
||||
source = tmp_path / "repo"
|
||||
source.mkdir()
|
||||
(source / "INTENT.md").write_text(
|
||||
@@ -773,6 +786,10 @@ def test_analyze_repository_keeps_deterministic_candidates_when_llm_returns_stal
|
||||
"# SCOPE\n\nOld approved entry: route LLM provider requests.\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
(source / "providers.py").write_text(
|
||||
"provider_registry = {'openrouter': object()}\n",
|
||||
encoding="utf-8",
|
||||
)
|
||||
store = RegistryStore(tmp_path / "registry.sqlite3")
|
||||
store.initialize()
|
||||
extractor = FakeLLMExtractor(
|
||||
@@ -783,9 +800,17 @@ def test_analyze_repository_keeps_deterministic_candidates_when_llm_returns_stal
|
||||
source_paths=["SCOPE.md"],
|
||||
capabilities=[
|
||||
ExtractedCapability(
|
||||
name="Route LLM Provider Requests",
|
||||
description="Old scope reuse.",
|
||||
source_paths=["SCOPE.md"],
|
||||
name="Configure OpenRouter Adapter",
|
||||
description="Source-linked provider adapter.",
|
||||
source_paths=["providers.py"],
|
||||
features=[
|
||||
ExtractedFeature(
|
||||
name="OpenRouter provider registry",
|
||||
type="backend",
|
||||
location="providers.py",
|
||||
source_paths=["providers.py"],
|
||||
)
|
||||
],
|
||||
)
|
||||
],
|
||||
)
|
||||
@@ -807,7 +832,9 @@ def test_analyze_repository_keeps_deterministic_candidates_when_llm_returns_stal
|
||||
for ability in graph.abilities
|
||||
for capability in ability.capabilities
|
||||
}
|
||||
assert "Route LLM Provider Requests" in capability_names
|
||||
ability_names = {ability.name for ability in graph.abilities}
|
||||
assert "Old LLM Routing" not in ability_names
|
||||
assert "Configure OpenRouter Adapter" in capability_names
|
||||
assert "Enforce OIDC PKCE Profiles" in capability_names
|
||||
assert decisions[0].action == "llm_extraction_used"
|
||||
assert "llm+deterministic candidate generation" in decisions[0].notes
|
||||
|
||||
Reference in New Issue
Block a user