Ability naming builds on INTENT.md

This commit is contained in:
2026-05-03 00:47:51 +02:00
parent 6c4b0e6dcb
commit ea74722283
4 changed files with 261 additions and 11 deletions

View File

@@ -1,7 +1,7 @@
from __future__ import annotations
import re
from dataclasses import dataclass, field
from dataclasses import dataclass, field, replace
from repo_registry.core.models import ContentChunk, ObservedFact, Repository, SourceReference
@@ -123,7 +123,13 @@ class CandidateGraphGenerator:
docs,
)
)
if interfaces and not capabilities:
if interfaces and capabilities:
capabilities = self._attach_interface_features(
capabilities,
interfaces,
chunks,
)
elif interfaces:
capabilities.append(
self._interface_capability(interfaces, tests, examples, docs, chunks)
)
@@ -362,6 +368,71 @@ class CandidateGraphGenerator:
words.pop()
return self._title_from_words(words[:10])
def _attach_interface_features(
    self,
    capabilities: list[CandidateCapabilityDraft],
    interfaces: list[ObservedFact],
    chunks: list[ContentChunk],
) -> list[CandidateCapabilityDraft]:
    """Attach interface-derived features to the best-matching capabilities.

    Features are built from ``interfaces`` and ``chunks``; each one is
    assigned to the capability index chosen by
    ``_best_feature_capability_index``.  Capabilities that receive at least
    one feature are copied with ``dataclasses.replace``: the new features
    are appended, and ``inputs`` / ``outputs`` are filled from the attached
    features only when the capability has none of its own.  Capabilities
    that receive nothing are returned as the same objects.

    Returns ``capabilities`` unchanged when there are no features or no
    capabilities to attach them to.
    """
    features = self._interface_features(interfaces, chunks)
    # With no capabilities the index picked below (0 by default) would not
    # exist, so bail out instead of failing on the lookup.
    if not features or not capabilities:
        return capabilities

    attached_by_index: list[list[CandidateFeatureDraft]] = [[] for _ in capabilities]
    for feature in features:
        target = self._best_feature_capability_index(feature, capabilities)
        attached_by_index[target].append(feature)

    updated: list[CandidateCapabilityDraft] = []
    for capability, attached in zip(capabilities, attached_by_index):
        if not attached:
            updated.append(capability)
            continue
        updated.append(
            replace(
                capability,
                # Keep inputs/outputs already present on the capability.
                inputs=capability.inputs or self._feature_inputs(attached),
                outputs=capability.outputs or self._feature_outputs(attached),
                features=[*capability.features, *attached],
            )
        )
    return updated
def _best_feature_capability_index(
    self,
    feature: CandidateFeatureDraft,
    capabilities: list[CandidateCapabilityDraft],
) -> int:
    """Return the index of the capability that best matches ``feature``.

    A capability's score is the number of significant terms it shares with
    the feature (name, type and location against the capability's name,
    description, outputs and attributes), plus 3 when the feature is a
    ``"CLI"`` or ``"API"`` feature and the capability text mentions a
    keyword for that interface type.  The first capability with the highest
    score wins; ``0`` is returned when ``capabilities`` is empty.
    """
    # Keywords are matched as whole alphanumeric tokens (optionally plural).
    # A plain substring test would let "cli" match "client" or "api" match
    # "rapid" and hand out the bonus to unrelated capabilities.
    keyword_patterns = {
        "CLI": r"(?<![a-z0-9])(?:cli|commands?|mcp)(?![a-z0-9])",
        "API": r"(?<![a-z0-9])(?:apis?|https?|services?)(?![a-z0-9])",
    }
    keyword_pattern = keyword_patterns.get(feature.type)

    feature_text = f"{feature.name} {feature.type} {feature.location}".lower()
    feature_terms = self._significant_terms(feature_text)

    best_index = 0
    best_score = -1
    for index, capability in enumerate(capabilities):
        capability_text = " ".join(
            [
                capability.name,
                capability.description,
                " ".join(capability.outputs),
                " ".join(capability.attributes),
            ]
        ).lower()
        score = len(feature_terms & self._significant_terms(capability_text))
        if keyword_pattern and re.search(keyword_pattern, capability_text):
            score += 3
        # Strict comparison keeps the earliest capability on ties.
        if score > best_score:
            best_index = index
            best_score = score
    return best_index
def _interface_features(
self,
interfaces: list[ObservedFact],
@@ -424,7 +495,7 @@ class CandidateGraphGenerator:
chunks: list[ContentChunk],
) -> str:
names = [self._feature_name(fact, chunks) for fact in facts]
compact_names = [name for name in names if name]
compact_names = self._unique([name for name in names if name])
if not compact_names:
return f"{len(facts)} entry points"
visible = compact_names[:3]
@@ -623,6 +694,26 @@ class CandidateGraphGenerator:
result.append(item)
return result
def _significant_terms(self, text: str) -> set[str]:
    """Return the distinct lowercase alphanumeric tokens of ``text`` worth comparing.

    Tokens shorter than three characters and a fixed set of filler words
    are left out.
    """
    ignored = {
        "and",
        "the",
        "this",
        "that",
        "with",
        "from",
        "into",
        "for",
        "capability",
        "repository",
        "service",
    }
    terms: set[str] = set()
    for token in re.findall(r"[a-z0-9]+", text.lower()):
        if len(token) <= 2 or token in ignored:
            continue
        terms.add(token)
    return terms
def _interface_inputs(self, interfaces: list[ObservedFact]) -> list[str]:
feature_types = {self._feature_type(fact) for fact in interfaces}
inputs: list[str] = []
@@ -645,6 +736,28 @@ class CandidateGraphGenerator:
outputs.append("callable interface result")
return outputs
def _feature_inputs(self, features: list[CandidateFeatureDraft]) -> list[str]:
    """Return input labels for the interface types present in ``features``.

    ``"API"`` features contribute ``"HTTP request"`` and ``"CLI"`` features
    contribute ``"CLI arguments"``, in that order; when neither type is
    present the single label ``"caller input"`` is returned.
    """
    present_types = {feature.type for feature in features}
    labels = [
        label
        for feature_type, label in (("API", "HTTP request"), ("CLI", "CLI arguments"))
        if feature_type in present_types
    ]
    return labels or ["caller input"]
def _feature_outputs(self, features: list[CandidateFeatureDraft]) -> list[str]:
    """Return output labels for the interface types present in ``features``.

    ``"API"`` features contribute ``"HTTP response"`` and ``"CLI"`` features
    contribute ``"command output"``, in that order; when neither type is
    present the single label ``"callable interface result"`` is returned.
    """
    present_types = {feature.type for feature in features}
    labels = [
        label
        for feature_type, label in (("API", "HTTP response"), ("CLI", "command output"))
        if feature_type in present_types
    ]
    return labels or ["callable interface result"]
def _feature_name(self, fact: ObservedFact, chunks: list[ContentChunk]) -> str:
route_name = self._route_feature_name(fact.value)
if route_name:
@@ -797,7 +910,7 @@ class CandidateGraphGenerator:
return f"Support {self._humanize_identifier(repository.name)}"
def _document_purpose_sentence(self, chunks: list[ContentChunk]) -> str:
for chunk in self._documentation_chunks(chunks):
for chunk in self._purpose_chunks(chunks):
if chunk.kind not in {"intent", "documentation"}:
continue
lines = [line.strip() for line in chunk.text.splitlines() if line.strip()]
@@ -806,6 +919,28 @@ class CandidateGraphGenerator:
return paragraph
return ""
def _purpose_chunks(self, chunks: list[ContentChunk]) -> list[ContentChunk]:
    """Return intent/documentation chunks ordered by how well they state purpose.

    Only chunks of kind ``"intent"`` or ``"documentation"`` are kept, and
    chunks whose ``source_role`` metadata is ``"agent_guidance"`` are
    dropped.  The rest are ranked: intent summaries / INTENT.md first, then
    product documentation / paths starting with ``readme``, then any other
    document, and derived scope / SCOPE.md last.  Within a rank, chunks are
    ordered by lowercased path and then by start line.
    """

    def is_named(path: str, doc_name: str) -> bool:
        # Match on the file name rather than a raw path suffix, so that
        # e.g. "microscope.md" is not mistaken for "scope.md".  Prefixed
        # variants such as "project-scope.md" still count.
        file_name = path.replace("\\", "/").rsplit("/", 1)[-1]
        return file_name == doc_name or file_name.endswith(
            tuple(f"{separator}{doc_name}" for separator in "-_.")
        )

    def priority(chunk: ContentChunk) -> tuple[int, str, int]:
        role = chunk.metadata.get("source_role")
        path = chunk.path.lower()
        if role == "intent_summary" or is_named(path, "intent.md"):
            rank = 0
        elif role == "product_documentation" or path.startswith("readme"):
            rank = 1
        elif role == "derived_scope" or is_named(path, "scope.md"):
            rank = 3
        else:
            rank = 2
        return (rank, path, chunk.start_line)

    eligible = [
        chunk
        for chunk in chunks
        if chunk.kind in {"intent", "documentation"}
        and chunk.metadata.get("source_role") != "agent_guidance"
    ]
    return sorted(eligible, key=priority)
def _operations_ability_name(self, chunks: list[ContentChunk]) -> str:
text = " ".join(
chunk.text

View File

@@ -268,9 +268,50 @@ class RegistryService:
extracted = self.llm_extractor.extract(repository, chunks)
if extracted:
llm_candidates = self.llm_mapper.map(extracted, facts, chunks)
return llm_candidates + deterministic, "llm+deterministic"
return (
self._merge_llm_candidates(llm_candidates, deterministic),
"llm+deterministic",
)
return deterministic, "deterministic"
def _merge_llm_candidates(
    self,
    llm_candidates: list,
    deterministic: list,
) -> list:
    """Combine LLM-derived abilities with the deterministic ones.

    LLM abilities that pass ``_candidate_ability_has_trusted_sources`` are
    kept and placed ahead of the deterministic abilities.  The capabilities
    of the remaining LLM abilities are appended to a copy of the first
    deterministic ability; the abilities themselves are dropped.  When
    there are no deterministic abilities, only the trusted LLM abilities
    are returned.
    """
    is_trusted = self._candidate_ability_has_trusted_sources
    if not deterministic:
        return list(filter(is_trusted, llm_candidates))

    kept = []
    orphaned_capabilities = []
    for candidate in llm_candidates:
        if is_trusted(candidate):
            kept.append(candidate)
        else:
            orphaned_capabilities.extend(candidate.capabilities)

    # Work on a copy so the caller's list is left untouched.
    merged = list(deterministic)
    if orphaned_capabilities:
        head = merged[0]
        merged[0] = replace(
            head,
            capabilities=[*head.capabilities, *orphaned_capabilities],
        )
    return kept + merged
def _candidate_ability_has_trusted_sources(self, ability) -> bool:
    """Return True when ``ability`` cites at least one trusted source.

    A source reference is trusted when its kind is one of ``intent``,
    ``documentation``, ``interface``, ``test`` or ``example`` and it does
    not point at a SCOPE.md document.  Abilities without any source
    references are not trusted.
    """
    if not ability.source_refs:
        return False

    trusted_kinds = {"intent", "documentation", "interface", "test", "example"}
    scope_suffixes = ("-scope.md", "_scope.md", ".scope.md")
    for ref in ability.source_refs:
        if ref.kind not in trusted_kinds:
            continue
        # Look at the file name only: a raw suffix test on the whole path
        # would also reject unrelated files such as "telescope.md".
        file_name = ref.path.lower().replace("\\", "/").rsplit("/", 1)[-1]
        is_scope_doc = file_name == "scope.md" or file_name.endswith(scope_suffixes)
        if not is_scope_doc:
            return True
    return False
def list_analysis_runs(self, repository_id: int) -> list[AnalysisRun]:
    """Return what the store's ``list_analysis_runs`` yields for ``repository_id``."""
    runs = self.store.list_analysis_runs(repository_id)
    return runs

View File

@@ -135,6 +135,53 @@ def test_candidate_generator_extracts_intended_capability_blocks_from_intent_chu
assert [ref.path for ref in intent_capability.source_refs] == ["INTENT.md"]
def test_candidate_generator_prefers_intent_over_derived_scope_for_ability_name():
    """The ability name follows INTENT.md even though the SCOPE.md chunk is listed first."""
    repository = Repository(
        id=1,
        name="LLMConnect",
        url="/tmp/llm-connect",
        description=None,
        branch="main",
        status="analyzed",
    )
    intent_fact = fact(
        1,
        "intent",
        "INTENT",
        "INTENT.md",
        metadata={"source_role": "intent_summary"},
    )
    scope_fact = fact(
        2,
        "documentation",
        "SCOPE",
        "SCOPE.md",
        metadata={"source_role": "derived_scope"},
    )
    scope_chunk = chunk(
        1,
        "documentation",
        "SCOPE.md",
        "# SCOPE\n\nA stale first paragraph copied from another repository.",
    )
    intent_chunk = chunk(
        2,
        "intent",
        "INTENT.md",
        "# INTENT\n\nProvide a provider-agnostic LLM connector.",
    )
    # Roles are set on the chunk metadata after construction.
    scope_chunk.metadata["source_role"] = "derived_scope"
    intent_chunk.metadata["source_role"] = "intent_summary"

    graph = CandidateGraphGenerator().generate(
        repository,
        [intent_fact, scope_fact],
        [scope_chunk, intent_chunk],
    )

    assert graph[0].name == "Provide A Provider-agnostic LLM Connector"
def test_candidate_generator_enriches_descriptions_from_content_chunks():
repository = Repository(
id=1,

View File

@@ -4,7 +4,11 @@ import subprocess
from repo_registry.core.logging import LOGGER_NAME
from repo_registry.core.service import RegistryService
from repo_registry.llm_extraction import ExtractedAbility, ExtractedCapability
from repo_registry.llm_extraction import (
ExtractedAbility,
ExtractedCapability,
ExtractedFeature,
)
from repo_registry.repo_ingestion.git import GitIngestionService
from repo_registry.semantic import HashingEmbeddingProvider
from repo_registry.storage.sqlite import NotFoundError, RegistryStore
@@ -522,6 +526,15 @@ def test_regression_ops_bridge_like_repo_is_it_operations_not_llm_provider(tmp_p
assert "Maintain Continuous Connectivity Between Remote Systems And Central Hub" in capability_names
assert "Make Connectivity Observable Auditable And Controllable" in capability_names
assert "Expose CLI And MCP Accessible Service" in capability_names
cli_capability = next(
capability
for candidate_ability in graph.abilities
for capability in candidate_ability.capabilities
if capability.name == "Expose CLI And MCP Accessible Service"
)
assert {feature.name for feature in cli_capability.features} == {
"CLI command surface: CLI command up"
}
assert ("llm_provider", "Claude", "scripts/register_mcp.py") not in facts
assert ("llm_provider", "Claude", "workplans/BRIDGE-WP-0003.md") not in facts
@@ -759,7 +772,7 @@ def test_analyze_repository_can_use_optional_llm_extractor(tmp_path):
}
def test_analyze_repository_keeps_deterministic_candidates_when_llm_returns_stale_entries(tmp_path):
def test_analyze_repository_folds_llm_capabilities_when_ability_comes_from_scope(tmp_path):
source = tmp_path / "repo"
source.mkdir()
(source / "INTENT.md").write_text(
@@ -773,6 +786,10 @@ def test_analyze_repository_keeps_deterministic_candidates_when_llm_returns_stal
"# SCOPE\n\nOld approved entry: route LLM provider requests.\n",
encoding="utf-8",
)
(source / "providers.py").write_text(
"provider_registry = {'openrouter': object()}\n",
encoding="utf-8",
)
store = RegistryStore(tmp_path / "registry.sqlite3")
store.initialize()
extractor = FakeLLMExtractor(
@@ -783,9 +800,17 @@ def test_analyze_repository_keeps_deterministic_candidates_when_llm_returns_stal
source_paths=["SCOPE.md"],
capabilities=[
ExtractedCapability(
name="Route LLM Provider Requests",
description="Old scope reuse.",
source_paths=["SCOPE.md"],
name="Configure OpenRouter Adapter",
description="Source-linked provider adapter.",
source_paths=["providers.py"],
features=[
ExtractedFeature(
name="OpenRouter provider registry",
type="backend",
location="providers.py",
source_paths=["providers.py"],
)
],
)
],
)
@@ -807,7 +832,9 @@ def test_analyze_repository_keeps_deterministic_candidates_when_llm_returns_stal
for ability in graph.abilities
for capability in ability.capabilities
}
assert "Route LLM Provider Requests" in capability_names
ability_names = {ability.name for ability in graph.abilities}
assert "Old LLM Routing" not in ability_names
assert "Configure OpenRouter Adapter" in capability_names
assert "Enforce OIDC PKCE Profiles" in capability_names
assert decisions[0].action == "llm_extraction_used"
assert "llm+deterministic candidate generation" in decisions[0].notes