From ea7472228399399f53b559c5559342948070bc63 Mon Sep 17 00:00:00 2001 From: tegwick Date: Sun, 3 May 2026 00:47:51 +0200 Subject: [PATCH] Ability naming builds on INTENT.md --- .../candidate_graph/generator.py | 143 +++++++++++++++++- src/repo_registry/core/service.py | 43 +++++- tests/test_candidate_graph.py | 47 ++++++ tests/test_registry_service.py | 39 ++++- 4 files changed, 261 insertions(+), 11 deletions(-) diff --git a/src/repo_registry/candidate_graph/generator.py b/src/repo_registry/candidate_graph/generator.py index 72240b1..99ad26e 100644 --- a/src/repo_registry/candidate_graph/generator.py +++ b/src/repo_registry/candidate_graph/generator.py @@ -1,7 +1,7 @@ from __future__ import annotations import re -from dataclasses import dataclass, field +from dataclasses import dataclass, field, replace from repo_registry.core.models import ContentChunk, ObservedFact, Repository, SourceReference @@ -123,7 +123,13 @@ class CandidateGraphGenerator: docs, ) ) - if interfaces and not capabilities: + if interfaces and capabilities: + capabilities = self._attach_interface_features( + capabilities, + interfaces, + chunks, + ) + elif interfaces: capabilities.append( self._interface_capability(interfaces, tests, examples, docs, chunks) ) @@ -362,6 +368,71 @@ class CandidateGraphGenerator: words.pop() return self._title_from_words(words[:10]) + def _attach_interface_features( + self, + capabilities: list[CandidateCapabilityDraft], + interfaces: list[ObservedFact], + chunks: list[ContentChunk], + ) -> list[CandidateCapabilityDraft]: + features = self._interface_features(interfaces, chunks) + if not features: + return capabilities + capability_features: dict[int, list[CandidateFeatureDraft]] = { + index: [] for index, _ in enumerate(capabilities) + } + for feature in features: + index = self._best_feature_capability_index(feature, capabilities) + capability_features[index].append(feature) + + updated: list[CandidateCapabilityDraft] = [] + for index, capability in enumerate(capabilities): + attached = capability_features[index] + if not attached: + updated.append(capability) + continue + updated.append( + replace( + capability, + inputs=capability.inputs or self._feature_inputs(attached), + outputs=capability.outputs or self._feature_outputs(attached), + features=[*capability.features, *attached], + ) + ) + return updated + + def _best_feature_capability_index( + self, + feature: CandidateFeatureDraft, + capabilities: list[CandidateCapabilityDraft], + ) -> int: + feature_text = f"{feature.name} {feature.type} {feature.location}".lower() + feature_terms = self._significant_terms(feature_text) + best_index = 0 + best_score = -1 + for index, capability in enumerate(capabilities): + capability_text = " ".join( + [ + capability.name, + capability.description, + " ".join(capability.outputs), + " ".join(capability.attributes), + ] + ).lower() + capability_terms = self._significant_terms(capability_text) + score = len(feature_terms & capability_terms) + if feature.type == "CLI" and any( + token in capability_text for token in ("cli", "command", "mcp") + ): + score += 3 + if feature.type == "API" and any( + token in capability_text for token in ("api", "http", "service") + ): + score += 3 + if score > best_score: + best_index = index + best_score = score + return best_index + def _interface_features( self, interfaces: list[ObservedFact], @@ -424,7 +495,7 @@ class CandidateGraphGenerator: chunks: list[ContentChunk], ) -> str: names = [self._feature_name(fact, chunks) for fact in facts] - compact_names = [name for name in names if name] + compact_names = self._unique([name for name in names if name]) if not compact_names: return f"{len(facts)} entry points" visible = compact_names[:3] @@ -623,6 +694,26 @@ class CandidateGraphGenerator: result.append(item) return result + def _significant_terms(self, text: str) -> set[str]: + stop_words = { + "and", + "the", + "this", + "that", + "with", + "from", + "into", + "for", + "capability", + "repository", + "service", + } + return { + term + for term in re.findall(r"[a-z0-9]+", text.lower()) + if len(term) > 2 and term not in stop_words + } + def _interface_inputs(self, interfaces: list[ObservedFact]) -> list[str]: feature_types = {self._feature_type(fact) for fact in interfaces} inputs: list[str] = [] @@ -645,6 +736,28 @@ class CandidateGraphGenerator: outputs.append("callable interface result") return outputs + def _feature_inputs(self, features: list[CandidateFeatureDraft]) -> list[str]: + feature_types = {feature.type for feature in features} + inputs: list[str] = [] + if "API" in feature_types: + inputs.append("HTTP request") + if "CLI" in feature_types: + inputs.append("CLI arguments") + if not inputs: + inputs.append("caller input") + return inputs + + def _feature_outputs(self, features: list[CandidateFeatureDraft]) -> list[str]: + feature_types = {feature.type for feature in features} + outputs: list[str] = [] + if "API" in feature_types: + outputs.append("HTTP response") + if "CLI" in feature_types: + outputs.append("command output") + if not outputs: + outputs.append("callable interface result") + return outputs + def _feature_name(self, fact: ObservedFact, chunks: list[ContentChunk]) -> str: route_name = self._route_feature_name(fact.value) if route_name: @@ -797,7 +910,7 @@ class CandidateGraphGenerator: return f"Support {self._humanize_identifier(repository.name)}" def _document_purpose_sentence(self, chunks: list[ContentChunk]) -> str: - for chunk in self._documentation_chunks(chunks): + for chunk in self._purpose_chunks(chunks): if chunk.kind not in {"intent", "documentation"}: continue lines = [line.strip() for line in chunk.text.splitlines() if line.strip()] @@ -806,6 +919,28 @@ class CandidateGraphGenerator: return paragraph return "" + def _purpose_chunks(self, chunks: list[ContentChunk]) -> list[ContentChunk]: + def priority(chunk: ContentChunk) -> tuple[int, str, int]: + role = chunk.metadata.get("source_role") + path = chunk.path.lower() + if role == "intent_summary" or path.endswith("intent.md"): + return (0, path, chunk.start_line) + if role == "product_documentation" or path.startswith("readme"): + return (1, path, chunk.start_line) + if role == "derived_scope" or path.endswith("scope.md"): + return (3, path, chunk.start_line) + return (2, path, chunk.start_line) + + return sorted( + [ + chunk + for chunk in chunks + if chunk.kind in {"intent", "documentation"} + and chunk.metadata.get("source_role") != "agent_guidance" + ], + key=priority, + ) + def _operations_ability_name(self, chunks: list[ContentChunk]) -> str: text = " ".join( chunk.text diff --git a/src/repo_registry/core/service.py b/src/repo_registry/core/service.py index 1604f30..aa6b51b 100644 --- a/src/repo_registry/core/service.py +++ b/src/repo_registry/core/service.py @@ -268,9 +268,50 @@ class RegistryService: extracted = self.llm_extractor.extract(repository, chunks) if extracted: llm_candidates = self.llm_mapper.map(extracted, facts, chunks) - return llm_candidates + deterministic, "llm+deterministic" + return ( + self._merge_llm_candidates(llm_candidates, deterministic), + "llm+deterministic", + ) return deterministic, "deterministic" + def _merge_llm_candidates( + self, + llm_candidates: list, + deterministic: list, + ) -> list: + if not deterministic: + return [ + ability + for ability in llm_candidates + if self._candidate_ability_has_trusted_sources(ability) + ] + + merged_deterministic = list(deterministic) + trusted_llm = [] + folded_capabilities = [] + for ability in llm_candidates: + if self._candidate_ability_has_trusted_sources(ability): + trusted_llm.append(ability) + else: + folded_capabilities.extend(ability.capabilities) + + if folded_capabilities: + target = merged_deterministic[0] + merged_deterministic[0] = replace( + target, + capabilities=[*target.capabilities, *folded_capabilities], + ) + return [*trusted_llm, *merged_deterministic] + + def _candidate_ability_has_trusted_sources(self, ability) -> bool: + if not ability.source_refs: + return False + return any( + ref.kind in {"intent", "documentation", "interface", "test", "example"} + and not ref.path.lower().endswith("scope.md") + for ref in ability.source_refs + ) + def list_analysis_runs(self, repository_id: int) -> list[AnalysisRun]: return self.store.list_analysis_runs(repository_id) diff --git a/tests/test_candidate_graph.py b/tests/test_candidate_graph.py index ac9c1a0..fcfc009 100644 --- a/tests/test_candidate_graph.py +++ b/tests/test_candidate_graph.py @@ -135,6 +135,53 @@ def test_candidate_generator_extracts_intended_capability_blocks_from_intent_chu assert [ref.path for ref in intent_capability.source_refs] == ["INTENT.md"] +def test_candidate_generator_prefers_intent_over_derived_scope_for_ability_name(): + repository = Repository( + id=1, + name="LLMConnect", + url="/tmp/llm-connect", + description=None, + branch="main", + status="analyzed", + ) + facts = [ + fact( + 1, + "intent", + "INTENT", + "INTENT.md", + metadata={"source_role": "intent_summary"}, + ), + fact( + 2, + "documentation", + "SCOPE", + "SCOPE.md", + metadata={"source_role": "derived_scope"}, + ), + ] + chunks = [ + chunk( + 1, + "documentation", + "SCOPE.md", + "# SCOPE\n\nA stale first paragraph copied from another repository.", + ), + chunk( + 2, + "intent", + "INTENT.md", + "# INTENT\n\nProvide a provider-agnostic LLM connector.", + ), + ] + chunks[0].metadata["source_role"] = "derived_scope" + chunks[1].metadata["source_role"] = "intent_summary" + + graph = CandidateGraphGenerator().generate(repository, facts, chunks) + + assert graph[0].name == "Provide A Provider-agnostic LLM Connector" + + def test_candidate_generator_enriches_descriptions_from_content_chunks(): repository = Repository( id=1, diff --git a/tests/test_registry_service.py b/tests/test_registry_service.py index 2359539..d9dcd80 100644 --- a/tests/test_registry_service.py +++ b/tests/test_registry_service.py @@ -4,7 +4,11 @@ import subprocess from repo_registry.core.logging import LOGGER_NAME from repo_registry.core.service import RegistryService -from repo_registry.llm_extraction import ExtractedAbility, ExtractedCapability +from repo_registry.llm_extraction import ( + ExtractedAbility, + ExtractedCapability, + ExtractedFeature, +) from repo_registry.repo_ingestion.git import GitIngestionService from repo_registry.semantic import HashingEmbeddingProvider from repo_registry.storage.sqlite import NotFoundError, RegistryStore @@ -522,6 +526,15 @@ def test_regression_ops_bridge_like_repo_is_it_operations_not_llm_provider(tmp_p assert "Maintain Continuous Connectivity Between Remote Systems And Central Hub" in capability_names assert "Make Connectivity Observable Auditable And Controllable" in capability_names assert "Expose CLI And MCP Accessible Service" in capability_names + cli_capability = next( + capability + for candidate_ability in graph.abilities + for capability in candidate_ability.capabilities + if capability.name == "Expose CLI And MCP Accessible Service" + ) + assert {feature.name for feature in cli_capability.features} == { + "CLI command surface: CLI command up" + } assert ("llm_provider", "Claude", "scripts/register_mcp.py") not in facts assert ("llm_provider", "Claude", "workplans/BRIDGE-WP-0003.md") not in facts @@ -759,7 +772,7 @@ def test_analyze_repository_can_use_optional_llm_extractor(tmp_path): } -def test_analyze_repository_keeps_deterministic_candidates_when_llm_returns_stale_entries(tmp_path): +def test_analyze_repository_folds_llm_capabilities_when_ability_comes_from_scope(tmp_path): source = tmp_path / "repo" source.mkdir() (source / "INTENT.md").write_text( @@ -773,6 +786,10 @@ def test_analyze_repository_keeps_deterministic_candidates_when_llm_returns_stal "# SCOPE\n\nOld approved entry: route LLM provider requests.\n", encoding="utf-8", ) + (source / "providers.py").write_text( + "provider_registry = {'openrouter': object()}\n", + encoding="utf-8", + ) store = RegistryStore(tmp_path / "registry.sqlite3") store.initialize() extractor = FakeLLMExtractor( @@ -783,9 +800,17 @@ def test_analyze_repository_keeps_deterministic_candidates_when_llm_returns_stal source_paths=["SCOPE.md"], capabilities=[ ExtractedCapability( - name="Route LLM Provider Requests", - description="Old scope reuse.", - source_paths=["SCOPE.md"], + name="Configure OpenRouter Adapter", + description="Source-linked provider adapter.", + source_paths=["providers.py"], + features=[ + ExtractedFeature( + name="OpenRouter provider registry", + type="backend", + location="providers.py", + source_paths=["providers.py"], + ) + ], ) ], ) @@ -807,7 +832,9 @@ def test_analyze_repository_keeps_deterministic_candidates_when_llm_returns_stal for ability in graph.abilities for capability in ability.capabilities } - assert "Route LLM Provider Requests" in capability_names + ability_names = {ability.name for ability in graph.abilities} + assert "Old LLM Routing" not in ability_names + assert "Configure OpenRouter Adapter" in capability_names assert "Enforce OIDC PKCE Profiles" in capability_names assert decisions[0].action == "llm_extraction_used" assert "llm+deterministic candidate generation" in decisions[0].notes