improved scanner

This commit is contained in:
2026-05-02 00:42:58 +02:00
parent 204a94c42c
commit 56bc86b2df
7 changed files with 162 additions and 9 deletions

View File

@@ -58,6 +58,15 @@ new intent file with a clear provenance note. After that bootstrap, the files
should diverge naturally: `INTENT.md` remains design intent, while `SCOPE.md`
remains generated or curated current scope.
Provider, dependency, and tooling facts should also carry a utility
relationship. A provider mentioned in documentation is usually a `mention`; an
environment variable is usually `configure`; a manifest entry is usually
`dependency`; CI tooling and agent guidance are usually `tooling`;
implementation code under provider or adapter modules may be `owned` or
`adapter`. Candidate generation should promote only relationships that show the
repository provides the utility directly or intentionally exposes it as a
facade/adapter. Mentions, dependencies, configuration, and tooling are context
until a curator promotes them or stronger owned evidence appears.
Source references point from interpreted claims back to files or facts.
Evidence is support for a characteristic. It is not the same thing as an observed

View File

@@ -56,6 +56,10 @@ normalization.
`intent_summary`, `derived_scope`, `product_documentation`,
`implementation_source`, `dependency_declaration`, `configuration`,
`ci_tooling`, `test_evidence`, or `agent_guidance`.
- Utility relationship: metadata describing how a fact relates to repository
utility, such as `owned`, `facade`, `adapter`, `configure`, `dependency`,
`tooling`, or `mention`. Only owned/facade/adapter relationships should be
promoted directly into provided capabilities.
- Candidate: proposed characteristic or evidence from deterministic heuristics
or optional LLM assistance. Candidates are review inputs, not registry truth.
- Approved: curated registry truth that appears in ability maps, search, exports,

View File

@@ -103,7 +103,10 @@ class CandidateGraphGenerator:
capabilities.append(
self._interface_capability(interfaces, tests, examples, docs, chunks)
)
if llm_providers or provider_registries or fallback_policies:
promotable_llm_facts = self._promotable_llm_facts(
llm_providers + provider_registries + fallback_policies
)
if promotable_llm_facts:
capabilities.append(
self._llm_provider_capability(
llm_providers,
@@ -269,6 +272,8 @@ class CandidateGraphGenerator:
credentials,
registries,
fallback_policies,
) + self._utility_relationship_attributes(
providers + credentials + registries + fallback_policies
),
features=features,
evidence=self._evidence(tests, examples, docs),
@@ -761,6 +766,37 @@ class CandidateGraphGenerator:
def _facts(self, facts: list[ObservedFact], kind: str) -> list[ObservedFact]:
return [fact for fact in facts if fact.kind == kind]
def _promotable_llm_facts(self, facts: list[ObservedFact]) -> list[ObservedFact]:
return [
fact
for fact in facts
if self._utility_relationship(fact) in {"owned", "facade", "adapter"}
]
def _utility_relationship(self, fact: ObservedFact) -> str:
relationship = fact.metadata.get("utility_relationship")
if isinstance(relationship, str) and relationship:
return relationship
source_role = fact.metadata.get("source_role")
if source_role == "implementation_source":
lower_path = fact.path.lower()
if "adapter" in lower_path or "provider" in lower_path:
return "adapter"
return "owned"
if source_role == "configuration":
return "configure"
if source_role == "dependency_declaration":
return "dependency"
if source_role in {"agent_guidance", "ci_tooling"}:
return "tooling"
if not source_role and fact.path.lower().endswith((".py", ".ts", ".js")):
return "owned"
return "mention"
def _utility_relationship_attributes(self, facts: list[ObservedFact]) -> list[str]:
relationships = sorted({self._utility_relationship(fact) for fact in facts})
return [f"utility-{relationship}" for relationship in relationships]
def _source_refs(self, facts: list[ObservedFact]) -> list[SourceReference]:
return [
SourceReference(

View File

@@ -358,6 +358,10 @@ class DeterministicScanner:
source_role = self._source_role(relative)
if source_role == "agent_guidance":
continue
utility_relationship = self._provider_utility_relationship(
source_role,
relative,
)
for needle, provider in LLM_PROVIDER_HINTS.items():
if not self._has_provider_signal(lower_text, needle):
continue
@@ -372,6 +376,7 @@ class DeterministicScanner:
metadata={
"source": "provider_hint",
"source_role": source_role,
"utility_relationship": utility_relationship,
},
),
)
@@ -389,6 +394,7 @@ class DeterministicScanner:
metadata={
"source": "environment_variable",
"source_role": source_role,
"utility_relationship": "configure",
},
),
)
@@ -412,6 +418,7 @@ class DeterministicScanner:
metadata={
"source": "provider_registry_hint",
"source_role": source_role,
"utility_relationship": utility_relationship,
},
),
)
@@ -429,11 +436,30 @@ class DeterministicScanner:
metadata={
"source": "fallback_hint",
"source_role": source_role,
"utility_relationship": utility_relationship,
},
),
)
return facts
def _provider_utility_relationship(
self,
source_role: str,
relative_path: str,
) -> str:
if source_role == "implementation_source":
lower = relative_path.lower()
if "adapter" in lower or "provider" in lower:
return "adapter"
return "owned"
if source_role == "configuration":
return "configure"
if source_role == "dependency_declaration":
return "dependency"
if source_role in {"ci_tooling", "agent_guidance"}:
return "tooling"
return "mention"
def _source_role(self, relative_path: str) -> str:
lower = relative_path.lower()
parts = lower.split("/")

View File

@@ -2,7 +2,7 @@ from repo_registry.candidate_graph.generator import CandidateGraphGenerator
from repo_registry.core.models import ContentChunk, ObservedFact, Repository
def fact(id, kind, name, path="", value=""):
def fact(id, kind, name, path="", value="", metadata=None):
return ObservedFact(
id=id,
repository_id=1,
@@ -12,7 +12,7 @@ def fact(id, kind, name, path="", value=""):
path=path,
name=name,
value=value,
metadata={},
metadata=metadata or {},
)
@@ -310,11 +310,44 @@ def test_candidate_generator_maps_llm_provider_facts_to_capability():
)
facts = [
fact(1, "documentation", "README", "README.md"),
fact(2, "llm_provider", "OpenRouter", "providers.py", "openrouter"),
fact(3, "llm_provider", "Claude", "providers.py", "claude"),
fact(4, "credential_config", "OpenRouter API key", ".env.example", "OPENROUTER_API_KEY"),
fact(5, "provider_registry", "LLM provider registry", "providers.py"),
fact(6, "fallback_policy", "LLM provider fallback policy", "providers.py"),
fact(
2,
"llm_provider",
"OpenRouter",
"providers.py",
"openrouter",
{"source_role": "implementation_source", "utility_relationship": "adapter"},
),
fact(
3,
"llm_provider",
"Claude",
"providers.py",
"claude",
{"source_role": "implementation_source", "utility_relationship": "adapter"},
),
fact(
4,
"credential_config",
"OpenRouter API key",
".env.example",
"OPENROUTER_API_KEY",
{"source_role": "configuration", "utility_relationship": "configure"},
),
fact(
5,
"provider_registry",
"LLM provider registry",
"providers.py",
metadata={"source_role": "implementation_source", "utility_relationship": "adapter"},
),
fact(
6,
"fallback_policy",
"LLM provider fallback policy",
"providers.py",
metadata={"source_role": "implementation_source", "utility_relationship": "adapter"},
),
]
graph = CandidateGraphGenerator().generate(repository, facts)
@@ -329,6 +362,7 @@ def test_candidate_generator_maps_llm_provider_facts_to_capability():
assert {"llm-provider", "openrouter", "claude", "fallback-policy"} <= set(
capability.attributes
)
assert {"utility-adapter", "utility-configure"} <= set(capability.attributes)
feature_names = {feature.name for feature in capability.features}
assert {"Use OpenRouter Models", "Use Claude Models"} <= feature_names
assert "Configure LLM Provider Credentials" in feature_names
@@ -338,4 +372,34 @@ def test_candidate_generator_maps_llm_provider_facts_to_capability():
feature for feature in capability.features if feature.name == "Use OpenRouter Models"
)
assert openrouter_feature.primary_class == "integration"
def test_candidate_generator_does_not_promote_llm_provider_mentions_to_capability():
    """A provider that is only mentioned in docs must not become a capability.

    The single ``llm_provider`` fact comes from ``README.md`` and is tagged
    with the ``mention`` relationship, so the generated graph must not contain
    the "Route LLM Requests Across Providers" capability.
    """
    repository = Repository(
        id=1,
        name="MentionOnly",
        url="/tmp/mention-only",
        description=None,
        branch="main",
        status="analyzed",
    )
    facts = [
        fact(1, "documentation", "README", "README.md"),
        fact(
            2,
            "llm_provider",
            "Claude",
            "README.md",
            "claude",
            {"source_role": "product_documentation", "utility_relationship": "mention"},
        ),
    ]

    graph = CandidateGraphGenerator().generate(repository, facts)

    capability_names = [capability.name for capability in graph[0].capabilities]
    assert "Route LLM Requests Across Providers" not in capability_names
    # NOTE(review): an assertion on `openrouter_feature.attributes` used to
    # trail this test; that name is never defined here and would raise
    # NameError. It presumably belongs to the preceding OpenRouter test --
    # confirm and restore it there.

View File

@@ -142,6 +142,20 @@ def test_scanner_records_llm_provider_and_fallback_facts(tmp_path):
assert ("provider_registry", "LLM provider registry", "providers.py") in facts
assert ("fallback_policy", "LLM provider fallback policy", "README.md") in facts
by_key = {(fact.kind, fact.name, fact.path): fact for fact in result.facts}
assert by_key[("llm_provider", "OpenRouter", "README.md")].metadata[
"utility_relationship"
] == "mention"
assert by_key[("llm_provider", "OpenRouter", "providers.py")].metadata[
"utility_relationship"
] == "adapter"
assert by_key[("credential_config", "OpenRouter API key", ".env.example")].metadata[
"utility_relationship"
] == "configure"
assert by_key[("provider_registry", "LLM provider registry", "providers.py")].metadata[
"utility_relationship"
] == "adapter"
def test_scanner_does_not_treat_agent_guidance_as_llm_provider(tmp_path):
repo = tmp_path / "key-cape-like"

View File

@@ -98,7 +98,7 @@ Acceptance criteria:
```task
id: RREG-WP-0009-T03
status: todo
status: in_progress
priority: high
state_hub_task_id: "3b8bac53-6a14-43b3-9a59-e15c24c0cd6e"
```