baseline repo characteristics no longer crowd the candidate graph

This commit is contained in:
2026-05-03 00:14:59 +02:00
parent 4672ac6edc
commit 6c4b0e6dcb
7 changed files with 338 additions and 64 deletions

View File

@@ -100,10 +100,6 @@ class CandidateGraphGenerator:
)
capabilities: list[CandidateCapabilityDraft] = []
if interfaces:
capabilities.append(
self._interface_capability(interfaces, tests, examples, docs, chunks)
)
capabilities.extend(
self._intent_capabilities(intent_facts, chunks, tests, examples, docs)
)
@@ -127,31 +123,9 @@ class CandidateGraphGenerator:
docs,
)
)
if manifests or frameworks or languages:
if interfaces and not capabilities:
capabilities.append(
CandidateCapabilityDraft(
name="Describe Repository Structure",
description=(
"Summarize detected languages, package manifests, and framework "
"hints as structural context for review."
),
inputs=[],
outputs=["repository structure summary"],
confidence=self._structure_confidence(
manifests=manifests,
frameworks=frameworks,
languages=languages,
docs=docs,
),
source_refs=self._source_refs(manifests + frameworks + languages),
primary_class="repository-structure",
attributes=self._structure_attributes(
manifests,
frameworks,
languages,
),
evidence=self._evidence(tests, examples, docs),
)
self._interface_capability(interfaces, tests, examples, docs, chunks)
)
return [
@@ -356,7 +330,10 @@ class CandidateGraphGenerator:
continue
if line.startswith("#"):
heading = line.lstrip("#").strip().lower()
in_capability_section = "capabilit" in heading
in_capability_section = (
"capabilit" in heading
or heading in {"primary utility", "core utility"}
)
continue
if not in_capability_section:
continue
@@ -367,11 +344,23 @@ class CandidateGraphGenerator:
return items
def _intent_capability_name(self, text: str) -> str:
lowered = re.sub(r"[*_`]", "", text.lower())
if "continuous connectivity" in lowered and "remote systems" in lowered:
return "Maintain Continuous Connectivity Between Remote Systems And Central Hub"
if "observable" in lowered and "auditable" in lowered and "controllable" in lowered:
return "Make Connectivity Observable Auditable And Controllable"
if "cli tool" in lowered and "mcp" in lowered:
return "Expose CLI And MCP Accessible Service"
candidate = re.split(r"\s+-\s+|\s*:\s*|[.!?]\s+", text.strip(), maxsplit=1)[0]
candidate = candidate.strip(" .:-")
if not candidate:
return ""
return self._title_from_words(candidate.split()[:8])
words = candidate.split()
if words:
words[0] = self._imperative_verb(words[0])
while words and words[-1].lower().strip(",;:") in {"a", "an", "the", "and", "or", "as", "both"}:
words.pop()
return self._title_from_words(words[:10])
def _interface_features(
self,
@@ -508,16 +497,36 @@ class CandidateGraphGenerator:
[
repository.name,
repository.description or "",
" ".join(chunk.text[:600] for chunk in chunks if chunk.kind == "documentation"),
" ".join(f"{fact.kind} {fact.name} {fact.value}" for fact in facts),
" ".join(
chunk.text[:600]
for chunk in chunks
if chunk.kind in {"intent", "documentation"}
and chunk.metadata.get("source_role") != "agent_guidance"
),
" ".join(
f"{fact.kind} {fact.name} {fact.value}"
for fact in facts
if not (
fact.kind == "llm_provider"
and self._utility_relationship(fact) not in {"owned", "facade", "adapter"}
)
),
]
).lower()
attributes: list[str] = []
if any(token in text for token in ("repository", "repo", "registry")):
attributes.append("repository")
if any(token in text for token in ("ssh", "tunnel", "reverse tunnel", "remote access", "connectivity")):
attributes.extend(["remote-access", "connectivity"])
if any(token in text for token in ("audit", "health check", "lifecycle", "ops", "operator")):
attributes.append("operations")
return "it-operations", self._unique(attributes)
if any(token in text for token in ("ability", "capability", "feature")):
return "repository-intelligence", self._unique(attributes + ["capability-mapping"])
if any(token in text for token in ("llm", "openrouter", "claude", "model provider")):
promotable_llm = any(
fact.kind == "llm_provider"
and self._utility_relationship(fact) in {"owned", "facade", "adapter"}
for fact in facts
)
if promotable_llm:
return "ai-integration", self._unique(attributes + ["llm-provider"])
if any(fact.kind == "interface" for fact in facts):
attributes.append("interface")
@@ -777,6 +786,9 @@ class CandidateGraphGenerator:
repository: Repository,
chunks: list[ContentChunk],
) -> str:
ops_name = self._operations_ability_name(chunks)
if ops_name:
return ops_name
purpose_text = self._document_purpose_sentence(chunks) or repository.description
if purpose_text:
normalized = self._imperative_purpose(purpose_text)
@@ -794,9 +806,24 @@ class CandidateGraphGenerator:
return paragraph
return ""
def _operations_ability_name(self, chunks: list[ContentChunk]) -> str:
text = " ".join(
chunk.text
for chunk in self._documentation_chunks(chunks)
if chunk.kind == "intent"
).lower()
if "ssh reverse tunnel" in text or "ssh reverse tunneling" in text:
return "Manage SSH Reverse Tunnel Connectivity"
return ""
def _imperative_purpose(self, text: str) -> str:
cleaned = re.sub(r"\s+", " ", text.strip())
cleaned = re.split(r"[.!?]\s+", cleaned, maxsplit=1)[0]
cleaned = re.sub(
r"(?i)^this\s+repository\s+exists\s+to\s+provide\s+(?:an?\s+)?",
"Provide ",
cleaned,
)
cleaned = re.sub(r"^[A-Z][A-Za-z0-9_-]*\s+(?:is|provides|offers)\s+", "", cleaned)
cleaned = cleaned.strip(" .:-")
if not cleaned:
@@ -816,6 +843,8 @@ class CandidateGraphGenerator:
}
if lower in irregular:
return irregular[lower]
if lower in {"this"}:
return lower
if lower.endswith("ies") and len(lower) > 4:
return f"{lower[:-3]}y"
if lower.endswith(("des", "ses", "tes", "ves", "zes")) and len(lower) > 4:

View File

@@ -474,7 +474,11 @@ class DeterministicScanner:
return "ci_tooling"
if lower.startswith(("tests/", "test/")) or name.startswith("test_"):
return "test_evidence"
if name.startswith("readme") or lower.startswith(("docs/", "doc/", "wiki/")):
if (
name.startswith("readme")
or name.endswith(".md")
or lower.startswith(("docs/", "doc/", "wiki/", "workplans/", "architecture/"))
):
return "product_documentation"
if name in MANIFEST_FRAMEWORK_HINTS or name.endswith((".lock", ".mod")):
return "dependency_declaration"
@@ -483,13 +487,21 @@ class DeterministicScanner:
return "implementation_source"
def _has_provider_signal(self, lower_text: str, needle: str) -> bool:
pattern = re.compile(rf"(?<![a-z0-9-]){re.escape(needle.lower())}(?![a-z0-9-])")
if f"{needle.lower()}_api_key" in lower_text:
return True
pattern = re.compile(rf"(?<![a-z0-9_-]){re.escape(needle.lower())}(?![a-z0-9_-])")
for match in pattern.finditer(lower_text):
context = lower_text[max(0, match.start() - 20) : match.end() + 20]
if needle == "claude" and (
"claude.md" in context
or "claude code" in context
or "claude.ai/code" in context
or "claude mcp" in context
or "mcp" in context
or ".claude" in context
or "claude.json" in context
or "claude plugin" in context
or "claude prompt" in context
):
continue
return True

View File

@@ -541,6 +541,19 @@ def render_analysis_diagnostics(
),
)
)
elif capability_count == 0:
notices.append(
(
"warn",
"No domain capabilities were produced.",
(
"The scanner found repository evidence, but only baseline "
"context or weak documentation was available. If this "
"repository should provide concrete capabilities, record an "
"expectation gap for the missing behavior."
),
)
)
elif only_weak_candidates:
notices.append(
(