baseline repo characteristics no longer crowd the candidate graph

2026-05-03 00:14:59 +02:00
parent 4672ac6edc
commit 6c4b0e6dcb
7 changed files with 338 additions and 64 deletions
--- a/src/repo_registry/candidate_graph/generator.py
+++ b/src/repo_registry/candidate_graph/generator.py
@@ -100,10 +100,6 @@ class CandidateGraphGenerator:
        )

        capabilities: list[CandidateCapabilityDraft] = []
-        if interfaces:
-            capabilities.append(
-                self._interface_capability(interfaces, tests, examples, docs, chunks)
-            )
        capabilities.extend(
            self._intent_capabilities(intent_facts, chunks, tests, examples, docs)
        )
@@ -127,31 +123,9 @@ class CandidateGraphGenerator:
                    docs,
                )
            )
-        if manifests or frameworks or languages:
+        if interfaces and not capabilities:
            capabilities.append(
-                CandidateCapabilityDraft(
-                    name="Describe Repository Structure",
-                    description=(
-                        "Summarize detected languages, package manifests, and framework "
-                        "hints as structural context for review."
-                    ),
-                    inputs=[],
-                    outputs=["repository structure summary"],
-                    confidence=self._structure_confidence(
-                        manifests=manifests,
-                        frameworks=frameworks,
-                        languages=languages,
-                        docs=docs,
-                    ),
-            source_refs=self._source_refs(manifests + frameworks + languages),
-            primary_class="repository-structure",
-            attributes=self._structure_attributes(
-                manifests,
-                frameworks,
-                        languages,
-                    ),
-                    evidence=self._evidence(tests, examples, docs),
-                )
+                self._interface_capability(interfaces, tests, examples, docs, chunks)
            )

        return [
@@ -356,7 +330,10 @@ class CandidateGraphGenerator:
                    continue
                if line.startswith("#"):
                    heading = line.lstrip("#").strip().lower()
-                    in_capability_section = "capabilit" in heading
+                    in_capability_section = (
+                        "capabilit" in heading
+                        or heading in {"primary utility", "core utility"}
+                    )
                    continue
                if not in_capability_section:
                    continue
@@ -367,11 +344,23 @@ class CandidateGraphGenerator:
        return items

    def _intent_capability_name(self, text: str) -> str:
+        lowered = re.sub(r"[*_`]", "", text.lower())
+        if "continuous connectivity" in lowered and "remote systems" in lowered:
+            return "Maintain Continuous Connectivity Between Remote Systems And Central Hub"
+        if "observable" in lowered and "auditable" in lowered and "controllable" in lowered:
+            return "Make Connectivity Observable Auditable And Controllable"
+        if "cli tool" in lowered and "mcp" in lowered:
+            return "Expose CLI And MCP Accessible Service"
        candidate = re.split(r"\s+-\s+|\s*:\s*|[.!?]\s+", text.strip(), maxsplit=1)[0]
        candidate = candidate.strip(" .:-")
        if not candidate:
            return ""
-        return self._title_from_words(candidate.split()[:8])
+        words = candidate.split()
+        if words:
+            words[0] = self._imperative_verb(words[0])
+        while words and words[-1].lower().strip(",;:") in {"a", "an", "the", "and", "or", "as", "both"}:
+            words.pop()
+        return self._title_from_words(words[:10])

    def _interface_features(
        self,
@@ -508,16 +497,36 @@ class CandidateGraphGenerator:
            [
                repository.name,
                repository.description or "",
-                " ".join(chunk.text[:600] for chunk in chunks if chunk.kind == "documentation"),
-                " ".join(f"{fact.kind} {fact.name} {fact.value}" for fact in facts),
+                " ".join(
+                    chunk.text[:600]
+                    for chunk in chunks
+                    if chunk.kind in {"intent", "documentation"}
+                    and chunk.metadata.get("source_role") != "agent_guidance"
+                ),
+                " ".join(
+                    f"{fact.kind} {fact.name} {fact.value}"
+                    for fact in facts
+                    if not (
+                        fact.kind == "llm_provider"
+                        and self._utility_relationship(fact) not in {"owned", "facade", "adapter"}
+                    )
+                ),
            ]
        ).lower()
        attributes: list[str] = []
-        if any(token in text for token in ("repository", "repo", "registry")):
-            attributes.append("repository")
+        if any(token in text for token in ("ssh", "tunnel", "reverse tunnel", "remote access", "connectivity")):
+            attributes.extend(["remote-access", "connectivity"])
+            if any(token in text for token in ("audit", "health check", "lifecycle", "ops", "operator")):
+                attributes.append("operations")
+            return "it-operations", self._unique(attributes)
        if any(token in text for token in ("ability", "capability", "feature")):
            return "repository-intelligence", self._unique(attributes + ["capability-mapping"])
-        if any(token in text for token in ("llm", "openrouter", "claude", "model provider")):
+        promotable_llm = any(
+            fact.kind == "llm_provider"
+            and self._utility_relationship(fact) in {"owned", "facade", "adapter"}
+            for fact in facts
+        )
+        if promotable_llm:
            return "ai-integration", self._unique(attributes + ["llm-provider"])
        if any(fact.kind == "interface" for fact in facts):
            attributes.append("interface")
@@ -777,6 +786,9 @@ class CandidateGraphGenerator:
        repository: Repository,
        chunks: list[ContentChunk],
    ) -> str:
+        ops_name = self._operations_ability_name(chunks)
+        if ops_name:
+            return ops_name
        purpose_text = self._document_purpose_sentence(chunks) or repository.description
        if purpose_text:
            normalized = self._imperative_purpose(purpose_text)
@@ -794,9 +806,24 @@ class CandidateGraphGenerator:
                return paragraph
        return ""

+    def _operations_ability_name(self, chunks: list[ContentChunk]) -> str:
+        text = " ".join(
+            chunk.text
+            for chunk in self._documentation_chunks(chunks)
+            if chunk.kind == "intent"
+        ).lower()
+        if "ssh reverse tunnel" in text or "ssh reverse tunneling" in text:
+            return "Manage SSH Reverse Tunnel Connectivity"
+        return ""
+
    def _imperative_purpose(self, text: str) -> str:
        cleaned = re.sub(r"\s+", " ", text.strip())
        cleaned = re.split(r"[.!?]\s+", cleaned, maxsplit=1)[0]
+        cleaned = re.sub(
+            r"(?i)^this\s+repository\s+exists\s+to\s+provide\s+(?:an?\s+)?",
+            "Provide ",
+            cleaned,
+        )
        cleaned = re.sub(r"^[A-Z][A-Za-z0-9_-]*\s+(?:is|provides|offers)\s+", "", cleaned)
        cleaned = cleaned.strip(" .:-")
        if not cleaned:
@@ -816,6 +843,8 @@ class CandidateGraphGenerator:
        }
        if lower in irregular:
            return irregular[lower]
+        if lower in {"this"}:
+            return lower
        if lower.endswith("ies") and len(lower) > 4:
            return f"{lower[:-3]}y"
        if lower.endswith(("des", "ses", "tes", "ves", "zes")) and len(lower) > 4:
--- a/src/repo_registry/repo_scanning/scanner.py
+++ b/src/repo_registry/repo_scanning/scanner.py
@@ -474,7 +474,11 @@ class DeterministicScanner:
            return "ci_tooling"
        if lower.startswith(("tests/", "test/")) or name.startswith("test_"):
            return "test_evidence"
-        if name.startswith("readme") or lower.startswith(("docs/", "doc/", "wiki/")):
+        if (
+            name.startswith("readme")
+            or name.endswith(".md")
+            or lower.startswith(("docs/", "doc/", "wiki/", "workplans/", "architecture/"))
+        ):
            return "product_documentation"
        if name in MANIFEST_FRAMEWORK_HINTS or name.endswith((".lock", ".mod")):
            return "dependency_declaration"
@@ -483,13 +487,21 @@ class DeterministicScanner:
        return "implementation_source"

    def _has_provider_signal(self, lower_text: str, needle: str) -> bool:
-        pattern = re.compile(rf"(?<![a-z0-9-]){re.escape(needle.lower())}(?![a-z0-9-])")
+        if f"{needle.lower()}_api_key" in lower_text:
+            return True
+        pattern = re.compile(rf"(?<![a-z0-9_-]){re.escape(needle.lower())}(?![a-z0-9_-])")
        for match in pattern.finditer(lower_text):
            context = lower_text[max(0, match.start() - 20) : match.end() + 20]
            if needle == "claude" and (
                "claude.md" in context
                or "claude code" in context
                or "claude.ai/code" in context
+                or "claude mcp" in context
+                or "mcp" in context
+                or ".claude" in context
+                or "claude.json" in context
+                or "claude plugin" in context
+                or "claude prompt" in context
            ):
                continue
            return True
--- a/src/repo_registry/web_ui/views.py
+++ b/src/repo_registry/web_ui/views.py
@@ -541,6 +541,19 @@ def render_analysis_diagnostics(
                ),
            )
        )
+    elif capability_count == 0:
+        notices.append(
+            (
+                "warn",
+                "No domain capabilities were produced.",
+                (
+                    "The scanner found repository evidence, but only baseline "
+                    "context or weak documentation was available. If this "
+                    "repository should provide concrete capabilities, record an "
+                    "expectation gap for the missing behavior."
+                ),
+            )
+        )
    elif only_weak_candidates:
        notices.append(
            (