baseline repo characteristics no longer crowd the candidate graph

This commit is contained in:
2026-05-03 00:14:59 +02:00
parent 4672ac6edc
commit 6c4b0e6dcb
7 changed files with 338 additions and 64 deletions

View File

@@ -100,10 +100,6 @@ class CandidateGraphGenerator:
)
capabilities: list[CandidateCapabilityDraft] = []
if interfaces:
capabilities.append(
self._interface_capability(interfaces, tests, examples, docs, chunks)
)
capabilities.extend(
self._intent_capabilities(intent_facts, chunks, tests, examples, docs)
)
@@ -127,31 +123,9 @@ class CandidateGraphGenerator:
docs,
)
)
if manifests or frameworks or languages:
if interfaces and not capabilities:
capabilities.append(
CandidateCapabilityDraft(
name="Describe Repository Structure",
description=(
"Summarize detected languages, package manifests, and framework "
"hints as structural context for review."
),
inputs=[],
outputs=["repository structure summary"],
confidence=self._structure_confidence(
manifests=manifests,
frameworks=frameworks,
languages=languages,
docs=docs,
),
source_refs=self._source_refs(manifests + frameworks + languages),
primary_class="repository-structure",
attributes=self._structure_attributes(
manifests,
frameworks,
languages,
),
evidence=self._evidence(tests, examples, docs),
)
self._interface_capability(interfaces, tests, examples, docs, chunks)
)
return [
@@ -356,7 +330,10 @@ class CandidateGraphGenerator:
continue
if line.startswith("#"):
heading = line.lstrip("#").strip().lower()
in_capability_section = "capabilit" in heading
in_capability_section = (
"capabilit" in heading
or heading in {"primary utility", "core utility"}
)
continue
if not in_capability_section:
continue
@@ -367,11 +344,23 @@ class CandidateGraphGenerator:
return items
def _intent_capability_name(self, text: str) -> str:
    """Derive a title-cased capability name from one intent bullet or sentence.

    Three hard-coded phrase combinations map to fixed names. Anything else is
    reduced to its leading clause (cut at " - ", ":" or the first sentence
    end), the first word is passed through ``self._imperative_verb``,
    dangling filler words are dropped from the end, and at most ten words are
    handed to ``self._title_from_words``.

    Args:
        text: Raw intent text; may contain markdown emphasis markers.

    Returns:
        The capability name, or ``""`` when no usable clause remains.
    """
    # Strip markdown emphasis/code markers so phrase checks see plain words.
    lowered = re.sub(r"[*_`]", "", text.lower())
    if "continuous connectivity" in lowered and "remote systems" in lowered:
        return "Maintain Continuous Connectivity Between Remote Systems And Central Hub"
    if "observable" in lowered and "auditable" in lowered and "controllable" in lowered:
        return "Make Connectivity Observable Auditable And Controllable"
    if "cli tool" in lowered and "mcp" in lowered:
        return "Expose CLI And MCP Accessible Service"

    # Keep only the leading clause of the original (non-lowercased) text.
    candidate = re.split(r"\s+-\s+|\s*:\s*|[.!?]\s+", text.strip(), maxsplit=1)[0]
    candidate = candidate.strip(" .:-")
    if not candidate:
        return ""

    # NOTE: an unconditional early return used to sit here, which made the
    # verb normalisation and trailing-word trimming below unreachable.
    words = candidate.split()
    if words:
        words[0] = self._imperative_verb(words[0])
    trailing_fillers = {"a", "an", "the", "and", "or", "as", "both"}
    # A name must not end on an article/conjunction left over from the cut.
    while words and words[-1].lower().strip(",;:") in trailing_fillers:
        words.pop()
    return self._title_from_words(words[:10])
def _interface_features(
self,
@@ -508,16 +497,36 @@ class CandidateGraphGenerator:
[
repository.name,
repository.description or "",
" ".join(chunk.text[:600] for chunk in chunks if chunk.kind == "documentation"),
" ".join(f"{fact.kind} {fact.name} {fact.value}" for fact in facts),
" ".join(
chunk.text[:600]
for chunk in chunks
if chunk.kind in {"intent", "documentation"}
and chunk.metadata.get("source_role") != "agent_guidance"
),
" ".join(
f"{fact.kind} {fact.name} {fact.value}"
for fact in facts
if not (
fact.kind == "llm_provider"
and self._utility_relationship(fact) not in {"owned", "facade", "adapter"}
)
),
]
).lower()
attributes: list[str] = []
if any(token in text for token in ("repository", "repo", "registry")):
attributes.append("repository")
if any(token in text for token in ("ssh", "tunnel", "reverse tunnel", "remote access", "connectivity")):
attributes.extend(["remote-access", "connectivity"])
if any(token in text for token in ("audit", "health check", "lifecycle", "ops", "operator")):
attributes.append("operations")
return "it-operations", self._unique(attributes)
if any(token in text for token in ("ability", "capability", "feature")):
return "repository-intelligence", self._unique(attributes + ["capability-mapping"])
if any(token in text for token in ("llm", "openrouter", "claude", "model provider")):
promotable_llm = any(
fact.kind == "llm_provider"
and self._utility_relationship(fact) in {"owned", "facade", "adapter"}
for fact in facts
)
if promotable_llm:
return "ai-integration", self._unique(attributes + ["llm-provider"])
if any(fact.kind == "interface" for fact in facts):
attributes.append("interface")
@@ -777,6 +786,9 @@ class CandidateGraphGenerator:
repository: Repository,
chunks: list[ContentChunk],
) -> str:
ops_name = self._operations_ability_name(chunks)
if ops_name:
return ops_name
purpose_text = self._document_purpose_sentence(chunks) or repository.description
if purpose_text:
normalized = self._imperative_purpose(purpose_text)
@@ -794,9 +806,24 @@ class CandidateGraphGenerator:
return paragraph
return ""
def _operations_ability_name(self, chunks: list[ContentChunk]) -> str:
    """Return a fixed operations ability name when intent docs mention SSH reverse tunnels.

    Only chunks returned by ``self._documentation_chunks(chunks)`` whose
    ``kind`` is ``"intent"`` are inspected.

    Args:
        chunks: Content chunks gathered for the repository.

    Returns:
        ``"Manage SSH Reverse Tunnel Connectivity"`` when the phrase
        "ssh reverse tunnel" occurs (case-insensitively) in the intent text,
        otherwise ``""``.
    """
    intent_text = " ".join(
        chunk.text
        for chunk in self._documentation_chunks(chunks)
        if chunk.kind == "intent"
    ).lower()
    # Collapse runs of whitespace so a phrase broken by a hard line wrap
    # ("ssh reverse\ntunnel") is still recognised.
    intent_text = " ".join(intent_text.split())
    # "ssh reverse tunneling" contains "ssh reverse tunnel", so a single
    # substring check covers both spellings.
    if "ssh reverse tunnel" in intent_text:
        return "Manage SSH Reverse Tunnel Connectivity"
    return ""
def _imperative_purpose(self, text: str) -> str:
cleaned = re.sub(r"\s+", " ", text.strip())
cleaned = re.split(r"[.!?]\s+", cleaned, maxsplit=1)[0]
cleaned = re.sub(
r"(?i)^this\s+repository\s+exists\s+to\s+provide\s+(?:an?\s+)?",
"Provide ",
cleaned,
)
cleaned = re.sub(r"^[A-Z][A-Za-z0-9_-]*\s+(?:is|provides|offers)\s+", "", cleaned)
cleaned = cleaned.strip(" .:-")
if not cleaned:
@@ -816,6 +843,8 @@ class CandidateGraphGenerator:
}
if lower in irregular:
return irregular[lower]
if lower in {"this"}:
return lower
if lower.endswith("ies") and len(lower) > 4:
return f"{lower[:-3]}y"
if lower.endswith(("des", "ses", "tes", "ves", "zes")) and len(lower) > 4:

View File

@@ -474,7 +474,11 @@ class DeterministicScanner:
return "ci_tooling"
if lower.startswith(("tests/", "test/")) or name.startswith("test_"):
return "test_evidence"
if name.startswith("readme") or lower.startswith(("docs/", "doc/", "wiki/")):
if (
name.startswith("readme")
or name.endswith(".md")
or lower.startswith(("docs/", "doc/", "wiki/", "workplans/", "architecture/"))
):
return "product_documentation"
if name in MANIFEST_FRAMEWORK_HINTS or name.endswith((".lock", ".mod")):
return "dependency_declaration"
@@ -483,13 +487,21 @@ class DeterministicScanner:
return "implementation_source"
def _has_provider_signal(self, lower_text: str, needle: str) -> bool:
pattern = re.compile(rf"(?<![a-z0-9-]){re.escape(needle.lower())}(?![a-z0-9-])")
if f"{needle.lower()}_api_key" in lower_text:
return True
pattern = re.compile(rf"(?<![a-z0-9_-]){re.escape(needle.lower())}(?![a-z0-9_-])")
for match in pattern.finditer(lower_text):
context = lower_text[max(0, match.start() - 20) : match.end() + 20]
if needle == "claude" and (
"claude.md" in context
or "claude code" in context
or "claude.ai/code" in context
or "claude mcp" in context
or "mcp" in context
or ".claude" in context
or "claude.json" in context
or "claude plugin" in context
or "claude prompt" in context
):
continue
return True

View File

@@ -541,6 +541,19 @@ def render_analysis_diagnostics(
),
)
)
elif capability_count == 0:
notices.append(
(
"warn",
"No domain capabilities were produced.",
(
"The scanner found repository evidence, but only baseline "
"context or weak documentation was available. If this "
"repository should provide concrete capabilities, record an "
"expectation gap for the missing behavior."
),
)
)
elif only_weak_candidates:
notices.append(
(

View File

@@ -163,3 +163,62 @@ def write_dependency_only_repo(root: Path) -> Path:
encoding="utf-8",
)
return repo
def write_ops_bridge_like_repo(root: Path) -> Path:
    """Create the ``ops-bridge-like`` fixture repository under *root*.

    The fixture contains intent, scope and readme documents describing an SSH
    reverse tunnel manager, a ``pyproject.toml`` manifest, a Typer CLI module,
    and a script plus a workplan that both mention Claude/MCP.

    Args:
        root: Existing directory in which the repository folder is created.

    Returns:
        Path of the newly created repository directory.
    """
    repo = root / "ops-bridge-like"
    repo.mkdir()

    intent_text = (
        "# INTENT\n\n"
        "## Purpose\n\n"
        "This repository exists to provide a **reliable, inspectable, and controllable "
        "connectivity layer** between distributed dev, build, test and execution "
        "environments for dev and ops personal human and agentic.\n\n"
        "## Primary Utility\n\n"
        "The repository provides a **managed SSH reverse tunneling system** that:\n\n"
        "* Maintains continuous connectivity between remote systems and a central hub\n"
        "* Makes connectivity **observable, auditable, and controllable**\n"
        "* Exposes this capability as both a **CLI tool and an MCP-accessible service**\n\n"
        "## Intended Users\n\n"
        "* Human operators managing infrastructure and connectivity\n"
        "* LLM-based agents requiring stable access to local services\n"
    )
    readme_text = (
        "# Ops Bridge\n\n"
        "Manages named SSH reverse tunnels with auto-reconnect, health checks, "
        "audit logging, and an MCP server so Claude Code can start and inspect tunnels.\n"
    )
    register_script = (
        '"""Register the ops-bridge MCP server with Claude MCP."""\n'
        "from pathlib import Path\n"
        "CLAUDE_JSON = Path.home() / '.claude.json'\n"
        "def main():\n"
        " return CLAUDE_JSON.exists()\n"
    )
    cli_module = (
        "import typer\n"
        "app = typer.Typer()\n"
        "@app.command()\n"
        "def up(name: str):\n"
        " return name\n"
    )

    # Relative path -> file content; sub-directories are created on demand.
    fixture_files = {
        "INTENT.md": intent_text,
        "SCOPE.md": (
            "# SCOPE\n\nSSH reverse tunnel lifecycle manager for remote execution environments.\n"
        ),
        "README.txt": readme_text,
        "pyproject.toml": "[project]\ndependencies = ['typer', 'pytest']\n",
        "scripts/register_mcp.py": register_script,
        "bridge/cli.py": cli_module,
        "workplans/BRIDGE-WP-0003.md": (
            "# MCP skill work\n\nClaude Code sessions can call bridge_status().\n"
        ),
    }
    for relative_name, content in fixture_files.items():
        destination = repo / relative_name
        destination.parent.mkdir(exist_ok=True)
        destination.write_text(content, encoding="utf-8")
    return repo

View File

@@ -68,12 +68,7 @@ def test_candidate_generator_builds_purpose_seed_from_observed_facts():
assert interface_capability.features[0].name == "POST /classify"
assert interface_capability.features[0].location == "app.py"
assert interface_capability.evidence[0].strength == "strong"
structure_capability = ability.capabilities[1]
assert structure_capability.name == "Describe Repository Structure"
assert {
"utility-dependency",
"review-required-structural-context",
} <= set(structure_capability.attributes)
assert len(ability.capabilities) == 1
def test_candidate_generator_extracts_intended_capability_blocks_from_intent_chunks():
@@ -265,7 +260,7 @@ def test_candidate_confidence_scoring_increases_with_supporting_facts():
assert graph[0].confidence == 1.0
assert graph[0].capabilities[0].confidence == 0.85
assert graph[0].capabilities[1].confidence == 0.75
assert len(graph[0].capabilities) == 1
def test_candidate_generator_names_cli_features_from_nearby_function():

View File

@@ -15,6 +15,7 @@ from tests.fixtures import (
write_javascript_typescript_package_repo,
write_key_cape_like_repo,
write_llm_connect_like_repo,
write_ops_bridge_like_repo,
write_misleading_docs_repo,
write_python_cli_repo,
write_readme_only_repo,
@@ -27,6 +28,32 @@ def make_service(tmp_path):
return RegistryService(store, ingestion=GitIngestionService(tmp_path / "checkouts"))
def add_candidate_capability(service, repository_id, analysis_run_id, ability_id, name):
    """Insert a placeholder candidate capability row and return its row id.

    The row is written through ``service.store.connect()`` with fixed
    description, class, attributes and confidence values; only the ids and
    the name come from the caller.
    """
    row_values = (
        repository_id,
        analysis_run_id,
        ability_id,
        name,
        "Review target capability inserted for review workflow tests.",
        "[]",
        "[]",
        "test-capability",
        json.dumps(["test-review-target"]),
        0.5,
        "[]",
    )
    with service.store.connect() as connection:
        inserted = connection.execute(
            """
            INSERT INTO candidate_capabilities
            (repository_id, analysis_run_id, ability_id, name, description,
            inputs, outputs, primary_class, attributes, confidence, source_refs)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            row_values,
        )
        new_row_id = int(inserted.lastrowid)
    return new_row_id
class FakeLLMExtractor:
def __init__(self, abilities):
self.abilities = abilities
@@ -298,12 +325,12 @@ def test_search_filters_by_status_language_and_framework(tmp_path):
service.approve_candidate_graph(repository.id, summary.analysis_run.id)
results = service.search(
"repository",
"health",
status="indexed",
language="Python",
framework="FastAPI",
ability="Support Filterable",
capability="Repository Structure",
capability="Repository Interface",
)
wrong_language_results = service.search(
"repository",
@@ -380,7 +407,6 @@ def test_fixture_breadth_javascript_typescript_package_extracts_structure_and_ap
assert ("framework", "React", "package.json") in fact_names
assert ("framework", "Vitest", "package.json") in fact_names
assert "Expose Repository Interface" in capability_names
assert "Describe Repository Structure" in capability_names
assert "API" in feature_types
assert service.ability_map(repository.id).abilities == []
@@ -468,10 +494,36 @@ def test_regression_dependency_only_repo_keeps_libraries_as_context(tmp_path):
for capability in ability.capabilities
}
assert "Route LLM Requests Across Providers" not in capability_names
assert capability_names == {"Describe Repository Structure"}
structure = graph.abilities[0].capabilities[0]
assert "utility-dependency" in structure.attributes
assert "review-required-structural-context" in structure.attributes
assert capability_names == set()
assert any(fact.kind == "manifest" for fact in summary.facts)
def test_regression_ops_bridge_like_repo_is_it_operations_not_llm_provider(tmp_path):
    """The ops-bridge-like fixture is classified as IT operations, not as an LLM provider."""
    source = write_ops_bridge_like_repo(tmp_path)
    service = make_service(tmp_path)
    repository = service.register_repository(name="Ops Bridge Marketing Name", url=str(source))
    summary = service.analyze_repository(repository.id, use_llm_assistance=False)
    graph = service.candidate_graph(repository.id, summary.analysis_run.id)

    ability = graph.abilities[0]
    capability_names = set()
    for candidate_ability in graph.abilities:
        for capability in candidate_ability.capabilities:
            capability_names.add(capability.name)
    facts = {(fact.kind, fact.name, fact.path) for fact in summary.facts}

    # Ability naming and classification come from the intent documents.
    assert ability.name == "Manage SSH Reverse Tunnel Connectivity"
    assert ability.primary_class == "it-operations"
    assert {"remote-access", "connectivity", "operations"} <= set(ability.attributes)
    for unwanted_attribute in ("repository", "llm-provider"):
        assert unwanted_attribute not in ability.attributes

    # Capabilities mirror the "Primary Utility" bullets, not an LLM router.
    assert "Route LLM Requests Across Providers" not in capability_names
    for expected_name in (
        "Maintain Continuous Connectivity Between Remote Systems And Central Hub",
        "Make Connectivity Observable Auditable And Controllable",
        "Expose CLI And MCP Accessible Service",
    ):
        assert expected_name in capability_names

    # Claude mentions in tooling/workplan files must not become provider facts.
    for claude_path in ("scripts/register_mcp.py", "workplans/BRIDGE-WP-0003.md"):
        assert ("llm_provider", "Claude", claude_path) not in facts
def test_fixture_breadth_empty_repo_produces_no_candidate_claims(tmp_path):
@@ -933,20 +985,14 @@ def test_analyze_repository_can_trusted_auto_approve_candidates(tmp_path):
for capability in graph.abilities[0].capabilities
}
assert statuses_by_capability["Expose Repository Interface"] == "approved"
assert statuses_by_capability["Describe Repository Structure"] == "candidate"
assert ability_map.abilities[0].name == "Report Health Over HTTP"
assert decisions[0].action == "trusted_auto_approve_candidate_graph"
assert "deterministic candidate generation" in decisions[0].notes
assert "Auto-approved 1 safe candidate capability(s); left 1 for review." in decisions[0].notes
assert "Auto-approved 1 safe candidate capability(s); left 0 for review." in decisions[0].notes
assert (
"Approved: Expose Repository Interface: owned interface with sufficient confidence."
in decisions[0].notes
)
assert (
"Skipped: Describe Repository Structure: structural/dependency context "
"requires curator review."
in decisions[0].notes
)
def test_rebuild_characteristics_dry_run_preserves_approved_map(tmp_path):
@@ -1523,6 +1569,14 @@ def test_relink_candidate_feature_and_evidence_to_another_capability(tmp_path):
repository = service.register_repository(name="Relink Leaves", url=str(source))
summary = service.analyze_repository(repository.id)
graph = service.candidate_graph(repository.id, summary.analysis_run.id)
add_candidate_capability(
service,
repository.id,
summary.analysis_run.id,
graph.abilities[0].id,
"Review Target Capability",
)
graph = service.candidate_graph(repository.id, summary.analysis_run.id)
source_capability = graph.abilities[0].capabilities[0]
target_capability = graph.abilities[0].capabilities[1]
feature = source_capability.features[0]
@@ -1625,6 +1679,14 @@ def test_merge_candidate_capability_moves_children_to_target(tmp_path):
repository = service.register_repository(name="Merge Capability", url=str(source))
summary = service.analyze_repository(repository.id)
graph = service.candidate_graph(repository.id, summary.analysis_run.id)
add_candidate_capability(
service,
repository.id,
summary.analysis_run.id,
graph.abilities[0].id,
"Review Target Capability",
)
graph = service.candidate_graph(repository.id, summary.analysis_run.id)
source_capability = graph.abilities[0].capabilities[0]
target_capability = graph.abilities[0].capabilities[1]

View File

@@ -1,9 +1,47 @@
import json
import sqlite3
from fastapi.testclient import TestClient
from repo_registry.web_api import app as app_module
from repo_registry.web_api.app import Settings, app, get_service, get_settings
def add_candidate_capability(database_path, repository_id, analysis_run_id, name):
    """Insert a placeholder candidate capability under the run's first ability.

    The lowest-id row of ``candidate_abilities`` for the given repository and
    analysis run is used as the parent ability.

    Args:
        database_path: Path of the SQLite database file to modify.
        repository_id: Repository the capability belongs to.
        analysis_run_id: Analysis run the capability belongs to.
        name: Name stored for the new capability.

    Returns:
        The row id of the inserted ``candidate_capabilities`` row.

    Raises:
        LookupError: If the run has no candidate ability to attach to.
    """
    connection = sqlite3.connect(database_path)
    try:
        # ``with connection`` only scopes the transaction (commit on success,
        # rollback on error); sqlite3 does not close the connection on exit,
        # so it is closed explicitly in ``finally``.
        with connection:
            ability_row = connection.execute(
                """
                SELECT id FROM candidate_abilities
                WHERE repository_id = ? AND analysis_run_id = ?
                ORDER BY id
                LIMIT 1
                """,
                (repository_id, analysis_run_id),
            ).fetchone()
            if ability_row is None:
                raise LookupError(
                    f"no candidate ability for repository {repository_id!r} "
                    f"and analysis run {analysis_run_id!r}"
                )
            ability_id = ability_row[0]
            cursor = connection.execute(
                """
                INSERT INTO candidate_capabilities
                (repository_id, analysis_run_id, ability_id, name, description,
                inputs, outputs, primary_class, attributes, confidence, source_refs)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    repository_id,
                    analysis_run_id,
                    ability_id,
                    name,
                    "Review target capability inserted for API review workflow tests.",
                    "[]",
                    "[]",
                    "test-capability",
                    json.dumps(["test-review-target"]),
                    0.5,
                    "[]",
                ),
            )
            return int(cursor.lastrowid)
    finally:
        connection.close()
def test_openapi_groups_agent_facing_endpoints():
client = TestClient(app)
@@ -875,10 +913,11 @@ def test_api_analysis_run_loop(tmp_path):
'{"dependencies":{"react":"latest","vite":"latest"}}',
encoding="utf-8",
)
database_path = str(tmp_path / "api-analysis.sqlite3")
def override_settings():
return Settings(
database_path=str(tmp_path / "api-analysis.sqlite3"),
database_path=database_path,
checkout_root=str(tmp_path / "api-checkouts"),
)
@@ -903,6 +942,12 @@ def test_api_analysis_run_loop(tmp_path):
assert get_run_response.status_code == 200
assert get_run_response.json()["id"] == run["analysis_run"]["id"]
add_candidate_capability(
database_path,
repository_id,
run["analysis_run"]["id"],
"Describe Frontend Stack",
)
candidate_response = client.get(
f"/repos/{repository_id}/analysis-runs/"
f"{run['analysis_run']['id']}/candidate-graph"
@@ -954,6 +999,12 @@ def test_api_analysis_run_loop(tmp_path):
run_response = client.post(f"/repos/{repository_id}/analysis-runs", json={})
assert run_response.status_code == 201
run = run_response.json()
add_candidate_capability(
database_path,
repository_id,
run["analysis_run"]["id"],
"Describe Frontend Stack",
)
candidate_response = client.get(
f"/repos/{repository_id}/analysis-runs/"
f"{run['analysis_run']['id']}/candidate-graph"
@@ -1358,7 +1409,7 @@ def test_ui_register_analyze_and_approve_loop(tmp_path):
assert "Analysis completed with reviewable results." in run_detail.text
assert "Candidate Graph" in run_detail.text
assert "1 abilities" in run_detail.text
assert "2 capabilities" in run_detail.text
assert "1 capabilities" in run_detail.text
assert "2 features" in run_detail.text
assert "8 facts" in run_detail.text
assert "Content Chunks" in run_detail.text
@@ -1426,11 +1477,11 @@ def test_ui_register_analyze_and_approve_loop(tmp_path):
assert "1 scope" in approved_detail.text
assert "supports" in approved_detail.text
assert "1 abilities" in approved_detail.text
assert "2 capabilities" in approved_detail.text
assert "1 capabilities" in approved_detail.text
assert "2 features" in approved_detail.text
assert "Latest Candidate Graph" in approved_detail.text
assert "1 candidate abilities" in approved_detail.text
assert "2 candidate capabilities" in approved_detail.text
assert "1 candidate capabilities" in approved_detail.text
assert "2 candidate features" in approved_detail.text
assert "8 candidate facts" in approved_detail.text
assert "Use Approved Registry" in approved_detail.text
@@ -1801,6 +1852,44 @@ def test_ui_analysis_run_diagnostics_explain_failures_and_empty_results(tmp_path
app.dependency_overrides.clear()
def test_ui_analysis_run_diagnostics_warn_when_only_baseline_context_exists(tmp_path):
    """A README-plus-requirements repo renders the "no domain capabilities" warning."""
    source = tmp_path / "dependency-only-ui"
    source.mkdir()
    seed_files = {
        "README.md": "# Dependency Only\nUses libraries.\n",
        "requirements.txt": "fastapi\npytest\n",
    }
    for file_name, content in seed_files.items():
        (source / file_name).write_text(content, encoding="utf-8")

    def isolated_settings():
        # Point the app at a throwaway database and checkout root for this test.
        return Settings(
            database_path=str(tmp_path / "ui-baseline-diagnostics.sqlite3"),
            checkout_root=str(tmp_path / "ui-baseline-diagnostics-checkouts"),
        )

    app.dependency_overrides[get_settings] = isolated_settings
    client = TestClient(app)
    try:
        registration = client.post(
            "/repos",
            json={
                "url": str(source),
                "name": "Dependency Only UI",
                "description": "Used for baseline diagnostics.",
            },
        )
        repository = registration.json()
        analysis = client.post(
            f"/ui/repos/{repository['id']}/analysis-runs",
            data={"source_path": "", "use_llm_assistance": ""},
            follow_redirects=False,
        )
        # The UI answers with a redirect; follow it manually to the run page.
        detail = client.get(analysis.headers["location"])

        assert detail.status_code == 200
        for expected_text in (
            "No domain capabilities were produced.",
            "only baseline context or weak documentation was available",
        ):
            assert expected_text in detail.text
    finally:
        # Always drop the override so later tests see the default settings.
        app.dependency_overrides.clear()
def test_ui_register_and_explore_lands_on_analysis_result(tmp_path):
source = tmp_path / "explore-repo"
source.mkdir()
@@ -2550,9 +2639,11 @@ def test_api_relinks_candidate_feature_and_evidence(tmp_path):
" return {}\n",
encoding="utf-8",
)
database_path = str(tmp_path / "api-relink.sqlite3")
def override_settings():
return Settings(
database_path=str(tmp_path / "api-relink.sqlite3"),
database_path=database_path,
checkout_root=str(tmp_path / "api-relink-checkouts"),
)
@@ -2566,6 +2657,12 @@ def test_api_relinks_candidate_feature_and_evidence(tmp_path):
repository_id = repository_response.json()["id"]
run_response = client.post(f"/repos/{repository_id}/analysis-runs", json={})
run_id = run_response.json()["analysis_run"]["id"]
add_candidate_capability(
database_path,
repository_id,
run_id,
"Review Target Capability",
)
graph_response = client.get(
f"/repos/{repository_id}/analysis-runs/{run_id}/candidate-graph"
)
@@ -2631,10 +2728,11 @@ def test_api_merges_candidate_capability_feature_and_evidence(tmp_path):
" click.echo('ok')\n",
encoding="utf-8",
)
database_path = str(tmp_path / "api-merge.sqlite3")
def override_settings():
return Settings(
database_path=str(tmp_path / "api-merge.sqlite3"),
database_path=database_path,
checkout_root=str(tmp_path / "api-merge-checkouts"),
)
@@ -2648,6 +2746,12 @@ def test_api_merges_candidate_capability_feature_and_evidence(tmp_path):
repository_id = repository_response.json()["id"]
run_response = client.post(f"/repos/{repository_id}/analysis-runs", json={})
run_id = run_response.json()["analysis_run"]["id"]
add_candidate_capability(
database_path,
repository_id,
run_id,
"Review Target Capability",
)
graph_response = client.get(
f"/repos/{repository_id}/analysis-runs/{run_id}/candidate-graph"
)