From 6c4b0e6dcb18ab09bcfde8916d27028ef50999d2 Mon Sep 17 00:00:00 2001 From: tegwick Date: Sun, 3 May 2026 00:14:59 +0200 Subject: [PATCH] baseline repo characteristics no longer crowd the candidate graph --- .../candidate_graph/generator.py | 99 +++++++++------ src/repo_registry/repo_scanning/scanner.py | 16 ++- src/repo_registry/web_ui/views.py | 13 ++ tests/fixtures.py | 59 +++++++++ tests/test_candidate_graph.py | 9 +- tests/test_registry_service.py | 90 +++++++++++--- tests/test_web_api.py | 116 +++++++++++++++++- 7 files changed, 338 insertions(+), 64 deletions(-) diff --git a/src/repo_registry/candidate_graph/generator.py b/src/repo_registry/candidate_graph/generator.py index e824b55..72240b1 100644 --- a/src/repo_registry/candidate_graph/generator.py +++ b/src/repo_registry/candidate_graph/generator.py @@ -100,10 +100,6 @@ class CandidateGraphGenerator: ) capabilities: list[CandidateCapabilityDraft] = [] - if interfaces: - capabilities.append( - self._interface_capability(interfaces, tests, examples, docs, chunks) - ) capabilities.extend( self._intent_capabilities(intent_facts, chunks, tests, examples, docs) ) @@ -127,31 +123,9 @@ class CandidateGraphGenerator: docs, ) ) - if manifests or frameworks or languages: + if interfaces and not capabilities: capabilities.append( - CandidateCapabilityDraft( - name="Describe Repository Structure", - description=( - "Summarize detected languages, package manifests, and framework " - "hints as structural context for review." - ), - inputs=[], - outputs=["repository structure summary"], - confidence=self._structure_confidence( - manifests=manifests, - frameworks=frameworks, - languages=languages, - docs=docs, - ), - source_refs=self._source_refs(manifests + frameworks + languages), - primary_class="repository-structure", - attributes=self._structure_attributes( - manifests, - frameworks, - languages, - ), - evidence=self._evidence(tests, examples, docs), - ) + self._interface_capability(interfaces, tests, examples, docs, chunks) ) return [ @@ -356,7 +330,10 @@ class CandidateGraphGenerator: continue if line.startswith("#"): heading = line.lstrip("#").strip().lower() - in_capability_section = "capabilit" in heading + in_capability_section = ( + "capabilit" in heading + or heading in {"primary utility", "core utility"} + ) continue if not in_capability_section: continue @@ -367,11 +344,23 @@ class CandidateGraphGenerator: return items def _intent_capability_name(self, text: str) -> str: + lowered = re.sub(r"[*_`]", "", text.lower()) + if "continuous connectivity" in lowered and "remote systems" in lowered: + return "Maintain Continuous Connectivity Between Remote Systems And Central Hub" + if "observable" in lowered and "auditable" in lowered and "controllable" in lowered: + return "Make Connectivity Observable Auditable And Controllable" + if "cli tool" in lowered and "mcp" in lowered: + return "Expose CLI And MCP Accessible Service" candidate = re.split(r"\s+-\s+|\s*:\s*|[.!?]\s+", text.strip(), maxsplit=1)[0] candidate = candidate.strip(" .:-") if not candidate: return "" - return self._title_from_words(candidate.split()[:8]) + words = candidate.split() + if words: + words[0] = self._imperative_verb(words[0]) + while words and words[-1].lower().strip(",;:") in {"a", "an", "the", "and", "or", "as", "both"}: + words.pop() + return self._title_from_words(words[:10]) def _interface_features( self, @@ -508,16 +497,36 @@ class CandidateGraphGenerator: [ repository.name, repository.description or "", - " ".join(chunk.text[:600] for chunk in chunks if chunk.kind == "documentation"), - " ".join(f"{fact.kind} {fact.name} {fact.value}" for fact in facts), + " ".join( + chunk.text[:600] + for chunk in chunks + if chunk.kind in {"intent", "documentation"} + and chunk.metadata.get("source_role") != "agent_guidance" + ), + " ".join( + f"{fact.kind} {fact.name} {fact.value}" + for fact in facts + if not ( + fact.kind == "llm_provider" + and self._utility_relationship(fact) not in {"owned", "facade", "adapter"} + ) + ), ] ).lower() attributes: list[str] = [] - if any(token in text for token in ("repository", "repo", "registry")): - attributes.append("repository") + if any(token in text for token in ("ssh", "tunnel", "reverse tunnel", "remote access", "connectivity")): + attributes.extend(["remote-access", "connectivity"]) + if any(token in text for token in ("audit", "health check", "lifecycle", "ops", "operator")): + attributes.append("operations") + return "it-operations", self._unique(attributes) if any(token in text for token in ("ability", "capability", "feature")): return "repository-intelligence", self._unique(attributes + ["capability-mapping"]) - if any(token in text for token in ("llm", "openrouter", "claude", "model provider")): + promotable_llm = any( + fact.kind == "llm_provider" + and self._utility_relationship(fact) in {"owned", "facade", "adapter"} + for fact in facts + ) + if promotable_llm: return "ai-integration", self._unique(attributes + ["llm-provider"]) if any(fact.kind == "interface" for fact in facts): attributes.append("interface") @@ -777,6 +786,9 @@ class CandidateGraphGenerator: repository: Repository, chunks: list[ContentChunk], ) -> str: + ops_name = self._operations_ability_name(chunks) + if ops_name: + return ops_name purpose_text = self._document_purpose_sentence(chunks) or repository.description if purpose_text: normalized = self._imperative_purpose(purpose_text) @@ -794,9 +806,24 @@ class CandidateGraphGenerator: return paragraph return "" + def _operations_ability_name(self, chunks: list[ContentChunk]) -> str: + text = " ".join( + chunk.text + for chunk in self._documentation_chunks(chunks) + if chunk.kind == "intent" + ).lower() + if "ssh reverse tunnel" in text or "ssh reverse tunneling" in text: + return "Manage SSH Reverse Tunnel Connectivity" + return "" + def _imperative_purpose(self, text: str) -> str: cleaned = re.sub(r"\s+", " ", text.strip()) cleaned = re.split(r"[.!?]\s+", cleaned, maxsplit=1)[0] + cleaned = re.sub( + r"(?i)^this\s+repository\s+exists\s+to\s+provide\s+(?:an?\s+)?", + "Provide ", + cleaned, + ) cleaned = re.sub(r"^[A-Z][A-Za-z0-9_-]*\s+(?:is|provides|offers)\s+", "", cleaned) cleaned = cleaned.strip(" .:-") if not cleaned: @@ -816,6 +843,8 @@ class CandidateGraphGenerator: } if lower in irregular: return irregular[lower] + if lower in {"this"}: + return lower if lower.endswith("ies") and len(lower) > 4: return f"{lower[:-3]}y" if lower.endswith(("des", "ses", "tes", "ves", "zes")) and len(lower) > 4: diff --git a/src/repo_registry/repo_scanning/scanner.py b/src/repo_registry/repo_scanning/scanner.py index 90abbe0..ed8e7c6 100644 --- a/src/repo_registry/repo_scanning/scanner.py +++ b/src/repo_registry/repo_scanning/scanner.py @@ -474,7 +474,11 @@ class DeterministicScanner: return "ci_tooling" if lower.startswith(("tests/", "test/")) or name.startswith("test_"): return "test_evidence" - if name.startswith("readme") or lower.startswith(("docs/", "doc/", "wiki/")): + if ( + name.startswith("readme") + or name.endswith(".md") + or lower.startswith(("docs/", "doc/", "wiki/", "workplans/", "architecture/")) + ): return "product_documentation" if name in MANIFEST_FRAMEWORK_HINTS or name.endswith((".lock", ".mod")): return "dependency_declaration" @@ -483,13 +487,21 @@ class DeterministicScanner: return "implementation_source" def _has_provider_signal(self, lower_text: str, needle: str) -> bool: - pattern = re.compile(rf"(? Path: encoding="utf-8", ) return repo + + +def write_ops_bridge_like_repo(root: Path) -> Path: + repo = root / "ops-bridge-like" + repo.mkdir() + (repo / "INTENT.md").write_text( + "# INTENT\n\n" + "## Purpose\n\n" + "This repository exists to provide a **reliable, inspectable, and controllable " + "connectivity layer** between distributed dev, build, test and execution " + "environments for dev and ops personal human and agentic.\n\n" + "## Primary Utility\n\n" + "The repository provides a **managed SSH reverse tunneling system** that:\n\n" + "* Maintains continuous connectivity between remote systems and a central hub\n" + "* Makes connectivity **observable, auditable, and controllable**\n" + "* Exposes this capability as both a **CLI tool and an MCP-accessible service**\n\n" + "## Intended Users\n\n" + "* Human operators managing infrastructure and connectivity\n" + "* LLM-based agents requiring stable access to local services\n", + encoding="utf-8", + ) + (repo / "SCOPE.md").write_text( + "# SCOPE\n\nSSH reverse tunnel lifecycle manager for remote execution environments.\n", + encoding="utf-8", + ) + (repo / "README.txt").write_text( + "# Ops Bridge\n\n" + "Manages named SSH reverse tunnels with auto-reconnect, health checks, " + "audit logging, and an MCP server so Claude Code can start and inspect tunnels.\n", + encoding="utf-8", + ) + (repo / "pyproject.toml").write_text( + "[project]\ndependencies = ['typer', 'pytest']\n", + encoding="utf-8", + ) + (repo / "scripts").mkdir() + (repo / "scripts" / "register_mcp.py").write_text( + '"""Register the ops-bridge MCP server with Claude MCP."""\n' + "from pathlib import Path\n" + "CLAUDE_JSON = Path.home() / '.claude.json'\n" + "def main():\n" + " return CLAUDE_JSON.exists()\n", + encoding="utf-8", + ) + (repo / "bridge").mkdir() + (repo / "bridge" / "cli.py").write_text( + "import typer\n" + "app = typer.Typer()\n" + "@app.command()\n" + "def up(name: str):\n" + " return name\n", + encoding="utf-8", + ) + (repo / "workplans").mkdir() + (repo / "workplans" / "BRIDGE-WP-0003.md").write_text( + "# MCP skill work\n\nClaude Code sessions can call bridge_status().\n", + encoding="utf-8", + ) + return repo diff --git a/tests/test_candidate_graph.py b/tests/test_candidate_graph.py index e78811a..ac9c1a0 100644 --- a/tests/test_candidate_graph.py +++ b/tests/test_candidate_graph.py @@ -68,12 +68,7 @@ def test_candidate_generator_builds_purpose_seed_from_observed_facts(): assert interface_capability.features[0].name == "POST /classify" assert interface_capability.features[0].location == "app.py" assert interface_capability.evidence[0].strength == "strong" - structure_capability = ability.capabilities[1] - assert structure_capability.name == "Describe Repository Structure" - assert { - "utility-dependency", - "review-required-structural-context", - } <= set(structure_capability.attributes) + assert len(ability.capabilities) == 1 def test_candidate_generator_extracts_intended_capability_blocks_from_intent_chunks(): @@ -265,7 +260,7 @@ def test_candidate_confidence_scoring_increases_with_supporting_facts(): assert graph[0].confidence == 1.0 assert graph[0].capabilities[0].confidence == 0.85 - assert graph[0].capabilities[1].confidence == 0.75 + assert len(graph[0].capabilities) == 1 def test_candidate_generator_names_cli_features_from_nearby_function(): diff --git a/tests/test_registry_service.py b/tests/test_registry_service.py index 82567bd..2359539 100644 --- a/tests/test_registry_service.py +++ b/tests/test_registry_service.py @@ -15,6 +15,7 @@ from tests.fixtures import ( write_javascript_typescript_package_repo, write_key_cape_like_repo, write_llm_connect_like_repo, + write_ops_bridge_like_repo, write_misleading_docs_repo, write_python_cli_repo, write_readme_only_repo, @@ -27,6 +28,32 @@ def make_service(tmp_path): return RegistryService(store, ingestion=GitIngestionService(tmp_path / "checkouts")) +def add_candidate_capability(service, repository_id, analysis_run_id, ability_id, name): + with service.store.connect() as connection: + cursor = connection.execute( + """ + INSERT INTO candidate_capabilities + (repository_id, analysis_run_id, ability_id, name, description, + inputs, outputs, primary_class, attributes, confidence, source_refs) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + """, + ( + repository_id, + analysis_run_id, + ability_id, + name, + "Review target capability inserted for review workflow tests.", + "[]", + "[]", + "test-capability", + json.dumps(["test-review-target"]), + 0.5, + "[]", + ), + ) + return int(cursor.lastrowid) + + class FakeLLMExtractor: def __init__(self, abilities): self.abilities = abilities @@ -298,12 +325,12 @@ def test_search_filters_by_status_language_and_framework(tmp_path): service.approve_candidate_graph(repository.id, summary.analysis_run.id) results = service.search( - "repository", + "health", status="indexed", language="Python", framework="FastAPI", ability="Support Filterable", - capability="Repository Structure", + capability="Repository Interface", ) wrong_language_results = service.search( "repository", @@ -380,7 +407,6 @@ def test_fixture_breadth_javascript_typescript_package_extracts_structure_and_ap assert ("framework", "React", "package.json") in fact_names assert ("framework", "Vitest", "package.json") in fact_names assert "Expose Repository Interface" in capability_names - assert "Describe Repository Structure" in capability_names assert "API" in feature_types assert service.ability_map(repository.id).abilities == [] @@ -468,10 +494,36 @@ def test_regression_dependency_only_repo_keeps_libraries_as_context(tmp_path): for capability in ability.capabilities } assert "Route LLM Requests Across Providers" not in capability_names - assert capability_names == {"Describe Repository Structure"} - structure = graph.abilities[0].capabilities[0] - assert "utility-dependency" in structure.attributes - assert "review-required-structural-context" in structure.attributes + assert capability_names == set() + assert any(fact.kind == "manifest" for fact in summary.facts) + + +def test_regression_ops_bridge_like_repo_is_it_operations_not_llm_provider(tmp_path): + source = write_ops_bridge_like_repo(tmp_path) + service = make_service(tmp_path) + repository = service.register_repository(name="Ops Bridge Marketing Name", url=str(source)) + + summary = service.analyze_repository(repository.id, use_llm_assistance=False) + graph = service.candidate_graph(repository.id, summary.analysis_run.id) + + ability = graph.abilities[0] + capability_names = { + capability.name + for candidate_ability in graph.abilities + for capability in candidate_ability.capabilities + } + facts = {(fact.kind, fact.name, fact.path) for fact in summary.facts} + assert ability.name == "Manage SSH Reverse Tunnel Connectivity" + assert ability.primary_class == "it-operations" + assert {"remote-access", "connectivity", "operations"} <= set(ability.attributes) + assert "repository" not in ability.attributes + assert "llm-provider" not in ability.attributes + assert "Route LLM Requests Across Providers" not in capability_names + assert "Maintain Continuous Connectivity Between Remote Systems And Central Hub" in capability_names + assert "Make Connectivity Observable Auditable And Controllable" in capability_names + assert "Expose CLI And MCP Accessible Service" in capability_names + assert ("llm_provider", "Claude", "scripts/register_mcp.py") not in facts + assert ("llm_provider", "Claude", "workplans/BRIDGE-WP-0003.md") not in facts def test_fixture_breadth_empty_repo_produces_no_candidate_claims(tmp_path): @@ -933,20 +985,14 @@ def test_analyze_repository_can_trusted_auto_approve_candidates(tmp_path): for capability in graph.abilities[0].capabilities } assert statuses_by_capability["Expose Repository Interface"] == "approved" - assert statuses_by_capability["Describe Repository Structure"] == "candidate" assert ability_map.abilities[0].name == "Report Health Over HTTP" assert decisions[0].action == "trusted_auto_approve_candidate_graph" assert "deterministic candidate generation" in decisions[0].notes - assert "Auto-approved 1 safe candidate capability(s); left 1 for review." in decisions[0].notes + assert "Auto-approved 1 safe candidate capability(s); left 0 for review." in decisions[0].notes assert ( "Approved: Expose Repository Interface: owned interface with sufficient confidence." in decisions[0].notes ) - assert ( - "Skipped: Describe Repository Structure: structural/dependency context " - "requires curator review." - in decisions[0].notes - ) def test_rebuild_characteristics_dry_run_preserves_approved_map(tmp_path): @@ -1523,6 +1569,14 @@ def test_relink_candidate_feature_and_evidence_to_another_capability(tmp_path): repository = service.register_repository(name="Relink Leaves", url=str(source)) summary = service.analyze_repository(repository.id) graph = service.candidate_graph(repository.id, summary.analysis_run.id) + add_candidate_capability( + service, + repository.id, + summary.analysis_run.id, + graph.abilities[0].id, + "Review Target Capability", + ) + graph = service.candidate_graph(repository.id, summary.analysis_run.id) source_capability = graph.abilities[0].capabilities[0] target_capability = graph.abilities[0].capabilities[1] feature = source_capability.features[0] @@ -1625,6 +1679,14 @@ def test_merge_candidate_capability_moves_children_to_target(tmp_path): repository = service.register_repository(name="Merge Capability", url=str(source)) summary = service.analyze_repository(repository.id) graph = service.candidate_graph(repository.id, summary.analysis_run.id) + add_candidate_capability( + service, + repository.id, + summary.analysis_run.id, + graph.abilities[0].id, + "Review Target Capability", + ) + graph = service.candidate_graph(repository.id, summary.analysis_run.id) source_capability = graph.abilities[0].capabilities[0] target_capability = graph.abilities[0].capabilities[1] diff --git a/tests/test_web_api.py b/tests/test_web_api.py index dafc8fd..a37ab67 100644 --- a/tests/test_web_api.py +++ b/tests/test_web_api.py @@ -1,9 +1,47 @@ +import json +import sqlite3 + from fastapi.testclient import TestClient from repo_registry.web_api import app as app_module from repo_registry.web_api.app import Settings, app, get_service, get_settings +def add_candidate_capability(database_path, repository_id, analysis_run_id, name): + with sqlite3.connect(database_path) as connection: + ability_id = connection.execute( + """ + SELECT id FROM candidate_abilities + WHERE repository_id = ? AND analysis_run_id = ? + ORDER BY id + LIMIT 1 + """, + (repository_id, analysis_run_id), + ).fetchone()[0] + cursor = connection.execute( + """ + INSERT INTO candidate_capabilities + (repository_id, analysis_run_id, ability_id, name, description, + inputs, outputs, primary_class, attributes, confidence, source_refs) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + """, + ( + repository_id, + analysis_run_id, + ability_id, + name, + "Review target capability inserted for API review workflow tests.", + "[]", + "[]", + "test-capability", + json.dumps(["test-review-target"]), + 0.5, + "[]", + ), + ) + return int(cursor.lastrowid) + + def test_openapi_groups_agent_facing_endpoints(): client = TestClient(app) @@ -875,10 +913,11 @@ def test_api_analysis_run_loop(tmp_path): '{"dependencies":{"react":"latest","vite":"latest"}}', encoding="utf-8", ) + database_path = str(tmp_path / "api-analysis.sqlite3") def override_settings(): return Settings( - database_path=str(tmp_path / "api-analysis.sqlite3"), + database_path=database_path, checkout_root=str(tmp_path / "api-checkouts"), ) @@ -903,6 +942,12 @@ def test_api_analysis_run_loop(tmp_path): assert get_run_response.status_code == 200 assert get_run_response.json()["id"] == run["analysis_run"]["id"] + add_candidate_capability( + database_path, + repository_id, + run["analysis_run"]["id"], + "Describe Frontend Stack", + ) candidate_response = client.get( f"/repos/{repository_id}/analysis-runs/" f"{run['analysis_run']['id']}/candidate-graph" @@ -954,6 +999,12 @@ def test_api_analysis_run_loop(tmp_path): run_response = client.post(f"/repos/{repository_id}/analysis-runs", json={}) assert run_response.status_code == 201 run = run_response.json() + add_candidate_capability( + database_path, + repository_id, + run["analysis_run"]["id"], + "Describe Frontend Stack", + ) candidate_response = client.get( f"/repos/{repository_id}/analysis-runs/" f"{run['analysis_run']['id']}/candidate-graph" @@ -1358,7 +1409,7 @@ def test_ui_register_analyze_and_approve_loop(tmp_path): assert "Analysis completed with reviewable results." in run_detail.text assert "Candidate Graph" in run_detail.text assert "1 abilities" in run_detail.text - assert "2 capabilities" in run_detail.text + assert "1 capabilities" in run_detail.text assert "2 features" in run_detail.text assert "8 facts" in run_detail.text assert "Content Chunks" in run_detail.text @@ -1426,11 +1477,11 @@ def test_ui_register_analyze_and_approve_loop(tmp_path): assert "1 scope" in approved_detail.text assert "supports" in approved_detail.text assert "1 abilities" in approved_detail.text - assert "2 capabilities" in approved_detail.text + assert "1 capabilities" in approved_detail.text assert "2 features" in approved_detail.text assert "Latest Candidate Graph" in approved_detail.text assert "1 candidate abilities" in approved_detail.text - assert "2 candidate capabilities" in approved_detail.text + assert "1 candidate capabilities" in approved_detail.text assert "2 candidate features" in approved_detail.text assert "8 candidate facts" in approved_detail.text assert "Use Approved Registry" in approved_detail.text @@ -1801,6 +1852,44 @@ def test_ui_analysis_run_diagnostics_explain_failures_and_empty_results(tmp_path app.dependency_overrides.clear() +def test_ui_analysis_run_diagnostics_warn_when_only_baseline_context_exists(tmp_path): + source = tmp_path / "dependency-only-ui" + source.mkdir() + (source / "README.md").write_text("# Dependency Only\nUses libraries.\n", encoding="utf-8") + (source / "requirements.txt").write_text("fastapi\npytest\n", encoding="utf-8") + + def override_settings(): + return Settings( + database_path=str(tmp_path / "ui-baseline-diagnostics.sqlite3"), + checkout_root=str(tmp_path / "ui-baseline-diagnostics-checkouts"), + ) + + app.dependency_overrides[get_settings] = override_settings + client = TestClient(app) + try: + repository = client.post( + "/repos", + json={ + "url": str(source), + "name": "Dependency Only UI", + "description": "Used for baseline diagnostics.", + }, + ).json() + run = client.post( + f"/ui/repos/{repository['id']}/analysis-runs", + data={"source_path": "", "use_llm_assistance": ""}, + follow_redirects=False, + ) + + detail = client.get(run.headers["location"]) + + assert detail.status_code == 200 + assert "No domain capabilities were produced." in detail.text + assert "only baseline context or weak documentation was available" in detail.text + finally: + app.dependency_overrides.clear() + + def test_ui_register_and_explore_lands_on_analysis_result(tmp_path): source = tmp_path / "explore-repo" source.mkdir() @@ -2550,9 +2639,11 @@ def test_api_relinks_candidate_feature_and_evidence(tmp_path): " return {}\n", encoding="utf-8", ) + database_path = str(tmp_path / "api-relink.sqlite3") + def override_settings(): return Settings( - database_path=str(tmp_path / "api-relink.sqlite3"), + database_path=database_path, checkout_root=str(tmp_path / "api-relink-checkouts"), ) @@ -2566,6 +2657,12 @@ def test_api_relinks_candidate_feature_and_evidence(tmp_path): repository_id = repository_response.json()["id"] run_response = client.post(f"/repos/{repository_id}/analysis-runs", json={}) run_id = run_response.json()["analysis_run"]["id"] + add_candidate_capability( + database_path, + repository_id, + run_id, + "Review Target Capability", + ) graph_response = client.get( f"/repos/{repository_id}/analysis-runs/{run_id}/candidate-graph" ) @@ -2631,10 +2728,11 @@ def test_api_merges_candidate_capability_feature_and_evidence(tmp_path): " click.echo('ok')\n", encoding="utf-8", ) + database_path = str(tmp_path / "api-merge.sqlite3") def override_settings(): return Settings( - database_path=str(tmp_path / "api-merge.sqlite3"), + database_path=database_path, checkout_root=str(tmp_path / "api-merge-checkouts"), ) @@ -2648,6 +2746,12 @@ def test_api_merges_candidate_capability_feature_and_evidence(tmp_path): repository_id = repository_response.json()["id"] run_response = client.post(f"/repos/{repository_id}/analysis-runs", json={}) run_id = run_response.json()["analysis_run"]["id"] + add_candidate_capability( + database_path, + repository_id, + run_id, + "Review Target Capability", + ) graph_response = client.get( f"/repos/{repository_id}/analysis-runs/{run_id}/candidate-graph" )