From 2313e8675e6e9198362032171dc554e6f8c5a852 Mon Sep 17 00:00:00 2001 From: tegwick Date: Tue, 28 Apr 2026 03:01:10 +0200 Subject: [PATCH] Repo stats and features as aggregates --- .../candidate_graph/generator.py | 88 ++++++++++++++++--- src/repo_registry/core/service.py | 21 +++-- src/repo_registry/repo_ingestion/git.py | 10 +++ src/repo_registry/web_api/app.py | 1 + src/repo_registry/web_api/schemas.py | 2 + src/repo_registry/web_ui/views.py | 45 +++++++++- tests/test_candidate_graph.py | 36 ++++++++ tests/test_registry_service.py | 48 +++++++++- tests/test_web_api.py | 29 +++++- ...P-0003-automatic-repository-exploration.md | 2 +- 10 files changed, 258 insertions(+), 24 deletions(-) diff --git a/src/repo_registry/candidate_graph/generator.py b/src/repo_registry/candidate_graph/generator.py index af4da2b..04399fd 100644 --- a/src/repo_registry/candidate_graph/generator.py +++ b/src/repo_registry/candidate_graph/generator.py @@ -125,16 +125,7 @@ class CandidateGraphGenerator: docs: list[ObservedFact], chunks: list[ContentChunk], ) -> CandidateCapabilityDraft: - features = [ - CandidateFeatureDraft( - name=self._feature_name(fact, chunks), - type=self._feature_type(fact), - location=fact.path, - confidence=0.65 if fact.value else 0.45, - source_refs=self._source_refs([fact]), - ) - for fact in interfaces - ] + features = self._interface_features(interfaces, chunks) return CandidateCapabilityDraft( name="Expose Repository Interface", description=self._interface_description(chunks), @@ -151,6 +142,83 @@ class CandidateGraphGenerator: evidence=self._evidence(tests, examples, docs), ) + def _interface_features( + self, + interfaces: list[ObservedFact], + chunks: list[ContentChunk], + ) -> list[CandidateFeatureDraft]: + by_type: dict[str, list[ObservedFact]] = {} + for fact in interfaces: + by_type.setdefault(self._feature_type(fact), []).append(fact) + + features: list[CandidateFeatureDraft] = [] + for feature_type, facts in sorted(by_type.items()): + if len(facts) == 1: + fact = facts[0] + features.append( + CandidateFeatureDraft( + name=self._feature_name(fact, chunks), + type=feature_type, + location=fact.path, + confidence=0.65 if fact.value else 0.45, + source_refs=self._source_refs([fact]), + ) + ) + continue + + features.append( + CandidateFeatureDraft( + name=self._grouped_interface_feature_name( + feature_type, + facts, + chunks, + ), + type=feature_type, + location=self._grouped_location(facts), + confidence=self._grouped_interface_confidence(facts), + source_refs=self._source_refs(facts), + ) + ) + return features + + def _grouped_interface_feature_name( + self, + feature_type: str, + facts: list[ObservedFact], + chunks: list[ContentChunk], + ) -> str: + summary = self._grouped_interface_summary(facts, chunks) + if feature_type == "API": + return f"HTTP API surface: {summary}" + if feature_type == "CLI": + return f"CLI command surface: {summary}" + return f"Callable interface surface: {summary}" + + def _grouped_interface_summary( + self, + facts: list[ObservedFact], + chunks: list[ContentChunk], + ) -> str: + names = [self._feature_name(fact, chunks) for fact in facts] + compact_names = [name for name in names if name] + if not compact_names: + return f"{len(facts)} entry points" + visible = compact_names[:3] + suffix = f", +{len(compact_names) - 3} more" if len(compact_names) > 3 else "" + return f"{', '.join(visible)}{suffix}" + + def _grouped_location(self, facts: list[ObservedFact]) -> str: + paths = sorted({fact.path for fact in facts if fact.path}) + if not paths: + return "" + if len(paths) == 1: + return paths[0] + return "multiple files" + + def _grouped_interface_confidence(self, facts: list[ObservedFact]) -> float: + valued = sum(1 for fact in facts if fact.value) + return 0.7 if valued == len(facts) else 0.55 + def _evidence( self, tests: list[ObservedFact], diff --git a/src/repo_registry/core/service.py b/src/repo_registry/core/service.py index 2b4c955..e86525e 100644 --- a/src/repo_registry/core/service.py +++ b/src/repo_registry/core/service.py @@ -120,6 +120,7 @@ class RegistryService: repository_id: int, *, source_path: str | None = None, + use_cached_checkout: bool = False, access_username: str | None = None, access_password: str | None = None, ) -> ScanSummary: @@ -134,12 +135,20 @@ class RegistryService: ) try: if source_path is None: - checkout = self.ingestion.resolve( - repository.url, - branch=repository.branch, - access_username=access_username, - access_password=access_password, - ) + if use_cached_checkout: + checkout = self.ingestion.cached_checkout(repository.url) + if checkout is None: + raise RuntimeError( + "cached checkout was requested, but no checkout exists " + "for this repository" + ) + else: + checkout = self.ingestion.resolve( + repository.url, + branch=repository.branch, + access_username=access_username, + access_password=access_password, + ) scan_source = checkout.source_path else: scan_source = source_path diff --git a/src/repo_registry/repo_ingestion/git.py b/src/repo_registry/repo_ingestion/git.py index d7b7fd5..374e3a7 100644 --- a/src/repo_registry/repo_ingestion/git.py +++ b/src/repo_registry/repo_ingestion/git.py @@ -57,6 +57,16 @@ class GitIngestionService: ) return Checkout(source_path=checkout_path.resolve(), was_cloned=True) + def cached_checkout(self, url_or_path: str) -> Checkout | None: + local_path = self._local_path(url_or_path) + if local_path is not None: + return Checkout(source_path=local_path.resolve(), was_cloned=False) + + checkout_path = self.checkout_root / self._checkout_key(url_or_path) + if not checkout_path.exists(): + return None + return Checkout(source_path=checkout_path.resolve(), was_cloned=True) + def _checkout_branch( self, checkout_path: Path, diff --git a/src/repo_registry/web_api/app.py b/src/repo_registry/web_api/app.py index 0908c79..448b14c 100644 --- a/src/repo_registry/web_api/app.py +++ b/src/repo_registry/web_api/app.py @@ -245,6 +245,7 @@ def create_analysis_run( summary = service.analyze_repository( repository_id, source_path=payload.source_path, + use_cached_checkout=payload.use_cached_checkout, access_username=payload.access_username, access_password=payload.access_password, ) diff --git a/src/repo_registry/web_api/schemas.py b/src/repo_registry/web_api/schemas.py index 03d65c1..4800ba3 100644 --- a/src/repo_registry/web_api/schemas.py +++ b/src/repo_registry/web_api/schemas.py @@ -202,6 +202,7 @@ class EvidenceUpdate(BaseModel): class AnalysisRunCreate(BaseModel): source_path: str | None = None + use_cached_checkout: bool = False access_username: str | None = None access_password: str | None = Field(default=None, repr=False) @@ -210,6 +211,7 @@ class AnalysisRunCreate(BaseModel): "examples": [ {}, {"source_path": "/path/to/local/repository"}, + {"use_cached_checkout": True}, { "access_username": "git-user", "access_password": "access-token", diff --git a/src/repo_registry/web_ui/views.py b/src/repo_registry/web_ui/views.py index 1e381e8..42fe8e8 100644 --- a/src/repo_registry/web_ui/views.py +++ b/src/repo_registry/web_ui/views.py @@ -501,6 +501,7 @@ def repository_detail(

Run Analysis

+
@@ -516,6 +517,7 @@ def repository_detail(

Approved Ability Map

+ {render_graph_counts(asdict(ability_map), facts_count=None)} {render_ability_map(asdict(ability_map), repository_id)}
@@ -821,6 +823,7 @@ def delete_evidence_from_form( def create_analysis_run_from_form( repository_id: int, source_path: str = Form(""), + use_cached_checkout: str | None = Form(None), access_username: str = Form(""), access_password: str = Form(""), service: RegistryService = Depends(get_service), @@ -828,6 +831,7 @@ def create_analysis_run_from_form( summary = service.analyze_repository( repository_id, source_path=source_path or None, + use_cached_checkout=bool(use_cached_checkout), access_username=access_username or None, access_password=access_password or None, ) @@ -869,6 +873,7 @@ def analysis_run_detail(

Candidate Graph

+ {render_graph_counts(asdict(candidate_graph), facts_count=len(facts))} @@ -876,7 +881,10 @@ def analysis_run_detail( {render_candidate_graph(asdict(candidate_graph), repository_id, analysis_run_id)}
-

Observed Facts

+
+

Observed Facts

+ {render_count_pills(facts=len(facts))} +
{fact_rows or ''} @@ -1565,6 +1573,41 @@ def split_capability_lines(value: str) -> list[str]: return [line.strip() for line in normalized.splitlines() if line.strip()] +def render_graph_counts(graph: dict, facts_count: int | None = None) -> str: + abilities = graph.get("abilities", []) + capabilities = [ + capability + for ability in abilities + for capability in ability.get("capabilities", []) + ] + features = [ + feature + for capability in capabilities + for feature in capability.get("features", []) + ] + counts: dict[str, int] = { + "abilities": len(abilities), + "capabilities": len(capabilities), + "features": len(features), + } + if facts_count is not None: + counts["facts"] = facts_count + return render_count_pills(**counts) + + +def render_count_pills(**counts: int) -> str: + labels = { + "abilities": "abilities", + "capabilities": "capabilities", + "features": "features", + "facts": "facts", + } + return "".join( + f'{count} {labels[name]}' + for name, count in counts.items() + ) + + def render_candidate_graph(graph: dict, repository_id: int, analysis_run_id: int) -> str: abilities = graph.get("abilities", []) if not abilities: diff --git a/tests/test_candidate_graph.py b/tests/test_candidate_graph.py index a61a48f..e1ee493 100644 --- a/tests/test_candidate_graph.py +++ b/tests/test_candidate_graph.py @@ -208,3 +208,39 @@ def test_candidate_generator_uses_generic_io_for_unknown_interfaces(): capability = graph[0].capabilities[0] assert capability.inputs == ["caller input"] assert capability.outputs == ["callable interface result"] + + +def test_candidate_generator_groups_many_interface_facts_into_behavioral_features(): + repository = Repository( + id=1, + name="Registry", + url="/tmp/registry", + description=None, + branch="main", + status="analyzed", + ) + facts = [ + fact(1, "documentation", "README", "README.md"), + fact(2, "interface", "python route decorator", "src/api.py", '@app.get("/repos")'), + fact(3, "interface", "python route decorator", "src/api.py", '@app.post("/repos")'), + fact( + 4, + "interface", + "python route decorator", + "src/api.py", + '@app.post("/repos/{repository_id}/analysis-runs")', + ), + fact(5, "test", "test_api.py", "tests/test_api.py"), + ] + + graph = CandidateGraphGenerator().generate(repository, facts) + + capability = graph[0].capabilities[0] + assert len(capability.features) == 1 + feature = capability.features[0] + assert feature.name == ( + "HTTP API surface: GET /repos, POST /repos, POST /repos/{repository_id}/analysis-runs" + ) + assert feature.type == "API" + assert feature.location == "src/api.py" + assert len(feature.source_refs) == 3 diff --git a/tests/test_registry_service.py b/tests/test_registry_service.py index b400faa..e37e658 100644 --- a/tests/test_registry_service.py +++ b/tests/test_registry_service.py @@ -267,7 +267,6 @@ def test_search_filters_by_status_language_and_framework(tmp_path): " return {}\n", encoding="utf-8", ) - service = make_service(tmp_path) repository = service.register_repository(name="Filterable", url=str(source)) summary = service.analyze_repository(repository.id) @@ -327,7 +326,7 @@ def test_fixture_breadth_python_cli_repo_extracts_reviewable_cli_claims(tmp_path assert summary.analysis_run.status == "completed" assert capability.name == "Expose Repository Interface" assert capability.features[0].type == "CLI" - assert capability.features[0].name == "CLI command main" + assert capability.features[0].name.startswith("CLI command surface:") assert capability.evidence[0].reference == "tests/test_cli.py" assert service.ability_map(repository.id).abilities == [] @@ -523,7 +522,6 @@ def test_analyze_repository_records_snapshot_and_observed_facts(tmp_path): " return {}\n", encoding="utf-8", ) - service = make_service(tmp_path) repository = service.register_repository( name="Example", @@ -660,6 +658,13 @@ def test_approve_candidate_graph_publishes_ability_map_once(tmp_path): " return {}\n", encoding="utf-8", ) + (source / "cli.py").write_text( + "import click\n\n" + "@click.command()\n" + "def health():\n" + " click.echo('ok')\n", + encoding="utf-8", + ) service = make_service(tmp_path) repository = service.register_repository(name="Example", url=str(source)) @@ -1150,6 +1155,13 @@ def test_merge_candidate_feature_and_evidence_omits_duplicate_leaves(tmp_path): " return {}\n", encoding="utf-8", ) + (source / "cli.py").write_text( + "import click\n\n" + "@click.command()\n" + "def health():\n" + " click.echo('ok')\n", + encoding="utf-8", + ) service = make_service(tmp_path) repository = service.register_repository(name="Merge Leaves", url=str(source)) @@ -1224,6 +1236,36 @@ def test_analyze_repository_clones_git_url_before_scanning(tmp_path): assert ("framework", "pytest", "requirements.txt") in fact_names +def test_analyze_repository_can_use_cached_checkout_without_fetching(tmp_path, monkeypatch): + service = make_service(tmp_path) + url = "https://example.com/private/repo.git" + cached = tmp_path / "checkouts" / "repo-b5d250ec3c59" + cached.mkdir(parents=True) + (cached / "README.md").write_text("# Cached Repo\n", encoding="utf-8") + + def fail_run_git(*args, **kwargs): + raise AssertionError("cached analysis should not run git") + + monkeypatch.setattr(service.ingestion, "_run_git", fail_run_git) + repository = service.register_repository( + name="Cached", + url=url, + description="Already cloned.", + ) + + summary = service.analyze_repository( + repository.id, + use_cached_checkout=True, + ) + + assert summary.analysis_run.status == "completed" + assert summary.snapshot is not None + assert str(cached) == summary.snapshot.source_path + assert ("documentation", "README", "README.md") in { + (fact.kind, fact.name, fact.path) for fact in summary.facts + } + + def test_operational_logging_records_analysis_and_review_events(caplog, tmp_path): source = tmp_path / "repo" source.mkdir() diff --git a/tests/test_web_api.py b/tests/test_web_api.py index a2f276e..21a6d78 100644 --- a/tests/test_web_api.py +++ b/tests/test_web_api.py @@ -1031,7 +1031,10 @@ def test_api_source_linked_candidate_and_repo_update_loop(tmp_path): for capability in ability["capabilities"] for feature in capability["features"] } - assert {"GET /status", "GET /ready"} <= second_features + assert any( + "GET /status" in feature_name and "GET /ready" in feature_name + for feature_name in second_features + ) approved_after_reanalysis = client.get(f"/repos/{repository_id}/ability-map") assert approved_after_reanalysis.status_code == 200 @@ -1060,6 +1063,13 @@ def test_ui_register_analyze_and_approve_loop(tmp_path): " return {}\n", encoding="utf-8", ) + (source / "cli.py").write_text( + "import click\n\n" + "@click.command()\n" + "def status():\n" + " click.echo('ok')\n", + encoding="utf-8", + ) def override_settings(): return Settings( @@ -1095,6 +1105,7 @@ def test_ui_register_analyze_and_approve_loop(tmp_path): assert detail_response.status_code == 200 assert "Run Analysis" in detail_response.text assert "Running analysis..." in detail_response.text + assert "Analyze cached checkout without fetching upstream" in detail_response.text assert "Repository Metadata" in detail_response.text edit_repository_response = client.post( @@ -1127,6 +1138,10 @@ def test_ui_register_analyze_and_approve_loop(tmp_path): run_detail = client.get(run_path) assert run_detail.status_code == 200 assert "Candidate Graph" in run_detail.text + assert "1 abilities" in run_detail.text + assert "2 capabilities" in run_detail.text + assert "2 features" in run_detail.text + assert "7 facts" in run_detail.text assert "Content Chunks" in run_detail.text assert "README.md:1-1" in run_detail.text assert "ID " in run_detail.text @@ -1141,6 +1156,9 @@ def test_ui_register_analyze_and_approve_loop(tmp_path): approved_detail = client.get(approve_response.headers["location"]) assert approved_detail.status_code == 200 assert "Approved Ability Map" in approved_detail.text + assert "1 abilities" in approved_detail.text + assert "2 capabilities" in approved_detail.text + assert "2 features" in approved_detail.text assert "Review UI Repo Edited Repository Usefulness" in approved_detail.text assert "Language: Python" in approved_detail.text assert "Framework: FastAPI" in approved_detail.text @@ -1635,7 +1653,6 @@ def test_api_rejects_candidate_capability_feature_and_evidence(tmp_path): " return {}\n", encoding="utf-8", ) - def override_settings(): return Settings( database_path=str(tmp_path / "api-reject.sqlite3"), @@ -1723,7 +1740,6 @@ def test_api_relinks_candidate_feature_and_evidence(tmp_path): " return {}\n", encoding="utf-8", ) - def override_settings(): return Settings( database_path=str(tmp_path / "api-relink.sqlite3"), @@ -1798,6 +1814,13 @@ def test_api_merges_candidate_capability_feature_and_evidence(tmp_path): " return {}\n", encoding="utf-8", ) + (source / "cli.py").write_text( + "import click\n\n" + "@click.command()\n" + "def status_cli():\n" + " click.echo('ok')\n", + encoding="utf-8", + ) def override_settings(): return Settings( diff --git a/workplans/RREG-WP-0003-automatic-repository-exploration.md b/workplans/RREG-WP-0003-automatic-repository-exploration.md index b35bd06..4ded3cf 100644 --- a/workplans/RREG-WP-0003-automatic-repository-exploration.md +++ b/workplans/RREG-WP-0003-automatic-repository-exploration.md @@ -45,7 +45,7 @@ analysis run in one flow and land on the reviewable candidate graph. ```task id: RREG-WP-0003-T02 -status: todo +status: done priority: high state_hub_task_id: "d0d98e1b-8d21-4bdf-af58-edbb34e8a929" ```
KindNamePathValue
No observed facts.