Repo stats and features as aggregates

2026-04-28 03:01:10 +02:00
parent c0a044fa0b
commit 2313e8675e
10 changed files with 258 additions and 24 deletions
--- a/src/repo_registry/candidate_graph/generator.py
+++ b/src/repo_registry/candidate_graph/generator.py
@@ -125,16 +125,7 @@ class CandidateGraphGenerator:
        docs: list[ObservedFact],
        chunks: list[ContentChunk],
    ) -> CandidateCapabilityDraft:
-        features = [
-            CandidateFeatureDraft(
-                name=self._feature_name(fact, chunks),
-                type=self._feature_type(fact),
-                location=fact.path,
-                confidence=0.65 if fact.value else 0.45,
-                source_refs=self._source_refs([fact]),
-            )
-            for fact in interfaces
-        ]
+        features = self._interface_features(interfaces, chunks)
        return CandidateCapabilityDraft(
            name="Expose Repository Interface",
            description=self._interface_description(chunks),
@@ -151,6 +142,83 @@ class CandidateGraphGenerator:
            evidence=self._evidence(tests, examples, docs),
        )

+    def _interface_features(
+        self,
+        interfaces: list[ObservedFact],
+        chunks: list[ContentChunk],
+    ) -> list[CandidateFeatureDraft]:
+        """Build one feature draft per interface feature type.
+
+        Facts are bucketed by ``self._feature_type``. A type backed by a single
+        fact yields a draft describing that fact alone; a type backed by several
+        facts yields one aggregated draft that references all of them. Drafts
+        are returned ordered by feature type.
+        """
+        grouped: dict[str, list[ObservedFact]] = {}
+        for interface in interfaces:
+            kind = self._feature_type(interface)
+            if kind not in grouped:
+                grouped[kind] = []
+            grouped[kind].append(interface)
+
+        drafts: list[CandidateFeatureDraft] = []
+        for kind in sorted(grouped):
+            members = grouped[kind]
+            if len(members) != 1:
+                # Several facts of the same type collapse into one aggregate.
+                draft = CandidateFeatureDraft(
+                    name=self._grouped_interface_feature_name(kind, members, chunks),
+                    type=kind,
+                    location=self._grouped_location(members),
+                    confidence=self._grouped_interface_confidence(members),
+                    source_refs=self._source_refs(members),
+                )
+            else:
+                only = members[0]
+                draft = CandidateFeatureDraft(
+                    name=self._feature_name(only, chunks),
+                    type=kind,
+                    location=only.path,
+                    confidence=0.65 if only.value else 0.45,
+                    source_refs=self._source_refs([only]),
+                )
+            drafts.append(draft)
+        return drafts
+
+    def _grouped_interface_feature_name(
+        self,
+        feature_type: str,
+        facts: list[ObservedFact],
+        chunks: list[ContentChunk],
+    ) -> str:
+        """Return the display name of a feature that aggregates several facts.
+
+        The name is a surface label chosen by ``feature_type`` ("API", "CLI",
+        or anything else) followed by the summary of the grouped facts.
+        """
+        summary = self._grouped_interface_summary(facts, chunks)
+        prefixes = {
+            "API": "HTTP API surface",
+            "CLI": "CLI command surface",
+        }
+        prefix = prefixes.get(feature_type, "Callable interface surface")
+        return f"{prefix}: {summary}"
+
+    def _grouped_interface_summary(
+        self,
+        facts: list[ObservedFact],
+        chunks: list[ContentChunk],
+    ) -> str:
+        """Summarize grouped facts as a short comma-separated list of names.
+
+        At most three non-empty feature names are listed; any further names
+        are reported as ``+N more``. When no fact yields a name, the number of
+        facts is reported instead.
+        """
+        labels = [
+            label
+            for label in (self._feature_name(fact, chunks) for fact in facts)
+            if label
+        ]
+        if not labels:
+            return f"{len(facts)} entry points"
+        shown = ", ".join(labels[:3])
+        hidden = len(labels) - 3
+        if hidden > 0:
+            return f"{shown}, +{hidden} more"
+        return shown
+
+    def _grouped_location(self, facts: list[ObservedFact]) -> str:
+        """Return the single location shared by ``facts``, if there is one.
+
+        Yields the one distinct non-empty path, ``"multiple files"`` when the
+        facts span more than one path, and an empty string when no fact has a
+        path.
+        """
+        distinct = {fact.path for fact in facts if fact.path}
+        if len(distinct) > 1:
+            return "multiple files"
+        return next(iter(distinct), "")
+
+    def _grouped_interface_confidence(self, facts: list[ObservedFact]) -> float:
+        """Return 0.7 when every grouped fact carries a value, otherwise 0.55."""
+        if all(fact.value for fact in facts):
+            return 0.7
+        return 0.55
+
    def _evidence(
        self,
        tests: list[ObservedFact],
--- a/src/repo_registry/core/service.py
+++ b/src/repo_registry/core/service.py
@@ -120,6 +120,7 @@ class RegistryService:
        repository_id: int,
        *,
        source_path: str | None = None,
+        use_cached_checkout: bool = False,
        access_username: str | None = None,
        access_password: str | None = None,
    ) -> ScanSummary:
@@ -134,12 +135,20 @@ class RegistryService:
        )
        try:
            if source_path is None:
-                checkout = self.ingestion.resolve(
-                    repository.url,
-                    branch=repository.branch,
-                    access_username=access_username,
-                    access_password=access_password,
-                )
+                if use_cached_checkout:
+                    checkout = self.ingestion.cached_checkout(repository.url)
+                    if checkout is None:
+                        raise RuntimeError(
+                            "cached checkout was requested, but no checkout exists "
+                            "for this repository"
+                        )
+                else:
+                    checkout = self.ingestion.resolve(
+                        repository.url,
+                        branch=repository.branch,
+                        access_username=access_username,
+                        access_password=access_password,
+                    )
                scan_source = checkout.source_path
            else:
                scan_source = source_path
--- a/src/repo_registry/repo_ingestion/git.py
+++ b/src/repo_registry/repo_ingestion/git.py
@@ -57,6 +57,16 @@ class GitIngestionService:
        )
        return Checkout(source_path=checkout_path.resolve(), was_cloned=True)

+    def cached_checkout(self, url_or_path: str) -> Checkout | None:
+        """Return a checkout that is already on disk, without creating one.
+
+        When ``self._local_path`` recognizes ``url_or_path`` as a local path,
+        that path is returned as a checkout that was not cloned. Otherwise the
+        checkout directory is looked up under ``self.checkout_root`` using
+        ``self._checkout_key``.
+
+        Returns:
+            The existing checkout, or ``None`` when no checkout directory
+            exists for ``url_or_path``.
+        """
+        local_path = self._local_path(url_or_path)
+        if local_path is not None:
+            return Checkout(source_path=local_path.resolve(), was_cloned=False)
+
+        checkout_path = self.checkout_root / self._checkout_key(url_or_path)
+        # A stray regular file at the checkout location is not a usable
+        # checkout, so require a directory rather than mere existence.
+        if not checkout_path.is_dir():
+            return None
+        return Checkout(source_path=checkout_path.resolve(), was_cloned=True)
+
    def _checkout_branch(
        self,
        checkout_path: Path,
--- a/src/repo_registry/web_api/app.py
+++ b/src/repo_registry/web_api/app.py
@@ -245,6 +245,7 @@ def create_analysis_run(
        summary = service.analyze_repository(
            repository_id,
            source_path=payload.source_path,
+            use_cached_checkout=payload.use_cached_checkout,
            access_username=payload.access_username,
            access_password=payload.access_password,
        )
--- a/src/repo_registry/web_api/schemas.py
+++ b/src/repo_registry/web_api/schemas.py
@@ -202,6 +202,7 @@ class EvidenceUpdate(BaseModel):

 class AnalysisRunCreate(BaseModel):
    source_path: str | None = None
+    use_cached_checkout: bool = False
    access_username: str | None = None
    access_password: str | None = Field(default=None, repr=False)

@@ -210,6 +211,7 @@ class AnalysisRunCreate(BaseModel):
            "examples": [
                {},
                {"source_path": "/path/to/local/repository"},
+                {"use_cached_checkout": True},
                {
                    "access_username": "git-user",
                    "access_password": "access-token",
--- a/src/repo_registry/web_ui/views.py
+++ b/src/repo_registry/web_ui/views.py
@@ -501,6 +501,7 @@ def repository_detail(
        <h2>Run Analysis</h2>
        <form class="stack" method="post" action="/ui/repos/{repository_id}/analysis-runs">
          <label>Override source path <input name="source_path" placeholder="Optional local path"></label>
+          <label class="checkbox"><input type="checkbox" name="use_cached_checkout" value="1"> Analyze cached checkout without fetching upstream</label>
          <label>Username <input name="access_username" autocomplete="username" placeholder="Optional for private HTTP(S) repos"></label>
          <label>Password or access token <input name="access_password" type="password" autocomplete="current-password" placeholder="Used for this Git operation only"></label>
          <div class="actions">
@@ -516,6 +517,7 @@ def repository_detail(
      </section>
      <section class="panel">
        <h2>Approved Ability Map</h2>
+        {render_graph_counts(asdict(ability_map), facts_count=None)}
        {render_ability_map(asdict(ability_map), repository_id)}
      </section>
    </div>
@@ -821,6 +823,7 @@ def delete_evidence_from_form(
 def create_analysis_run_from_form(
    repository_id: int,
    source_path: str = Form(""),
+    use_cached_checkout: str | None = Form(None),
    access_username: str = Form(""),
    access_password: str = Form(""),
    service: RegistryService = Depends(get_service),
@@ -828,6 +831,7 @@ def create_analysis_run_from_form(
    summary = service.analyze_repository(
        repository_id,
        source_path=source_path or None,
+        use_cached_checkout=bool(use_cached_checkout),
        access_username=access_username or None,
        access_password=access_password or None,
    )
@@ -869,6 +873,7 @@ def analysis_run_detail(
      <section class="panel">
        <div class="actions">
          <h2 style="margin-right:auto">Candidate Graph</h2>
+          {render_graph_counts(asdict(candidate_graph), facts_count=len(facts))}
          <form method="post" action="/ui/repos/{repository_id}/analysis-runs/{analysis_run_id}/candidate-graph/approve">
            <button type="submit">Approve</button>
          </form>
@@ -876,7 +881,10 @@ def analysis_run_detail(
        {render_candidate_graph(asdict(candidate_graph), repository_id, analysis_run_id)}
      </section>
      <section class="panel">
-        <h2>Observed Facts</h2>
+        <div class="actions">
+          <h2 style="margin-right:auto">Observed Facts</h2>
+          {render_count_pills(facts=len(facts))}
+        </div>
        <table>
          <thead><tr><th>Kind</th><th>Name</th><th>Path</th><th>Value</th></tr></thead>
          <tbody>{fact_rows or '<tr><td colspan="4" class="muted">No observed facts.</td></tr>'}</tbody>
@@ -1565,6 +1573,41 @@ def split_capability_lines(value: str) -> list[str]:
    return [line.strip() for line in normalized.splitlines() if line.strip()]


+def render_graph_counts(graph: dict, facts_count: int | None = None) -> str:
+    """Render count pills for the abilities, capabilities and features in a graph.
+
+    ``graph`` is read as a dict whose ``"abilities"`` entries each hold a
+    ``"capabilities"`` list, whose entries in turn hold a ``"features"`` list;
+    missing keys count as empty. A ``facts`` pill is appended only when
+    ``facts_count`` is given.
+    """
+    abilities = graph.get("abilities", [])
+    capability_total = 0
+    feature_total = 0
+    for ability in abilities:
+        for capability in ability.get("capabilities", []):
+            capability_total += 1
+            for _feature in capability.get("features", []):
+                feature_total += 1
+
+    totals: dict[str, int] = {
+        "abilities": len(abilities),
+        "capabilities": capability_total,
+        "features": feature_total,
+    }
+    if facts_count is not None:
+        totals["facts"] = facts_count
+    return render_count_pills(**totals)
+
+
+def render_count_pills(**counts: int) -> str:
+    """Render each named count as a ``<span class="pill">`` element.
+
+    Pills are emitted in keyword order and concatenated without separators.
+    Known names use their configured label; any other name falls back to the
+    keyword itself with underscores shown as spaces, so a new count does not
+    raise ``KeyError``. Labels are emitted as-is, so a count of 1 renders as
+    "1 abilities".
+
+    Returns:
+        The concatenated pill markup, or an empty string when no counts are
+        given.
+    """
+    labels = {
+        "abilities": "abilities",
+        "capabilities": "capabilities",
+        "features": "features",
+        "facts": "facts",
+    }
+    return "".join(
+        f'<span class="pill">{count} {labels.get(name, name.replace("_", " "))}</span>'
+        for name, count in counts.items()
+    )
+
+
 def render_candidate_graph(graph: dict, repository_id: int, analysis_run_id: int) -> str:
    abilities = graph.get("abilities", [])
    if not abilities:
--- a/tests/test_candidate_graph.py
+++ b/tests/test_candidate_graph.py
@@ -208,3 +208,39 @@ def test_candidate_generator_uses_generic_io_for_unknown_interfaces():
    capability = graph[0].capabilities[0]
    assert capability.inputs == ["caller input"]
    assert capability.outputs == ["callable interface result"]
+
+
+def test_candidate_generator_groups_many_interface_facts_into_behavioral_features():
+    """Several route facts from one file collapse into a single API feature."""
+    routes = [
+        '@app.get("/repos")',
+        '@app.post("/repos")',
+        '@app.post("/repos/{repository_id}/analysis-runs")',
+    ]
+    observed = [
+        fact(1, "documentation", "README", "README.md"),
+        *[
+            fact(fact_id, "interface", "python route decorator", "src/api.py", route)
+            for fact_id, route in enumerate(routes, start=2)
+        ],
+        fact(5, "test", "test_api.py", "tests/test_api.py"),
+    ]
+    repository = Repository(
+        id=1,
+        name="Registry",
+        url="/tmp/registry",
+        description=None,
+        branch="main",
+        status="analyzed",
+    )
+
+    graph = CandidateGraphGenerator().generate(repository, observed)
+
+    features = graph[0].capabilities[0].features
+    assert len(features) == 1
+    grouped = features[0]
+    expected_name = (
+        "HTTP API surface: GET /repos, POST /repos, POST /repos/{repository_id}/analysis-runs"
+    )
+    assert grouped.name == expected_name
+    assert grouped.type == "API"
+    assert grouped.location == "src/api.py"
+    # One source reference per grouped interface fact.
+    assert len(grouped.source_refs) == 3
--- a/tests/test_registry_service.py
+++ b/tests/test_registry_service.py
@@ -267,7 +267,6 @@ def test_search_filters_by_status_language_and_framework(tmp_path):
        "    return {}\n",
        encoding="utf-8",
    )
-
    service = make_service(tmp_path)
    repository = service.register_repository(name="Filterable", url=str(source))
    summary = service.analyze_repository(repository.id)
@@ -327,7 +326,7 @@ def test_fixture_breadth_python_cli_repo_extracts_reviewable_cli_claims(tmp_path
    assert summary.analysis_run.status == "completed"
    assert capability.name == "Expose Repository Interface"
    assert capability.features[0].type == "CLI"
-    assert capability.features[0].name == "CLI command main"
+    assert capability.features[0].name.startswith("CLI command surface:")
    assert capability.evidence[0].reference == "tests/test_cli.py"
    assert service.ability_map(repository.id).abilities == []

@@ -523,7 +522,6 @@ def test_analyze_repository_records_snapshot_and_observed_facts(tmp_path):
        "    return {}\n",
        encoding="utf-8",
    )
-
    service = make_service(tmp_path)
    repository = service.register_repository(
        name="Example",
@@ -660,6 +658,13 @@ def test_approve_candidate_graph_publishes_ability_map_once(tmp_path):
        "    return {}\n",
        encoding="utf-8",
    )
+    (source / "cli.py").write_text(
+        "import click\n\n"
+        "@click.command()\n"
+        "def health():\n"
+        "    click.echo('ok')\n",
+        encoding="utf-8",
+    )

    service = make_service(tmp_path)
    repository = service.register_repository(name="Example", url=str(source))
@@ -1150,6 +1155,13 @@ def test_merge_candidate_feature_and_evidence_omits_duplicate_leaves(tmp_path):
        "    return {}\n",
        encoding="utf-8",
    )
+    (source / "cli.py").write_text(
+        "import click\n\n"
+        "@click.command()\n"
+        "def health():\n"
+        "    click.echo('ok')\n",
+        encoding="utf-8",
+    )

    service = make_service(tmp_path)
    repository = service.register_repository(name="Merge Leaves", url=str(source))
@@ -1224,6 +1236,36 @@ def test_analyze_repository_clones_git_url_before_scanning(tmp_path):
    assert ("framework", "pytest", "requirements.txt") in fact_names


+def test_analyze_repository_can_use_cached_checkout_without_fetching(tmp_path, monkeypatch):
+    """Analysis with ``use_cached_checkout=True`` scans the existing checkout and never runs git."""
+    service = make_service(tmp_path)
+    remote_url = "https://example.com/private/repo.git"
+    # Pre-populate the checkout directory that is expected to match this URL.
+    checkout_dir = tmp_path / "checkouts" / "repo-b5d250ec3c59"
+    checkout_dir.mkdir(parents=True)
+    (checkout_dir / "README.md").write_text("# Cached Repo\n", encoding="utf-8")
+
+    def forbid_git(*args, **kwargs):
+        raise AssertionError("cached analysis should not run git")
+
+    monkeypatch.setattr(service.ingestion, "_run_git", forbid_git)
+    repository = service.register_repository(
+        name="Cached",
+        url=remote_url,
+        description="Already cloned.",
+    )
+
+    summary = service.analyze_repository(repository.id, use_cached_checkout=True)
+
+    assert summary.analysis_run.status == "completed"
+    assert summary.snapshot is not None
+    assert str(checkout_dir) == summary.snapshot.source_path
+    observed = {(fact.kind, fact.name, fact.path) for fact in summary.facts}
+    assert ("documentation", "README", "README.md") in observed
+
+
 def test_operational_logging_records_analysis_and_review_events(caplog, tmp_path):
    source = tmp_path / "repo"
    source.mkdir()
--- a/tests/test_web_api.py
+++ b/tests/test_web_api.py
@@ -1031,7 +1031,10 @@ def test_api_source_linked_candidate_and_repo_update_loop(tmp_path):
            for capability in ability["capabilities"]
            for feature in capability["features"]
        }
-        assert {"GET /status", "GET /ready"} <= second_features
+        assert any(
+            "GET /status" in feature_name and "GET /ready" in feature_name
+            for feature_name in second_features
+        )

        approved_after_reanalysis = client.get(f"/repos/{repository_id}/ability-map")
        assert approved_after_reanalysis.status_code == 200
@@ -1060,6 +1063,13 @@ def test_ui_register_analyze_and_approve_loop(tmp_path):
        "    return {}\n",
        encoding="utf-8",
    )
+    (source / "cli.py").write_text(
+        "import click\n\n"
+        "@click.command()\n"
+        "def status():\n"
+        "    click.echo('ok')\n",
+        encoding="utf-8",
+    )

    def override_settings():
        return Settings(
@@ -1095,6 +1105,7 @@ def test_ui_register_analyze_and_approve_loop(tmp_path):
        assert detail_response.status_code == 200
        assert "Run Analysis" in detail_response.text
        assert "Running analysis..." in detail_response.text
+        assert "Analyze cached checkout without fetching upstream" in detail_response.text
        assert "Repository Metadata" in detail_response.text

        edit_repository_response = client.post(
@@ -1127,6 +1138,10 @@ def test_ui_register_analyze_and_approve_loop(tmp_path):
        run_detail = client.get(run_path)
        assert run_detail.status_code == 200
        assert "Candidate Graph" in run_detail.text
+        assert "1 abilities" in run_detail.text
+        assert "2 capabilities" in run_detail.text
+        assert "2 features" in run_detail.text
+        assert "7 facts" in run_detail.text
        assert "Content Chunks" in run_detail.text
        assert "README.md:1-1" in run_detail.text
        assert "ID " in run_detail.text
@@ -1141,6 +1156,9 @@ def test_ui_register_analyze_and_approve_loop(tmp_path):
        approved_detail = client.get(approve_response.headers["location"])
        assert approved_detail.status_code == 200
        assert "Approved Ability Map" in approved_detail.text
+        assert "1 abilities" in approved_detail.text
+        assert "2 capabilities" in approved_detail.text
+        assert "2 features" in approved_detail.text
        assert "Review UI Repo Edited Repository Usefulness" in approved_detail.text
        assert "Language: Python" in approved_detail.text
        assert "Framework: FastAPI" in approved_detail.text
@@ -1635,7 +1653,6 @@ def test_api_rejects_candidate_capability_feature_and_evidence(tmp_path):
        "    return {}\n",
        encoding="utf-8",
    )
-
    def override_settings():
        return Settings(
            database_path=str(tmp_path / "api-reject.sqlite3"),
@@ -1723,7 +1740,6 @@ def test_api_relinks_candidate_feature_and_evidence(tmp_path):
        "    return {}\n",
        encoding="utf-8",
    )
-
    def override_settings():
        return Settings(
            database_path=str(tmp_path / "api-relink.sqlite3"),
@@ -1798,6 +1814,13 @@ def test_api_merges_candidate_capability_feature_and_evidence(tmp_path):
        "    return {}\n",
        encoding="utf-8",
    )
+    (source / "cli.py").write_text(
+        "import click\n\n"
+        "@click.command()\n"
+        "def status_cli():\n"
+        "    click.echo('ok')\n",
+        encoding="utf-8",
+    )

    def override_settings():
        return Settings(
--- a/workplans/RREG-WP-0003-automatic-repository-exploration.md
+++ b/workplans/RREG-WP-0003-automatic-repository-exploration.md
@@ -45,7 +45,7 @@ analysis run in one flow and land on the reviewable candidate graph.

 ```task
 id: RREG-WP-0003-T02
-status: todo
+status: done
 priority: high
 state_hub_task_id: "d0d98e1b-8d21-4bdf-af58-edbb34e8a929"
 ```