Repo stats and features as aggregates

This commit is contained in:
2026-04-28 03:01:10 +02:00
parent c0a044fa0b
commit 2313e8675e
10 changed files with 258 additions and 24 deletions

View File

@@ -125,16 +125,7 @@ class CandidateGraphGenerator:
docs: list[ObservedFact],
chunks: list[ContentChunk],
) -> CandidateCapabilityDraft:
features = [
CandidateFeatureDraft(
name=self._feature_name(fact, chunks),
type=self._feature_type(fact),
location=fact.path,
confidence=0.65 if fact.value else 0.45,
source_refs=self._source_refs([fact]),
)
for fact in interfaces
]
features = self._interface_features(interfaces, chunks)
return CandidateCapabilityDraft(
name="Expose Repository Interface",
description=self._interface_description(chunks),
@@ -151,6 +142,83 @@ class CandidateGraphGenerator:
evidence=self._evidence(tests, examples, docs),
)
def _interface_features(
    self,
    interfaces: list[ObservedFact],
    chunks: list[ContentChunk],
) -> list[CandidateFeatureDraft]:
    """Turn interface facts into one candidate feature per feature type.

    Facts are bucketed by ``self._feature_type``. A bucket holding a single
    fact yields a feature named after that fact and located at its path; a
    bucket holding several facts yields one aggregated feature that
    references all of them. Features come back in sorted feature-type order.
    """
    grouped: dict[str, list[ObservedFact]] = {}
    for observed in interfaces:
        kind = self._feature_type(observed)
        if kind not in grouped:
            grouped[kind] = []
        grouped[kind].append(observed)

    drafts: list[CandidateFeatureDraft] = []
    for kind in sorted(grouped):
        members = grouped[kind]
        if len(members) != 1:
            # Several facts of the same type: summarise them as one surface.
            draft = CandidateFeatureDraft(
                name=self._grouped_interface_feature_name(kind, members, chunks),
                type=kind,
                location=self._grouped_location(members),
                confidence=self._grouped_interface_confidence(members),
                source_refs=self._source_refs(members),
            )
        else:
            # A lone fact keeps its own name, path and per-fact confidence.
            only = members[0]
            draft = CandidateFeatureDraft(
                name=self._feature_name(only, chunks),
                type=kind,
                location=only.path,
                confidence=0.65 if only.value else 0.45,
                source_refs=self._source_refs([only]),
            )
        drafts.append(draft)
    return drafts
def _grouped_interface_feature_name(
self,
feature_type: str,
facts: list[ObservedFact],
chunks: list[ContentChunk],
) -> str:
summary = self._grouped_interface_summary(facts, chunks)
if feature_type == "API":
return f"HTTP API surface: {summary}"
if feature_type == "CLI":
return f"CLI command surface: {summary}"
return f"Callable interface surface: {summary}"
def _grouped_interface_summary(
self,
facts: list[ObservedFact],
chunks: list[ContentChunk],
) -> str:
names = [self._feature_name(fact, chunks) for fact in facts]
compact_names = [name for name in names if name]
if not compact_names:
return f"{len(facts)} entry points"
visible = compact_names[:3]
suffix = f", +{len(compact_names) - 3} more" if len(compact_names) > 3 else ""
return f"{', '.join(visible)}{suffix}"
def _grouped_location(self, facts: list[ObservedFact]) -> str:
paths = sorted({fact.path for fact in facts if fact.path})
if not paths:
return ""
if len(paths) == 1:
return paths[0]
return "multiple files"
def _grouped_interface_confidence(self, facts: list[ObservedFact]) -> float:
valued = sum(1 for fact in facts if fact.value)
return 0.7 if valued == len(facts) else 0.55
def _evidence(
self,
tests: list[ObservedFact],

View File

@@ -120,6 +120,7 @@ class RegistryService:
repository_id: int,
*,
source_path: str | None = None,
use_cached_checkout: bool = False,
access_username: str | None = None,
access_password: str | None = None,
) -> ScanSummary:
@@ -134,12 +135,20 @@ class RegistryService:
)
try:
if source_path is None:
checkout = self.ingestion.resolve(
repository.url,
branch=repository.branch,
access_username=access_username,
access_password=access_password,
)
if use_cached_checkout:
checkout = self.ingestion.cached_checkout(repository.url)
if checkout is None:
raise RuntimeError(
"cached checkout was requested, but no checkout exists "
"for this repository"
)
else:
checkout = self.ingestion.resolve(
repository.url,
branch=repository.branch,
access_username=access_username,
access_password=access_password,
)
scan_source = checkout.source_path
else:
scan_source = source_path

View File

@@ -57,6 +57,16 @@ class GitIngestionService:
)
return Checkout(source_path=checkout_path.resolve(), was_cloned=True)
def cached_checkout(self, url_or_path: str) -> Checkout | None:
    """Look up an existing checkout without cloning or fetching here.

    Args:
        url_or_path: Repository URL or local path, as registered.

    Returns:
        A ``Checkout`` with ``was_cloned=False`` when ``self._local_path``
        yields a path for the argument; a ``Checkout`` with
        ``was_cloned=True`` when a checkout directory for the argument
        exists under ``self.checkout_root``; ``None`` when no such
        directory is present.
    """
    local_path = self._local_path(url_or_path)
    if local_path is not None:
        return Checkout(source_path=local_path.resolve(), was_cloned=False)
    checkout_path = self.checkout_root / self._checkout_key(url_or_path)
    # Only a directory can be a checkout; a stray regular file sitting at
    # the checkout path must not be reported as a usable cached checkout.
    if not checkout_path.is_dir():
        return None
    return Checkout(source_path=checkout_path.resolve(), was_cloned=True)
def _checkout_branch(
self,
checkout_path: Path,

View File

@@ -245,6 +245,7 @@ def create_analysis_run(
summary = service.analyze_repository(
repository_id,
source_path=payload.source_path,
use_cached_checkout=payload.use_cached_checkout,
access_username=payload.access_username,
access_password=payload.access_password,
)

View File

@@ -202,6 +202,7 @@ class EvidenceUpdate(BaseModel):
class AnalysisRunCreate(BaseModel):
source_path: str | None = None
use_cached_checkout: bool = False
access_username: str | None = None
access_password: str | None = Field(default=None, repr=False)
@@ -210,6 +211,7 @@ class AnalysisRunCreate(BaseModel):
"examples": [
{},
{"source_path": "/path/to/local/repository"},
{"use_cached_checkout": True},
{
"access_username": "git-user",
"access_password": "access-token",

View File

@@ -501,6 +501,7 @@ def repository_detail(
<h2>Run Analysis</h2>
<form class="stack" method="post" action="/ui/repos/{repository_id}/analysis-runs">
<label>Override source path <input name="source_path" placeholder="Optional local path"></label>
<label class="checkbox"><input type="checkbox" name="use_cached_checkout" value="1"> Analyze cached checkout without fetching upstream</label>
<label>Username <input name="access_username" autocomplete="username" placeholder="Optional for private HTTP(S) repos"></label>
<label>Password or access token <input name="access_password" type="password" autocomplete="current-password" placeholder="Used for this Git operation only"></label>
<div class="actions">
@@ -516,6 +517,7 @@ def repository_detail(
</section>
<section class="panel">
<h2>Approved Ability Map</h2>
{render_graph_counts(asdict(ability_map), facts_count=None)}
{render_ability_map(asdict(ability_map), repository_id)}
</section>
</div>
@@ -821,6 +823,7 @@ def delete_evidence_from_form(
def create_analysis_run_from_form(
repository_id: int,
source_path: str = Form(""),
use_cached_checkout: str | None = Form(None),
access_username: str = Form(""),
access_password: str = Form(""),
service: RegistryService = Depends(get_service),
@@ -828,6 +831,7 @@ def create_analysis_run_from_form(
summary = service.analyze_repository(
repository_id,
source_path=source_path or None,
use_cached_checkout=bool(use_cached_checkout),
access_username=access_username or None,
access_password=access_password or None,
)
@@ -869,6 +873,7 @@ def analysis_run_detail(
<section class="panel">
<div class="actions">
<h2 style="margin-right:auto">Candidate Graph</h2>
{render_graph_counts(asdict(candidate_graph), facts_count=len(facts))}
<form method="post" action="/ui/repos/{repository_id}/analysis-runs/{analysis_run_id}/candidate-graph/approve">
<button type="submit">Approve</button>
</form>
@@ -876,7 +881,10 @@ def analysis_run_detail(
{render_candidate_graph(asdict(candidate_graph), repository_id, analysis_run_id)}
</section>
<section class="panel">
<h2>Observed Facts</h2>
<div class="actions">
<h2 style="margin-right:auto">Observed Facts</h2>
{render_count_pills(facts=len(facts))}
</div>
<table>
<thead><tr><th>Kind</th><th>Name</th><th>Path</th><th>Value</th></tr></thead>
<tbody>{fact_rows or '<tr><td colspan="4" class="muted">No observed facts.</td></tr>'}</tbody>
@@ -1565,6 +1573,41 @@ def split_capability_lines(value: str) -> list[str]:
return [line.strip() for line in normalized.splitlines() if line.strip()]
def render_graph_counts(graph: dict, facts_count: int | None = None) -> str:
    """Render count pills for a graph dictionary.

    Counts the entries under ``graph["abilities"]``, the capabilities nested
    in each ability and the features nested in each capability (missing keys
    count as empty). A "facts" pill is appended only when ``facts_count`` is
    not ``None``. The markup itself comes from ``render_count_pills``.
    """
    abilities = graph.get("abilities", [])
    capability_total = 0
    feature_total = 0
    for ability in abilities:
        for capability in ability.get("capabilities", []):
            capability_total += 1
            feature_total += sum(1 for _ in capability.get("features", []))

    totals: dict[str, int] = {
        "abilities": len(abilities),
        "capabilities": capability_total,
        "features": feature_total,
    }
    if facts_count is None:
        return render_count_pills(**totals)
    return render_count_pills(**totals, facts=facts_count)
def render_count_pills(**counts: int) -> str:
    """Render each keyword count as a ``<span class="pill">`` element.

    Pills are emitted in keyword order with no separator, each reading
    ``"<count> <label>"``. Known names use the label table below; any other
    name falls back to the keyword itself, so a new kind of count can be
    rendered without editing this helper first.

    Returns:
        The concatenated pill markup, or an empty string when no counts are
        given.
    """
    labels = {
        "abilities": "abilities",
        "capabilities": "capabilities",
        "features": "features",
        "facts": "facts",
    }
    return "".join(
        f'<span class="pill">{count} {labels.get(name, name)}</span>'
        for name, count in counts.items()
    )
def render_candidate_graph(graph: dict, repository_id: int, analysis_run_id: int) -> str:
abilities = graph.get("abilities", [])
if not abilities:

View File

@@ -208,3 +208,39 @@ def test_candidate_generator_uses_generic_io_for_unknown_interfaces():
capability = graph[0].capabilities[0]
assert capability.inputs == ["caller input"]
assert capability.outputs == ["callable interface result"]
def test_candidate_generator_groups_many_interface_facts_into_behavioral_features():
    """Three route-decorator facts from one file collapse into one API feature."""
    route_values = [
        '@app.get("/repos")',
        '@app.post("/repos")',
        '@app.post("/repos/{repository_id}/analysis-runs")',
    ]
    observed = [fact(1, "documentation", "README", "README.md")]
    # Interface facts take ids 2..4, all declared in the same source file.
    observed += [
        fact(fact_id, "interface", "python route decorator", "src/api.py", value)
        for fact_id, value in enumerate(route_values, start=2)
    ]
    observed.append(fact(5, "test", "test_api.py", "tests/test_api.py"))
    repository = Repository(
        id=1,
        name="Registry",
        url="/tmp/registry",
        description=None,
        branch="main",
        status="analyzed",
    )

    graph = CandidateGraphGenerator().generate(repository, observed)

    features = graph[0].capabilities[0].features
    assert len(features) == 1
    grouped = features[0]
    expected_name = (
        "HTTP API surface: GET /repos, POST /repos, POST /repos/{repository_id}/analysis-runs"
    )
    assert grouped.name == expected_name
    assert grouped.type == "API"
    assert grouped.location == "src/api.py"
    assert len(grouped.source_refs) == 3

View File

@@ -267,7 +267,6 @@ def test_search_filters_by_status_language_and_framework(tmp_path):
" return {}\n",
encoding="utf-8",
)
service = make_service(tmp_path)
repository = service.register_repository(name="Filterable", url=str(source))
summary = service.analyze_repository(repository.id)
@@ -327,7 +326,7 @@ def test_fixture_breadth_python_cli_repo_extracts_reviewable_cli_claims(tmp_path
assert summary.analysis_run.status == "completed"
assert capability.name == "Expose Repository Interface"
assert capability.features[0].type == "CLI"
assert capability.features[0].name == "CLI command main"
assert capability.features[0].name.startswith("CLI command surface:")
assert capability.evidence[0].reference == "tests/test_cli.py"
assert service.ability_map(repository.id).abilities == []
@@ -523,7 +522,6 @@ def test_analyze_repository_records_snapshot_and_observed_facts(tmp_path):
" return {}\n",
encoding="utf-8",
)
service = make_service(tmp_path)
repository = service.register_repository(
name="Example",
@@ -660,6 +658,13 @@ def test_approve_candidate_graph_publishes_ability_map_once(tmp_path):
" return {}\n",
encoding="utf-8",
)
(source / "cli.py").write_text(
"import click\n\n"
"@click.command()\n"
"def health():\n"
" click.echo('ok')\n",
encoding="utf-8",
)
service = make_service(tmp_path)
repository = service.register_repository(name="Example", url=str(source))
@@ -1150,6 +1155,13 @@ def test_merge_candidate_feature_and_evidence_omits_duplicate_leaves(tmp_path):
" return {}\n",
encoding="utf-8",
)
(source / "cli.py").write_text(
"import click\n\n"
"@click.command()\n"
"def health():\n"
" click.echo('ok')\n",
encoding="utf-8",
)
service = make_service(tmp_path)
repository = service.register_repository(name="Merge Leaves", url=str(source))
@@ -1224,6 +1236,36 @@ def test_analyze_repository_clones_git_url_before_scanning(tmp_path):
assert ("framework", "pytest", "requirements.txt") in fact_names
def test_analyze_repository_can_use_cached_checkout_without_fetching(tmp_path, monkeypatch):
    """Analysing with ``use_cached_checkout=True`` scans the existing checkout and never calls git."""

    def forbid_git(*args, **kwargs):
        raise AssertionError("cached analysis should not run git")

    service = make_service(tmp_path)
    remote_url = "https://example.com/private/repo.git"
    # NOTE(review): presumably the directory name the ingestion service
    # derives for this URL — confirm against its checkout-key helper.
    checkout_dir = tmp_path / "checkouts" / "repo-b5d250ec3c59"
    checkout_dir.mkdir(parents=True)
    (checkout_dir / "README.md").write_text("# Cached Repo\n", encoding="utf-8")
    # Any attempt to shell out to git fails the test immediately.
    monkeypatch.setattr(service.ingestion, "_run_git", forbid_git)

    repository = service.register_repository(
        name="Cached",
        url=remote_url,
        description="Already cloned.",
    )
    summary = service.analyze_repository(repository.id, use_cached_checkout=True)

    assert summary.analysis_run.status == "completed"
    assert summary.snapshot is not None
    assert str(checkout_dir) == summary.snapshot.source_path
    observed = {(item.kind, item.name, item.path) for item in summary.facts}
    assert ("documentation", "README", "README.md") in observed
def test_operational_logging_records_analysis_and_review_events(caplog, tmp_path):
source = tmp_path / "repo"
source.mkdir()

View File

@@ -1031,7 +1031,10 @@ def test_api_source_linked_candidate_and_repo_update_loop(tmp_path):
for capability in ability["capabilities"]
for feature in capability["features"]
}
assert {"GET /status", "GET /ready"} <= second_features
assert any(
"GET /status" in feature_name and "GET /ready" in feature_name
for feature_name in second_features
)
approved_after_reanalysis = client.get(f"/repos/{repository_id}/ability-map")
assert approved_after_reanalysis.status_code == 200
@@ -1060,6 +1063,13 @@ def test_ui_register_analyze_and_approve_loop(tmp_path):
" return {}\n",
encoding="utf-8",
)
(source / "cli.py").write_text(
"import click\n\n"
"@click.command()\n"
"def status():\n"
" click.echo('ok')\n",
encoding="utf-8",
)
def override_settings():
return Settings(
@@ -1095,6 +1105,7 @@ def test_ui_register_analyze_and_approve_loop(tmp_path):
assert detail_response.status_code == 200
assert "Run Analysis" in detail_response.text
assert "Running analysis..." in detail_response.text
assert "Analyze cached checkout without fetching upstream" in detail_response.text
assert "Repository Metadata" in detail_response.text
edit_repository_response = client.post(
@@ -1127,6 +1138,10 @@ def test_ui_register_analyze_and_approve_loop(tmp_path):
run_detail = client.get(run_path)
assert run_detail.status_code == 200
assert "Candidate Graph" in run_detail.text
assert "1 abilities" in run_detail.text
assert "2 capabilities" in run_detail.text
assert "2 features" in run_detail.text
assert "7 facts" in run_detail.text
assert "Content Chunks" in run_detail.text
assert "README.md:1-1" in run_detail.text
assert "ID " in run_detail.text
@@ -1141,6 +1156,9 @@ def test_ui_register_analyze_and_approve_loop(tmp_path):
approved_detail = client.get(approve_response.headers["location"])
assert approved_detail.status_code == 200
assert "Approved Ability Map" in approved_detail.text
assert "1 abilities" in approved_detail.text
assert "2 capabilities" in approved_detail.text
assert "2 features" in approved_detail.text
assert "Review UI Repo Edited Repository Usefulness" in approved_detail.text
assert "Language: Python" in approved_detail.text
assert "Framework: FastAPI" in approved_detail.text
@@ -1635,7 +1653,6 @@ def test_api_rejects_candidate_capability_feature_and_evidence(tmp_path):
" return {}\n",
encoding="utf-8",
)
def override_settings():
return Settings(
database_path=str(tmp_path / "api-reject.sqlite3"),
@@ -1723,7 +1740,6 @@ def test_api_relinks_candidate_feature_and_evidence(tmp_path):
" return {}\n",
encoding="utf-8",
)
def override_settings():
return Settings(
database_path=str(tmp_path / "api-relink.sqlite3"),
@@ -1798,6 +1814,13 @@ def test_api_merges_candidate_capability_feature_and_evidence(tmp_path):
" return {}\n",
encoding="utf-8",
)
(source / "cli.py").write_text(
"import click\n\n"
"@click.command()\n"
"def status_cli():\n"
" click.echo('ok')\n",
encoding="utf-8",
)
def override_settings():
return Settings(

View File

@@ -45,7 +45,7 @@ analysis run in one flow and land on the reviewable candidate graph.
```task
id: RREG-WP-0003-T02
status: todo
status: done
priority: high
state_hub_task_id: "d0d98e1b-8d21-4bdf-af58-edbb34e8a929"
```