Git ingestion part of Milestone 2

This commit is contained in:
2026-04-25 22:37:06 +02:00
parent 3d9032a386
commit ef41a9974a
8 changed files with 185 additions and 7 deletions

View File

@@ -49,7 +49,7 @@ curl 'http://127.0.0.1:8000/search?q=classify'
## Deterministic Analysis
For local development, repository URLs may be local filesystem paths. Trigger a deterministic scan:
For local development, repository URLs may be local filesystem paths. Git URLs, including `file://` URLs, are cloned into `var/checkouts` before scanning. Trigger a deterministic scan:
```bash
curl -X POST http://127.0.0.1:8000/repos/1/analysis-runs \

View File

@@ -10,6 +10,7 @@ from repo_registry.core.models import (
ScanSummary,
SearchResult,
)
from repo_registry.repo_ingestion.git import GitIngestionService
from repo_registry.repo_scanning.scanner import DeterministicScanner
from repo_registry.storage.sqlite import RegistryStore
@@ -17,9 +18,14 @@ from repo_registry.storage.sqlite import RegistryStore
class RegistryService:
"""Application service for the manual registry MVP."""
def __init__(self, store: RegistryStore) -> None:
def __init__(
self,
store: RegistryStore,
ingestion: GitIngestionService | None = None,
) -> None:
self.store = store
self.scanner = DeterministicScanner()
self.ingestion = ingestion or GitIngestionService()
def register_repository(
self,
@@ -52,7 +58,12 @@ class RegistryService:
run = self.store.create_analysis_run(repository_id)
self.store.update_repository_status(repository_id, "analyzing")
try:
scan_result = self.scanner.scan(source_path or repository.url)
if source_path is None:
checkout = self.ingestion.resolve(repository.url, branch=repository.branch)
scan_source = checkout.source_path
else:
scan_source = source_path
scan_result = self.scanner.scan(scan_source)
except Exception as exc:
failed_run = self.store.fail_analysis_run(repository_id, run.id, str(exc))
return ScanSummary(analysis_run=failed_run, snapshot=None, facts=[])

View File

@@ -0,0 +1 @@
"""Repository checkout and ingestion helpers."""

View File

@@ -0,0 +1,76 @@
from __future__ import annotations
import hashlib
import shutil
import subprocess
from dataclasses import dataclass
from pathlib import Path
from urllib.parse import urlparse
# Immutable result of resolving a repository reference to a directory on disk.
@dataclass(frozen=True)
class Checkout:
    # Directory that should be scanned.
    source_path: Path
    # True when the directory is a clone managed under the checkout root,
    # False when an existing local path is being used in place.
    was_cloned: bool
class GitIngestionService:
    """Resolve a repository reference to a directory on disk that can be scanned.

    A reference that names an existing local filesystem path is used in place.
    Anything else is handed to ``git clone`` (or refreshed with ``git fetch``
    when a clone already exists) under ``checkout_root``.
    """

    def __init__(self, checkout_root: str | Path = "var/checkouts") -> None:
        """Remember where clones are stored.

        The directory itself is only created when the first clone is needed.
        """
        self.checkout_root = Path(checkout_root)

    def resolve(self, url_or_path: str, *, branch: str = "main") -> Checkout:
        """Return a checkout for ``url_or_path``, cloning or updating if needed.

        Args:
            url_or_path: An existing local path, or a reference git can clone.
                A scheme-less value that does not exist on disk is still passed
                to ``git clone`` so that git reports the failure.
            branch: Branch to check out in a cloned repository. Ignored for
                local paths; an empty value skips the explicit checkout.

        Returns:
            A ``Checkout`` whose ``was_cloned`` flag is ``False`` for local
            paths and ``True`` for clones under ``checkout_root``.

        Raises:
            ValueError: If ``url_or_path`` is blank, or ``branch`` starts with
                ``-`` (it would be parsed by git as a command-line option).
            RuntimeError: If git is not installed or a git command fails.
            subprocess.TimeoutExpired: If a git command exceeds the timeout.
        """
        # An empty string would otherwise become Path(""), i.e. the current
        # working directory, and the service would silently scan itself.
        if not url_or_path or not url_or_path.strip():
            raise ValueError("repository URL or path must not be empty")

        local_path = self._local_path(url_or_path)
        if local_path is not None:
            return Checkout(source_path=local_path.resolve(), was_cloned=False)

        # Validate before touching the filesystem or spawning git: the branch
        # is user-supplied and is passed as a positional argument to git.
        if branch and branch.startswith("-"):
            raise ValueError(f"invalid branch name: {branch!r}")

        checkout_path = self.checkout_root / self._checkout_key(url_or_path)
        self.checkout_root.mkdir(parents=True, exist_ok=True)

        if checkout_path.exists():
            self._run_git(["fetch", "--all", "--prune"], cwd=checkout_path)
        else:
            # "--" ends option parsing, so a reference that begins with "-"
            # is treated as a repository and never as a git option.
            self._run_git(
                ["clone", "--", url_or_path, str(checkout_path)],
                cwd=None,
            )

        self._checkout_branch(checkout_path, branch)
        return Checkout(source_path=checkout_path.resolve(), was_cloned=True)

    def _checkout_branch(self, checkout_path: Path, branch: str) -> None:
        """Switch the clone to ``branch`` (when given) and fast-forward it.

        The fast-forward pull runs even when ``branch`` is empty, in which case
        it updates whatever the clone currently has checked out.
        """
        if branch:
            self._run_git(["checkout", branch], cwd=checkout_path)
        self._run_git(["pull", "--ff-only"], cwd=checkout_path)

    def _local_path(self, value: str) -> Path | None:
        """Return ``value`` as a path if it names something that exists locally.

        Values with a URL scheme (``https://``, ``file://``, ...) are never
        treated as local paths, so they always go through git.
        """
        parsed = urlparse(value)
        if parsed.scheme:
            return None

        path = Path(value).expanduser()
        if path.exists():
            return path
        return None

    def _checkout_key(self, url: str) -> str:
        """Build the directory name used for the clone of ``url``.

        The name is the sanitized last path segment (without a ``.git``
        suffix) followed by the first 12 hex digits of the SHA-256 of the full
        reference, so repositories with the same name get distinct directories.
        """
        parsed = urlparse(url)
        name = Path(parsed.path.rstrip("/")).name or "repository"
        if name.endswith(".git"):
            name = name[:-4]
        digest = hashlib.sha256(url.encode("utf-8")).hexdigest()[:12]
        return f"{self._safe_name(name)}-{digest}"

    def _safe_name(self, value: str) -> str:
        """Replace every character other than alphanumerics, ``-`` and ``_``.

        Falls back to ``"repository"`` when nothing usable remains.
        """
        safe = "".join(
            char if char.isalnum() or char in "-_" else "-" for char in value
        )
        return safe.strip("-") or "repository"

    def _run_git(self, args: list[str], *, cwd: Path | None) -> None:
        """Run ``git <args>`` and raise ``RuntimeError`` on a non-zero exit.

        The command is given as an argument list (no shell). A run that exceeds
        the 120 second timeout raises ``subprocess.TimeoutExpired`` unchanged.
        """
        if shutil.which("git") is None:
            raise RuntimeError("git executable was not found")

        result = subprocess.run(
            ["git", *args],
            cwd=cwd,
            check=False,
            capture_output=True,
            text=True,
            timeout=120,
        )
        if result.returncode != 0:
            # git normally reports errors on stderr; fall back to stdout.
            message = result.stderr.strip() or result.stdout.strip()
            raise RuntimeError(f"git {' '.join(args)} failed: {message}")

View File

@@ -7,11 +7,13 @@ from fastapi import Depends, FastAPI, HTTPException
from pydantic import BaseModel, Field
from repo_registry.core.service import RegistryService
from repo_registry.repo_ingestion.git import GitIngestionService
from repo_registry.storage.sqlite import NotFoundError, RegistryStore
# Runtime configuration for the web API; every field has a default.
class Settings(BaseModel):
    # Location of the SQLite database file backing the registry store.
    database_path: str = Field(default="var/repo-registry.sqlite3")
    # Directory handed to GitIngestionService as the root for cloned repositories.
    checkout_root: str = Field(default="var/checkouts")
def get_settings() -> Settings:
@@ -23,7 +25,7 @@ def get_service(settings: Settings = Depends(get_settings)) -> RegistryService:
database_path.parent.mkdir(parents=True, exist_ok=True)
store = RegistryStore(database_path)
store.initialize()
return RegistryService(store)
return RegistryService(store, ingestion=GitIngestionService(settings.checkout_root))
class RepositoryCreate(BaseModel):

View File

@@ -0,0 +1,47 @@
import subprocess
from repo_registry.repo_ingestion.git import GitIngestionService
def run(command, cwd):
    """Execute ``command`` inside ``cwd``, raising if it exits non-zero.

    Output is captured as text so it does not leak into the test log.
    """
    options = {
        "cwd": cwd,
        "text": True,
        "capture_output": True,
        "check": True,
    }
    subprocess.run(command, **options)
def make_git_repo(path):
    """Create a one-commit Git repository on branch ``main`` at ``path``.

    The repository gets a local test identity and two tracked files,
    ``README.md`` and ``app.py``.
    """
    path.mkdir()

    # Initialise the repository and give it an identity so commits succeed.
    setup_commands = (
        ["git", "init", "-b", "main"],
        ["git", "config", "user.email", "tests@example.com"],
        ["git", "config", "user.name", "Tests"],
    )
    for command in setup_commands:
        run(command, path)

    # Working-tree content for the initial commit.
    files = (
        ("README.md", "# Clone Me\n"),
        ("app.py", "print('ok')\n"),
    )
    for filename, content in files:
        (path / filename).write_text(content, encoding="utf-8")

    for command in (["git", "add", "."], ["git", "commit", "-m", "initial"]):
        run(command, path)
def test_ingestion_keeps_local_paths_local(tmp_path):
    """An existing directory is returned resolved and is not cloned."""
    local_dir = tmp_path / "source"
    local_dir.mkdir()
    service = GitIngestionService(tmp_path / "checkouts")

    result = service.resolve(str(local_dir))

    expected_path = local_dir.resolve()
    assert result.source_path == expected_path
    assert result.was_cloned is False
def test_ingestion_clones_file_url(tmp_path):
    """A ``file://`` URL is cloned into the checkout root on branch ``main``."""
    origin = tmp_path / "source"
    make_git_repo(origin)
    service = GitIngestionService(tmp_path / "checkouts")

    result = service.resolve(origin.as_uri())

    assert result.was_cloned is True
    assert result.source_path != origin.resolve()
    readme = result.source_path / "README.md"
    assert readme.exists()

    # Ask git which branch the clone ended up on.
    completed = subprocess.run(
        ["git", "branch", "--show-current"],
        cwd=result.source_path,
        check=True,
        capture_output=True,
        text=True,
    )
    current_branch = completed.stdout.strip()
    assert current_branch == "main"

View File

@@ -1,11 +1,14 @@
import subprocess
from repo_registry.core.service import RegistryService
from repo_registry.repo_ingestion.git import GitIngestionService
from repo_registry.storage.sqlite import NotFoundError, RegistryStore
def make_service(tmp_path):
store = RegistryStore(tmp_path / "registry.sqlite3")
store.initialize()
return RegistryService(store)
return RegistryService(store, ingestion=GitIngestionService(tmp_path / "checkouts"))
def test_manual_registry_builds_ability_map(tmp_path):
@@ -144,3 +147,35 @@ def test_analyze_repository_failure_is_recorded(tmp_path):
assert summary.snapshot is None
assert "does not exist" in (summary.analysis_run.error_message or "")
assert service.get_repository(repository.id).status == "analysis_failed"
def test_analyze_repository_clones_git_url_before_scanning(tmp_path):
    """Analysing a ``file://`` repository scans the clone, not the origin."""
    origin = tmp_path / "git-source"
    origin.mkdir()

    def git(*args):
        # Run one git command inside the origin repository.
        subprocess.run(["git", *args], cwd=origin, check=True)

    git("init", "-b", "main")
    git("config", "user.email", "tests@example.com")
    git("config", "user.name", "Tests")

    tracked_files = (
        ("README.md", "# Git Source\n"),
        ("requirements.txt", "pytest\n"),
    )
    for filename, content in tracked_files:
        (origin / filename).write_text(content, encoding="utf-8")

    git("add", ".")
    git("commit", "-m", "initial")

    registry = make_service(tmp_path)
    registered = registry.register_repository(
        name="Git Source",
        url=origin.as_uri(),
    )

    outcome = registry.analyze_repository(registered.id)

    assert outcome.analysis_run.status == "completed"
    assert outcome.snapshot is not None
    # The snapshot must point into the managed checkout directory.
    checkout_root = str(tmp_path / "checkouts")
    assert checkout_root in outcome.snapshot.source_path

    observed = {(fact.kind, fact.name, fact.path) for fact in outcome.facts}
    assert ("documentation", "README", "README.md") in observed
    assert ("framework", "pytest", "requirements.txt") in observed

View File

@@ -5,7 +5,10 @@ from repo_registry.web_api.app import Settings, app, get_settings
def test_api_manual_registry_loop(tmp_path):
def override_settings():
return Settings(database_path=str(tmp_path / "api.sqlite3"))
return Settings(
database_path=str(tmp_path / "api.sqlite3"),
checkout_root=str(tmp_path / "checkouts"),
)
app.dependency_overrides[get_settings] = override_settings
client = TestClient(app)
@@ -79,7 +82,10 @@ def test_api_analysis_run_loop(tmp_path):
)
def override_settings():
return Settings(database_path=str(tmp_path / "api-analysis.sqlite3"))
return Settings(
database_path=str(tmp_path / "api-analysis.sqlite3"),
checkout_root=str(tmp_path / "api-checkouts"),
)
app.dependency_overrides[get_settings] = override_settings
client = TestClient(app)