Git ingestion part of Milestone 2

This commit is contained in:
2026-04-25 22:37:06 +02:00
parent 3d9032a386
commit ef41a9974a
8 changed files with 185 additions and 7 deletions

View File

@@ -10,6 +10,7 @@ from repo_registry.core.models import (
ScanSummary,
SearchResult,
)
from repo_registry.repo_ingestion.git import GitIngestionService
from repo_registry.repo_scanning.scanner import DeterministicScanner
from repo_registry.storage.sqlite import RegistryStore
@@ -17,9 +18,14 @@ from repo_registry.storage.sqlite import RegistryStore
class RegistryService:
"""Application service for the manual registry MVP."""
def __init__(self, store: RegistryStore) -> None:
def __init__(
    self,
    store: RegistryStore,
    ingestion: GitIngestionService | None = None,
) -> None:
    """Wire the service to its store, scanner and git ingestion helper.

    Args:
        store: Registry persistence backend used by every operation.
        ingestion: Helper that turns a repository URL into a local
            checkout. A default ``GitIngestionService()`` is created when
            a falsy value (such as ``None``) is supplied.
    """
    self.store = store
    self.scanner = DeterministicScanner()
    # Fall back to the default ingestion service only when none was injected.
    if not ingestion:
        ingestion = GitIngestionService()
    self.ingestion = ingestion
def register_repository(
self,
@@ -52,7 +58,12 @@ class RegistryService:
run = self.store.create_analysis_run(repository_id)
self.store.update_repository_status(repository_id, "analyzing")
try:
scan_result = self.scanner.scan(source_path or repository.url)
if source_path is None:
checkout = self.ingestion.resolve(repository.url, branch=repository.branch)
scan_source = checkout.source_path
else:
scan_source = source_path
scan_result = self.scanner.scan(scan_source)
except Exception as exc:
failed_run = self.store.fail_analysis_run(repository_id, run.id, str(exc))
return ScanSummary(analysis_run=failed_run, snapshot=None, facts=[])

View File

@@ -0,0 +1 @@
"""Repository checkout and ingestion helpers."""

View File

@@ -0,0 +1,76 @@
from __future__ import annotations

import hashlib
import os
import shutil
import subprocess
from dataclasses import dataclass
from pathlib import Path
from urllib.parse import urlparse
@dataclass(frozen=True)
class Checkout:
    """Immutable description of where a repository's files can be read."""

    # Directory that holds the repository working tree.
    source_path: Path
    # Whether the directory is a checkout produced through git rather than
    # a pre-existing local path.
    was_cloned: bool
class GitIngestionService:
    """Resolve a repository reference to a directory on disk.

    A reference that names an existing local path is used in place. Anything
    else is treated as a git remote and cloned (or refreshed) into a
    deterministic sub-directory of ``checkout_root``.
    """

    # Upper bound, in seconds, for any single git invocation.
    _GIT_TIMEOUT_SECONDS = 120

    def __init__(self, checkout_root: str | Path = "var/checkouts") -> None:
        """Remember the directory under which managed clones are stored."""
        self.checkout_root = Path(checkout_root)

    def resolve(self, url_or_path: str, *, branch: str = "main") -> Checkout:
        """Return a checkout for ``url_or_path``, cloning or updating as needed.

        Args:
            url_or_path: Existing local path, or a git remote URL.
            branch: Branch to check out and fast-forward in a managed clone.
                Ignored for local paths; an empty string leaves the clone on
                whatever it currently has checked out.

        Returns:
            A ``Checkout`` whose ``was_cloned`` is ``False`` for local paths
            and ``True`` for directories managed under ``checkout_root``.

        Raises:
            RuntimeError: If the branch name is invalid, git is missing, or a
                git command fails or times out.
        """
        local_path = self._local_path(url_or_path)
        if local_path is not None:
            return Checkout(source_path=local_path.resolve(), was_cloned=False)

        # Reject bad input before creating directories or touching the network.
        self._validate_branch(branch)

        checkout_path = self.checkout_root / self._checkout_key(url_or_path)
        self.checkout_root.mkdir(parents=True, exist_ok=True)
        # Look for the repository metadata rather than the bare directory: in
        # a stale directory without ``.git`` git would walk upwards and operate
        # on whatever repository happens to enclose ``checkout_root``.
        if (checkout_path / ".git").exists():
            self._run_git(["fetch", "--all", "--prune"], cwd=checkout_path)
        else:
            # ``--`` ends option parsing so a reference that starts with ``-``
            # can never be interpreted as a git option.
            self._run_git(
                ["clone", "--", url_or_path, str(checkout_path)], cwd=None
            )
        self._checkout_branch(checkout_path, branch)
        return Checkout(source_path=checkout_path.resolve(), was_cloned=True)

    def _checkout_branch(self, checkout_path: Path, branch: str) -> None:
        """Switch the clone to ``branch`` and fast-forward it; no-op if empty."""
        if not branch:
            return
        self._validate_branch(branch)
        self._run_git(["checkout", branch], cwd=checkout_path)
        self._run_git(["pull", "--ff-only"], cwd=checkout_path)

    def _validate_branch(self, branch: str) -> None:
        """Raise ``RuntimeError`` for a branch that git would read as an option."""
        # Git refuses to create branches that start with a dash, so such a
        # value can only be a mistake or an attempt to smuggle in an option.
        if branch.startswith("-"):
            raise RuntimeError(f"invalid branch name: {branch!r}")

    def _local_path(self, value: str) -> Path | None:
        """Return ``value`` as a path if it has no URL scheme and exists."""
        parsed = urlparse(value)
        if parsed.scheme:
            return None
        path = Path(value).expanduser()
        if path.exists():
            return path
        return None

    def _checkout_key(self, url: str) -> str:
        """Build a stable directory name: readable repo name plus URL digest."""
        parsed = urlparse(url)
        name = Path(parsed.path.rstrip("/")).name or "repository"
        if name.endswith(".git"):
            name = name[:-4]
        # The digest keeps same-named repositories from different remotes apart.
        digest = hashlib.sha256(url.encode("utf-8")).hexdigest()[:12]
        return f"{self._safe_name(name)}-{digest}"

    def _safe_name(self, value: str) -> str:
        """Replace every character other than alphanumerics, ``-`` and ``_``."""
        safe = "".join(
            char if char.isalnum() or char in "-_" else "-" for char in value
        )
        return safe.strip("-") or "repository"

    def _run_git(self, args: list[str], *, cwd: Path | None) -> None:
        """Run ``git`` with ``args`` and raise ``RuntimeError`` on any failure.

        Args:
            args: Arguments placed after the ``git`` executable.
            cwd: Working directory for the command, or ``None`` to inherit.

        Raises:
            RuntimeError: If git is not installed, exits non-zero, or exceeds
                the timeout.
        """
        if shutil.which("git") is None:
            raise RuntimeError("git executable was not found")
        command = " ".join(args)
        try:
            result = subprocess.run(
                ["git", *args],
                cwd=cwd,
                check=False,
                capture_output=True,
                text=True,
                timeout=self._GIT_TIMEOUT_SECONDS,
                # Never block on an interactive credential prompt; fail fast.
                env={**os.environ, "GIT_TERMINAL_PROMPT": "0"},
            )
        except subprocess.TimeoutExpired as exc:
            # Surface timeouts through the same exception type as other
            # git failures so callers need only one error path.
            raise RuntimeError(
                f"git {command} timed out after "
                f"{self._GIT_TIMEOUT_SECONDS} seconds"
            ) from exc
        if result.returncode != 0:
            message = result.stderr.strip() or result.stdout.strip()
            raise RuntimeError(f"git {command} failed: {message}")

View File

@@ -7,11 +7,13 @@ from fastapi import Depends, FastAPI, HTTPException
from pydantic import BaseModel, Field
from repo_registry.core.service import RegistryService
from repo_registry.repo_ingestion.git import GitIngestionService
from repo_registry.storage.sqlite import NotFoundError, RegistryStore
class Settings(BaseModel):
    # Runtime configuration for the API process.

    # Location of the SQLite database file; presumably the path that
    # get_service hands to RegistryStore -- confirm against get_service.
    database_path: str = Field(default="var/repo-registry.sqlite3")
    # Directory passed to GitIngestionService as the root for its clones.
    checkout_root: str = Field(default="var/checkouts")
def get_settings() -> Settings:
@@ -23,7 +25,7 @@ def get_service(settings: Settings = Depends(get_settings)) -> RegistryService:
database_path.parent.mkdir(parents=True, exist_ok=True)
store = RegistryStore(database_path)
store.initialize()
return RegistryService(store)
return RegistryService(store, ingestion=GitIngestionService(settings.checkout_root))
class RepositoryCreate(BaseModel):