From ef41a9974a050647abbfa290908ebe856518bbf2 Mon Sep 17 00:00:00 2001 From: tegwick Date: Sat, 25 Apr 2026 22:37:06 +0200 Subject: [PATCH] Git ingestion part of Milestone 2 --- README.md | 2 +- src/repo_registry/core/service.py | 15 +++- src/repo_registry/repo_ingestion/__init__.py | 1 + src/repo_registry/repo_ingestion/git.py | 76 ++++++++++++++++++++ src/repo_registry/web_api/app.py | 4 +- tests/test_git_ingestion.py | 47 ++++++++++++ tests/test_registry_service.py | 37 +++++++++- tests/test_web_api.py | 10 ++- 8 files changed, 185 insertions(+), 7 deletions(-) create mode 100644 src/repo_registry/repo_ingestion/__init__.py create mode 100644 src/repo_registry/repo_ingestion/git.py create mode 100644 tests/test_git_ingestion.py diff --git a/README.md b/README.md index 2160703..ef1a56e 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ curl 'http://127.0.0.1:8000/search?q=classify' ## Deterministic Analysis -For local development, repository URLs may be local filesystem paths. Trigger a deterministic scan: +For local development, repository URLs may be local filesystem paths. Git URLs, including `file://` URLs, are cloned into `var/checkouts` before scanning. Trigger a deterministic scan: ```bash curl -X POST http://127.0.0.1:8000/repos/1/analysis-runs \ diff --git a/src/repo_registry/core/service.py b/src/repo_registry/core/service.py index 25c09db..9e48f27 100644 --- a/src/repo_registry/core/service.py +++ b/src/repo_registry/core/service.py @@ -10,6 +10,7 @@ from repo_registry.core.models import ( ScanSummary, SearchResult, ) +from repo_registry.repo_ingestion.git import GitIngestionService from repo_registry.repo_scanning.scanner import DeterministicScanner from repo_registry.storage.sqlite import RegistryStore @@ -17,9 +18,14 @@ from repo_registry.storage.sqlite import RegistryStore class RegistryService: """Application service for the manual registry MVP.""" - def __init__(self, store: RegistryStore) -> None: + def __init__( + self, + store: RegistryStore, + ingestion: GitIngestionService | None = None, + ) -> None: self.store = store self.scanner = DeterministicScanner() + self.ingestion = ingestion or GitIngestionService() def register_repository( self, @@ -52,7 +58,12 @@ class RegistryService: run = self.store.create_analysis_run(repository_id) self.store.update_repository_status(repository_id, "analyzing") try: - scan_result = self.scanner.scan(source_path or repository.url) + if source_path is None: + checkout = self.ingestion.resolve(repository.url, branch=repository.branch) + scan_source = checkout.source_path + else: + scan_source = source_path + scan_result = self.scanner.scan(scan_source) except Exception as exc: failed_run = self.store.fail_analysis_run(repository_id, run.id, str(exc)) return ScanSummary(analysis_run=failed_run, snapshot=None, facts=[]) diff --git a/src/repo_registry/repo_ingestion/__init__.py b/src/repo_registry/repo_ingestion/__init__.py new file mode 100644 index 0000000..63a4ece --- /dev/null +++ b/src/repo_registry/repo_ingestion/__init__.py @@ -0,0 +1 @@ +"""Repository checkout and ingestion helpers.""" diff --git a/src/repo_registry/repo_ingestion/git.py b/src/repo_registry/repo_ingestion/git.py new file mode 100644 index 0000000..c65f243 --- /dev/null +++ b/src/repo_registry/repo_ingestion/git.py @@ -0,0 +1,76 @@ +from __future__ import annotations + +import hashlib +import shutil +import subprocess +from dataclasses import dataclass +from pathlib import Path +from urllib.parse import urlparse + + +@dataclass(frozen=True) +class Checkout: + source_path: Path + was_cloned: bool + + +class GitIngestionService: + def __init__(self, checkout_root: str | Path = "var/checkouts") -> None: + self.checkout_root = Path(checkout_root) + + def resolve(self, url_or_path: str, *, branch: str = "main") -> Checkout: + local_path = self._local_path(url_or_path) + if local_path is not None: + return Checkout(source_path=local_path.resolve(), was_cloned=False) + + checkout_path = self.checkout_root / self._checkout_key(url_or_path) + self.checkout_root.mkdir(parents=True, exist_ok=True) + if checkout_path.exists(): + self._run_git(["fetch", "--all", "--prune"], cwd=checkout_path) + else: + self._run_git(["clone", url_or_path, str(checkout_path)], cwd=None) + + self._checkout_branch(checkout_path, branch) + return Checkout(source_path=checkout_path.resolve(), was_cloned=True) + + def _checkout_branch(self, checkout_path: Path, branch: str) -> None: + if branch: + self._run_git(["checkout", branch], cwd=checkout_path) + self._run_git(["pull", "--ff-only"], cwd=checkout_path) + + def _local_path(self, value: str) -> Path | None: + parsed = urlparse(value) + if parsed.scheme: + return None + + path = Path(value).expanduser() + if path.exists(): + return path + return None + + def _checkout_key(self, url: str) -> str: + parsed = urlparse(url) + name = Path(parsed.path.rstrip("/")).name or "repository" + if name.endswith(".git"): + name = name[:-4] + digest = hashlib.sha256(url.encode("utf-8")).hexdigest()[:12] + return f"{self._safe_name(name)}-{digest}" + + def _safe_name(self, value: str) -> str: + safe = "".join(char if char.isalnum() or char in "-_" else "-" for char in value) + return safe.strip("-") or "repository" + + def _run_git(self, args: list[str], *, cwd: Path | None) -> None: + if shutil.which("git") is None: + raise RuntimeError("git executable was not found") + result = subprocess.run( + ["git", *args], + cwd=cwd, + check=False, + capture_output=True, + text=True, + timeout=120, + ) + if result.returncode != 0: + message = result.stderr.strip() or result.stdout.strip() + raise RuntimeError(f"git {' '.join(args)} failed: {message}") diff --git a/src/repo_registry/web_api/app.py b/src/repo_registry/web_api/app.py index f4b644f..6b4dbb3 100644 --- a/src/repo_registry/web_api/app.py +++ b/src/repo_registry/web_api/app.py @@ -7,11 +7,13 @@ from fastapi import Depends, FastAPI, HTTPException from pydantic import BaseModel, Field from repo_registry.core.service import RegistryService +from repo_registry.repo_ingestion.git import GitIngestionService from repo_registry.storage.sqlite import NotFoundError, RegistryStore class Settings(BaseModel): database_path: str = Field(default="var/repo-registry.sqlite3") + checkout_root: str = Field(default="var/checkouts") def get_settings() -> Settings: @@ -23,7 +25,7 @@ def get_service(settings: Settings = Depends(get_settings)) -> RegistryService: database_path.parent.mkdir(parents=True, exist_ok=True) store = RegistryStore(database_path) store.initialize() - return RegistryService(store) + return RegistryService(store, ingestion=GitIngestionService(settings.checkout_root)) class RepositoryCreate(BaseModel): diff --git a/tests/test_git_ingestion.py b/tests/test_git_ingestion.py new file mode 100644 index 0000000..ebb19c5 --- /dev/null +++ b/tests/test_git_ingestion.py @@ -0,0 +1,47 @@ +import subprocess + +from repo_registry.repo_ingestion.git import GitIngestionService + + +def run(command, cwd): + subprocess.run(command, cwd=cwd, check=True, capture_output=True, text=True) + + +def make_git_repo(path): + path.mkdir() + run(["git", "init", "-b", "main"], path) + run(["git", "config", "user.email", "tests@example.com"], path) + run(["git", "config", "user.name", "Tests"], path) + (path / "README.md").write_text("# Clone Me\n", encoding="utf-8") + (path / "app.py").write_text("print('ok')\n", encoding="utf-8") + run(["git", "add", "."], path) + run(["git", "commit", "-m", "initial"], path) + + +def test_ingestion_keeps_local_paths_local(tmp_path): + source = tmp_path / "source" + source.mkdir() + + checkout = GitIngestionService(tmp_path / "checkouts").resolve(str(source)) + + assert checkout.source_path == source.resolve() + assert checkout.was_cloned is False + + +def test_ingestion_clones_file_url(tmp_path): + source = tmp_path / "source" + make_git_repo(source) + + checkout = GitIngestionService(tmp_path / "checkouts").resolve(source.as_uri()) + + assert checkout.was_cloned is True + assert checkout.source_path != source.resolve() + assert (checkout.source_path / "README.md").exists() + branch = subprocess.run( + ["git", "branch", "--show-current"], + cwd=checkout.source_path, + check=True, + capture_output=True, + text=True, + ).stdout.strip() + assert branch == "main" diff --git a/tests/test_registry_service.py b/tests/test_registry_service.py index 6da7589..8598636 100644 --- a/tests/test_registry_service.py +++ b/tests/test_registry_service.py @@ -1,11 +1,14 @@ +import subprocess + from repo_registry.core.service import RegistryService +from repo_registry.repo_ingestion.git import GitIngestionService from repo_registry.storage.sqlite import NotFoundError, RegistryStore def make_service(tmp_path): store = RegistryStore(tmp_path / "registry.sqlite3") store.initialize() - return RegistryService(store) + return RegistryService(store, ingestion=GitIngestionService(tmp_path / "checkouts")) def test_manual_registry_builds_ability_map(tmp_path): @@ -144,3 +147,35 @@ def test_analyze_repository_failure_is_recorded(tmp_path): assert summary.snapshot is None assert "does not exist" in (summary.analysis_run.error_message or "") assert service.get_repository(repository.id).status == "analysis_failed" + + +def test_analyze_repository_clones_git_url_before_scanning(tmp_path): + source = tmp_path / "git-source" + source.mkdir() + subprocess.run(["git", "init", "-b", "main"], cwd=source, check=True) + subprocess.run( + ["git", "config", "user.email", "tests@example.com"], + cwd=source, + check=True, + ) + subprocess.run( + ["git", "config", "user.name", "Tests"], + cwd=source, + check=True, + ) + (source / "README.md").write_text("# Git Source\n", encoding="utf-8") + (source / "requirements.txt").write_text("pytest\n", encoding="utf-8") + subprocess.run(["git", "add", "."], cwd=source, check=True) + subprocess.run(["git", "commit", "-m", "initial"], cwd=source, check=True) + + service = make_service(tmp_path) + repository = service.register_repository(name="Git Source", url=source.as_uri()) + + summary = service.analyze_repository(repository.id) + + assert summary.analysis_run.status == "completed" + assert summary.snapshot is not None + assert str(tmp_path / "checkouts") in summary.snapshot.source_path + fact_names = {(fact.kind, fact.name, fact.path) for fact in summary.facts} + assert ("documentation", "README", "README.md") in fact_names + assert ("framework", "pytest", "requirements.txt") in fact_names diff --git a/tests/test_web_api.py b/tests/test_web_api.py index cade537..7b0b9ef 100644 --- a/tests/test_web_api.py +++ b/tests/test_web_api.py @@ -5,7 +5,10 @@ from repo_registry.web_api.app import Settings, app, get_settings def test_api_manual_registry_loop(tmp_path): def override_settings(): - return Settings(database_path=str(tmp_path / "api.sqlite3")) + return Settings( + database_path=str(tmp_path / "api.sqlite3"), + checkout_root=str(tmp_path / "checkouts"), + ) app.dependency_overrides[get_settings] = override_settings client = TestClient(app) @@ -79,7 +82,10 @@ def test_api_analysis_run_loop(tmp_path): ) def override_settings(): - return Settings(database_path=str(tmp_path / "api-analysis.sqlite3")) + return Settings( + database_path=str(tmp_path / "api-analysis.sqlite3"), + checkout_root=str(tmp_path / "api-checkouts"), + ) app.dependency_overrides[get_settings] = override_settings client = TestClient(app)