feat(repos): git-fingerprint-based machine-independent repo identity

Add git_fingerprint (root commit SHA-1) to managed_repos as a stable,
machine-independent identifier — identical across every clone regardless
of checkout path, remote URL, or SSH alias.

- Migration n1i2j3k4l5m6: adds git_fingerprint column + non-unique index
  (non-unique to support repos that share ancestry via forks/splits)
- GET /repos/by-fingerprint?hash=<sha>[&remote_url=<url>]: lookup by
  fingerprint; optional remote_url disambiguates shared-ancestry repos
- GET /repos/by-remote?url=<url>: fallback lookup by remote URL
- consistency_check.py --here [PATH]: auto-detects repo slug from any
  local checkout via fingerprint (falls back to remote URL), then auto-
  registers host_paths[hostname] so subsequent runs need no override
- --all now includes repos with host_paths[current_hostname], not just
  those with local_path
- fix-consistency-here / check-consistency-here Makefile targets
- Fixed _api_get bug: httpx strips query strings when params={} is passed
- Backfilled fingerprints for 14 repos on this host

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-28 23:28:22 +01:00
parent 3f96dc035d
commit 1f8ef7f88b
6 changed files with 232 additions and 7 deletions

View File

@@ -27,6 +27,7 @@ class ManagedRepo(Base, TimestampMixin):
topic_id: Mapped[uuid.UUID | None] = mapped_column(
UUID(as_uuid=True), ForeignKey("topics.id", ondelete="SET NULL"), nullable=True
)
git_fingerprint: Mapped[str | None] = mapped_column(String(40), nullable=True, index=True)
sbom_source: Mapped[str | None] = mapped_column(Text, nullable=True)
last_sbom_at: Mapped[datetime | None] = mapped_column(
DateTime(timezone=True), nullable=True

View File

@@ -65,6 +65,7 @@ async def register_repo(
name=body.name,
local_path=body.local_path,
remote_url=body.remote_url,
git_fingerprint=body.git_fingerprint,
description=body.description,
topic_id=body.topic_id,
)
@@ -74,6 +75,43 @@ async def register_repo(
return repo
@router.get("/by-fingerprint", response_model=list[RepoRead])
async def get_repo_by_fingerprint(
hash: str,
remote_url: str | None = None,
session: AsyncSession = Depends(get_session),
) -> list[ManagedRepo]:
"""Look up repos by git root-commit SHA-1 fingerprint.
The fingerprint is the output of ``git rev-list --max-parents=0 HEAD`` and
is identical across every clone of the same repository. Repos that share
git history (forks, monorepo splits) will have the same fingerprint.
Pass ``remote_url`` to narrow results to a specific remote — useful when
multiple repos share the same ancestor commit.
Returns an empty list if no match is found.
"""
q = select(ManagedRepo).where(ManagedRepo.git_fingerprint == hash)
if remote_url:
q = q.where(ManagedRepo.remote_url == remote_url)
result = await session.execute(q)
return list(result.scalars().all())
@router.get("/by-remote", response_model=RepoRead)
async def get_repo_by_remote_url(
url: str,
session: AsyncSession = Depends(get_session),
) -> ManagedRepo:
"""Look up a repo by its git remote URL (fallback; prefer /by-fingerprint)."""
result = await session.execute(select(ManagedRepo).where(ManagedRepo.remote_url == url))
repo = result.scalar_one_or_none()
if repo is None:
raise HTTPException(status_code=404, detail=f"No repo with remote_url '{url}' found")
return repo
@router.get("/doi/summary", response_model=list[DoISummaryEntry])
async def doi_summary(session: AsyncSession = Depends(get_session)) -> list[DoISummaryEntry]:
"""Return DoI tier for all active repos, worst tier first.

View File

@@ -11,6 +11,7 @@ class RepoCreate(BaseModel):
name: str
local_path: str | None = None
remote_url: str | None = None
git_fingerprint: str | None = None
description: str | None = None
topic_id: uuid.UUID | None = None
@@ -19,6 +20,7 @@ class RepoUpdate(BaseModel):
name: str | None = None
local_path: str | None = None
remote_url: str | None = None
git_fingerprint: str | None = None
description: str | None = None
topic_id: uuid.UUID | None = None
last_state_synced_at: datetime | None = None
@@ -40,6 +42,7 @@ class RepoRead(BaseModel):
local_path: str | None = None
host_paths: dict = {}
remote_url: str | None = None
git_fingerprint: str | None = None
description: str | None = None
status: str
topic_id: uuid.UUID | None = None