feat(repos): git-fingerprint-based machine-independent repo identity

Add git_fingerprint (root commit SHA-1) to managed_repos as a stable,
machine-independent identifier — identical across every clone regardless
of checkout path, remote URL, or SSH alias.

- Migration n1i2j3k4l5m6: adds git_fingerprint column + non-unique index
  (non-unique to support repos that share ancestry via forks/splits)
- GET /repos/by-fingerprint?hash=<sha>[&remote_url=<url>]: lookup by
  fingerprint; optional remote_url disambiguates shared-ancestry repos
- GET /repos/by-remote?url=<url>: fallback lookup by remote URL
- consistency_check.py --here [PATH]: auto-detects repo slug from any
  local checkout via fingerprint (falls back to remote URL), then auto-
  registers host_paths[hostname] so subsequent runs need no override
- --all now includes repos with host_paths[current_hostname], not just
  those with local_path
- fix-consistency-here / check-consistency-here Makefile targets
- Fixed _api_get bug: httpx strips query strings when params={} is passed
- Backfilled fingerprints for 14 repos on this host

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-28 23:28:22 +01:00
parent 3f96dc035d
commit 1f8ef7f88b
6 changed files with 232 additions and 7 deletions

View File

@@ -203,6 +203,22 @@ fix-consistency-remote:
$(if $(NO_WRITEBACK),--no-writeback,); \
e=$$?; [ $$e -eq 2 ] && exit 0 || exit $$e
## Detect repo slug from the checkout's git root-commit fingerprint (falling back to remote URL) and check: make check-consistency-here [REPO_PATH=/path/to/repo]
## Omit REPO_PATH to use the Python script's CWD (i.e. pass an empty --here flag).
# API_BASE, when set, is forwarded as --api-base. A script exit code of 2 is
# mapped to exit 0; any other exit code is propagated unchanged.
check-consistency-here:
uv run python scripts/consistency_check.py \
--here $(if $(REPO_PATH),"$(REPO_PATH)",) \
$(if $(API_BASE),--api-base "$(API_BASE)",); \
e=$$?; [ $$e -eq 2 ] && exit 0 || exit $$e
## Detect repo slug from the checkout's git root-commit fingerprint (falling back to remote URL) and fix: make fix-consistency-here [REPO_PATH=/path/to/repo]
# Omit REPO_PATH to pass a bare --here flag; API_BASE, when set, is forwarded
# as --api-base. A script exit code of 2 is mapped to exit 0; any other exit
# code is propagated unchanged.
fix-consistency-here:
uv run python scripts/consistency_check.py \
--here $(if $(REPO_PATH),"$(REPO_PATH)",) \
--fix \
$(if $(API_BASE),--api-base "$(API_BASE)",); \
e=$$?; [ $$e -eq 2 ] && exit 0 || exit $$e
## Check all registered repos for ADR-001 consistency
check-consistency-all:
uv run python scripts/consistency_check.py --all $(if $(API_BASE),--api-base "$(API_BASE)",); \

View File

@@ -27,6 +27,7 @@ class ManagedRepo(Base, TimestampMixin):
topic_id: Mapped[uuid.UUID | None] = mapped_column(
UUID(as_uuid=True), ForeignKey("topics.id", ondelete="SET NULL"), nullable=True
)
git_fingerprint: Mapped[str | None] = mapped_column(String(40), nullable=True, index=True)
sbom_source: Mapped[str | None] = mapped_column(Text, nullable=True)
last_sbom_at: Mapped[datetime | None] = mapped_column(
DateTime(timezone=True), nullable=True

View File

@@ -65,6 +65,7 @@ async def register_repo(
name=body.name,
local_path=body.local_path,
remote_url=body.remote_url,
git_fingerprint=body.git_fingerprint,
description=body.description,
topic_id=body.topic_id,
)
@@ -74,6 +75,43 @@ async def register_repo(
return repo
@router.get("/by-fingerprint", response_model=list[RepoRead])
async def get_repo_by_fingerprint(
    hash: str,
    remote_url: str | None = None,
    session: AsyncSession = Depends(get_session),
) -> list[ManagedRepo]:
    """Return every repo whose stored root-commit fingerprint equals ``hash``.

    The fingerprint is what ``git rev-list --max-parents=0 HEAD`` prints, so it
    is the same on every clone of a repository. Because forks and monorepo
    splits keep the original root commit, several repos may carry the same
    fingerprint and the result is therefore a list.

    Supplying a non-empty ``remote_url`` additionally restricts the result to
    rows whose ``remote_url`` matches exactly, which is how callers pick one
    repo out of a group that shares ancestry.

    An empty list (not a 404) is returned when nothing matches.
    """
    # ``hash`` shadows the builtin, but it is the public query-parameter name.
    filters = [ManagedRepo.git_fingerprint == hash]
    if remote_url:
        filters.append(ManagedRepo.remote_url == remote_url)

    rows = await session.execute(select(ManagedRepo).where(*filters))
    return list(rows.scalars().all())
@router.get("/by-remote", response_model=RepoRead)
async def get_repo_by_remote_url(
    url: str,
    session: AsyncSession = Depends(get_session),
) -> ManagedRepo:
    """Look up a repo by its git remote URL (fallback; prefer /by-fingerprint).

    The match is an exact string comparison against ``remote_url``.

    Raises:
        HTTPException: 404 when no repo has this ``remote_url``; 409 when more
            than one repo has it, since a single-object response cannot
            represent an ambiguous match.
    """
    # Two rows are enough to tell "unique" from "ambiguous" without loading
    # every duplicate. Handling the ambiguous case here avoids the unhandled
    # MultipleResultsFound (HTTP 500) that scalar_one_or_none() would raise.
    result = await session.execute(
        select(ManagedRepo).where(ManagedRepo.remote_url == url).limit(2)
    )
    matches = result.scalars().all()
    if not matches:
        raise HTTPException(status_code=404, detail=f"No repo with remote_url '{url}' found")
    if len(matches) > 1:
        raise HTTPException(
            status_code=409,
            detail=(
                f"Multiple repos share remote_url '{url}'; "
                "use /repos/by-fingerprint to disambiguate"
            ),
        )
    return matches[0]
@router.get("/doi/summary", response_model=list[DoISummaryEntry])
async def doi_summary(session: AsyncSession = Depends(get_session)) -> list[DoISummaryEntry]:
"""Return DoI tier for all active repos, worst tier first.

View File

@@ -11,6 +11,7 @@ class RepoCreate(BaseModel):
name: str
local_path: str | None = None
remote_url: str | None = None
git_fingerprint: str | None = None
description: str | None = None
topic_id: uuid.UUID | None = None
@@ -19,6 +20,7 @@ class RepoUpdate(BaseModel):
name: str | None = None
local_path: str | None = None
remote_url: str | None = None
git_fingerprint: str | None = None
description: str | None = None
topic_id: uuid.UUID | None = None
last_state_synced_at: datetime | None = None
@@ -40,6 +42,7 @@ class RepoRead(BaseModel):
local_path: str | None = None
host_paths: dict = {}
remote_url: str | None = None
git_fingerprint: str | None = None
description: str | None = None
status: str
topic_id: uuid.UUID | None = None

View File

@@ -0,0 +1,39 @@
"""Add git_fingerprint to managed_repos
Stores the root commit SHA-1 of the git repository — a machine-independent
identifier that is identical across every clone regardless of remote URL,
checkout path, or protocol. Used by the consistency checker and other tools
to match a locally checked-out repo to its state-hub record without relying
on local_path or remote_url (both of which vary per machine).
Revision ID: n1i2j3k4l5m6
Revises: m0h1i2j3k4l5
Create Date: 2026-03-28
"""
from alembic import op
import sqlalchemy as sa
revision = 'n1i2j3k4l5m6'
down_revision = 'm0h1i2j3k4l5'
branch_labels = None
depends_on = None
def upgrade() -> None:
    """Add the nullable ``git_fingerprint`` column and a non-unique index on it."""
    fingerprint_column = sa.Column('git_fingerprint', sa.String(40), nullable=True)
    op.add_column('managed_repos', fingerprint_column)

    # The index is deliberately non-unique: forks of a common ancestor carry
    # the same root-commit SHA-1. It exists to make lookups fast; callers
    # narrow by remote_url when several rows share a fingerprint.
    op.create_index(
        'ix_managed_repos_git_fingerprint',
        'managed_repos',
        ['git_fingerprint'],
        unique=False,
    )
def downgrade() -> None:
    """Remove the ``git_fingerprint`` index, then the column itself."""
    target_table = 'managed_repos'
    # Drop the index before the column it covers.
    op.drop_index('ix_managed_repos_git_fingerprint', table_name=target_table)
    op.drop_column(target_table, 'git_fingerprint')

View File

@@ -25,6 +25,7 @@ Checks:
Usage:
python scripts/consistency_check.py --repo SLUG [--fix] [--no-writeback] [--json] [--api-base URL]
python scripts/consistency_check.py --all [--fix] [--no-writeback] [--json] [--api-base URL]
python scripts/consistency_check.py --here [PATH] [--fix] [--no-writeback] [--json] [--api-base URL]
Exit codes:
0 — ok (no FAILs; only WARNs/INFOs)
@@ -274,11 +275,13 @@ def _inject_task_id_frontmatter_list(
def _api_get(api_base: str, path: str, params: dict | None = None) -> Any:
if not _HAS_HTTPX:
return None
if not path.endswith("/"):
# Only append trailing slash to the path component, not to query strings
if "?" not in path and not path.endswith("/"):
path += "/"
try:
with _httpx.Client(base_url=api_base, timeout=10.0, follow_redirects=True) as c:
r = c.get(path, params={k: v for k, v in (params or {}).items() if v is not None})
filtered = {k: v for k, v in (params or {}).items() if v is not None}
r = c.get(path, params=filtered if filtered else None)
r.raise_for_status()
return r.json()
except Exception:
@@ -334,6 +337,85 @@ def resolve_repo_path(repo: dict, override: str | None = None) -> str:
return host_paths.get(hostname) or repo.get("local_path") or ""
def _infer_slug_from_path(api_base: str, path: str) -> "tuple[str, str] | None":
"""Identify a registered repo from a local checkout path.
Strategy (in order):
1. Root-commit fingerprint — ``git rev-list --max-parents=0 HEAD`` produces
the same SHA-1 on every clone, independent of remote URL or checkout path.
Looked up via ``GET /repos/by-fingerprint?hash=<sha>``.
2. Remote URL fallback — exact string match against ``remote_url`` field.
Less reliable across machines (SSH aliases, HTTP vs HTTPS, etc.) but
useful when fingerprint is not yet stored.
Returns ``(slug, git_root)`` on success, ``None`` if no match found.
"""
try:
git_root = subprocess.check_output(
["git", "rev-parse", "--show-toplevel"],
cwd=path, stderr=subprocess.DEVNULL, text=True,
).strip()
except (subprocess.CalledProcessError, FileNotFoundError, OSError):
return None
# Strategy 1: fingerprint lookup (most reliable)
try:
fingerprint = subprocess.check_output(
["git", "rev-list", "--max-parents=0", "HEAD"],
cwd=git_root, stderr=subprocess.DEVNULL, text=True,
).strip()
except (subprocess.CalledProcessError, FileNotFoundError, OSError):
fingerprint = ""
# Get local remote URL once — used for both disambiguation and fallback
try:
remote_url = subprocess.check_output(
["git", "remote", "get-url", "origin"],
cwd=git_root, stderr=subprocess.DEVNULL, text=True,
).strip()
except (subprocess.CalledProcessError, FileNotFoundError, OSError):
remote_url = ""
if fingerprint:
# Try fingerprint + remote URL for precise match
if remote_url:
import urllib.parse as _up
candidates = _api_get(
api_base,
f"/repos/by-fingerprint?hash={fingerprint}&remote_url={_up.quote(remote_url, safe='')}",
)
if isinstance(candidates, list) and len(candidates) == 1:
return candidates[0]["slug"], git_root
# Fingerprint alone (works when repos don't share ancestry)
candidates = _api_get(api_base, f"/repos/by-fingerprint?hash={fingerprint}")
if isinstance(candidates, list):
if len(candidates) == 1:
return candidates[0]["slug"], git_root
if len(candidates) > 1:
# Disambiguate: prefer the repo whose slug appears in the git_root path
for repo in candidates:
if repo["slug"] in git_root:
return repo["slug"], git_root
# Can't disambiguate — return first match with a warning
print(
f" WARNING: {len(candidates)} repos share fingerprint {fingerprint[:12]}"
f"— using '{candidates[0]['slug']}'. "
"Set remote_url on each repo for accurate matching.",
file=sys.stderr,
)
return candidates[0]["slug"], git_root
# Strategy 2: remote URL exact match (fallback)
if remote_url:
import urllib.parse as _up
repo = _api_get(api_base, f"/repos/by-remote?url={_up.quote(remote_url, safe='')}")
if repo and isinstance(repo, dict) and "slug" in repo:
return repo["slug"], git_root
return None
def check_repo(api_base: str, repo_slug: str, repo_path_override: str | None = None) -> ConsistencyReport:
"""Run all consistency checks for a registered repo."""
repo = _api_get(api_base, f"/repos/{repo_slug}")
@@ -1008,9 +1090,25 @@ def fix_repo(
"""Run checks then apply all auto-fixable issues. Returns updated report."""
report = check_repo(api_base, repo_slug, repo_path_override)
# Auto-register this machine's path in host_paths so future runs work
# without --repo-path. Idempotent: skipped when already correct.
repo_path = report.repo_path
if repo_path:
repo_record = _api_get(api_base, f"/repos/{repo_slug}")
if repo_record:
hostname = socket.gethostname()
if (repo_record.get("host_paths") or {}).get(hostname) != repo_path:
result = _api_post(
api_base, f"/repos/{repo_slug}/paths/",
{"host": hostname, "path": repo_path},
)
if result and "_error" not in result:
report.fixes_applied.append(
f"host_paths[{hostname}] → {repo_path}"
)
# T02 — pull gate: warn and skip all write operations when local repo is
# behind its remote tracking branch.
repo_path = report.repo_path
if repo_path and _detect_behind_remote(repo_path):
report.add(
severity="WARN", check_id="C-16",
@@ -1417,7 +1515,9 @@ def main() -> None:
group.add_argument("--repo", metavar="SLUG",
help="Registered repo slug (e.g. the-custodian)")
group.add_argument("--all", action="store_true",
help="Run checks against all registered repos with local_path")
help="Run checks against all repos with a resolvable path on this host")
group.add_argument("--here", metavar="PATH", nargs="?", const="",
help="Infer repo slug from git remote URL at PATH (default: CWD)")
parser.add_argument("--fix", action="store_true",
help="Apply auto-fixable issues (status drift, repo mismatch, etc.)")
parser.add_argument("--remote", action="store_true",
@@ -1435,25 +1535,53 @@ def main() -> None:
help="Output JSON instead of human-readable text")
args = parser.parse_args()
import os as _os
no_wb = getattr(args, "no_writeback", False)
do_fix = args.fix or args.remote
# --here: infer slug from git remote URL, then run as single-repo check/fix
if args.here is not None:
search_path = args.here or _os.getcwd()
inferred = _infer_slug_from_path(args.api_base, search_path)
if inferred is None:
print(
f"ERROR: No registered repo with a matching remote_url found at '{search_path}'.",
file=sys.stderr,
)
print(
" Register the repo first: POST /repos/ with remote_url set.",
file=sys.stderr,
)
sys.exit(1)
inferred_slug, git_root = inferred
print(f" Detected: {inferred_slug} ({git_root})")
if do_fix:
reports = [fix_repo(args.api_base, inferred_slug, git_root, no_writeback=no_wb)]
else:
reports = [check_repo(args.api_base, inferred_slug, git_root)]
# --remote --all: smart pull+fix across all repos
if args.remote and args.all:
elif args.remote and args.all:
reports = fix_all_remote(args.api_base, no_writeback=no_wb)
if not reports:
sys.exit(0)
else:
# Resolve repo list
hostname = socket.gethostname()
repo_slugs: list[str] = []
if args.all:
repos = _api_get(args.api_base, "/repos")
if not isinstance(repos, list):
print("ERROR: Could not fetch repos from state-hub API", file=sys.stderr)
sys.exit(1)
repo_slugs = [r["slug"] for r in repos if r.get("local_path")]
repo_slugs = [
r["slug"] for r in repos
if r.get("local_path") or (r.get("host_paths") or {}).get(hostname)
]
if not repo_slugs:
print("No repos with local_path registered.", file=sys.stderr)
print(
f"No repos with a path registered for host '{hostname}'.",
file=sys.stderr,
)
sys.exit(0)
else:
repo_slugs = [args.repo]