feat(repos): git-fingerprint-based machine-independent repo identity

Add git_fingerprint (root commit SHA-1) to managed_repos as a stable,
machine-independent identifier — identical across every clone regardless
of checkout path, remote URL, or SSH alias.

- Migration n1i2j3k4l5m6: adds git_fingerprint column + non-unique index
  (non-unique to support repos that share ancestry via forks/splits)
- GET /repos/by-fingerprint?hash=<sha>[&remote_url=<url>]: lookup by
  fingerprint; optional remote_url disambiguates shared-ancestry repos
- GET /repos/by-remote?url=<url>: fallback lookup by remote URL
- consistency_check.py --here [PATH]: auto-detects repo slug from any
  local checkout via fingerprint (falls back to remote URL), then auto-
  registers host_paths[hostname] so subsequent runs need no override
- --all now includes repos with host_paths[current_hostname], not just
  those with local_path
- fix-consistency-here / check-consistency-here Makefile targets
- Fixed _api_get bug: httpx strips query strings when params={} is passed
- Backfilled fingerprints for 14 repos on this host

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-28 23:28:22 +01:00
parent 3f96dc035d
commit 1f8ef7f88b
6 changed files with 232 additions and 7 deletions

View File

@@ -25,6 +25,7 @@ Checks:
Usage:
python scripts/consistency_check.py --repo SLUG [--fix] [--no-writeback] [--json] [--api-base URL]
python scripts/consistency_check.py --all [--fix] [--no-writeback] [--json] [--api-base URL]
python scripts/consistency_check.py --here [PATH] [--fix] [--no-writeback] [--json] [--api-base URL]
Exit codes:
0 — ok (no FAILs; only WARNs/INFOs)
@@ -274,11 +275,13 @@ def _inject_task_id_frontmatter_list(
def _api_get(api_base: str, path: str, params: dict | None = None) -> Any:
if not _HAS_HTTPX:
return None
if not path.endswith("/"):
# Only append trailing slash to the path component, not to query strings
if "?" not in path and not path.endswith("/"):
path += "/"
try:
with _httpx.Client(base_url=api_base, timeout=10.0, follow_redirects=True) as c:
r = c.get(path, params={k: v for k, v in (params or {}).items() if v is not None})
filtered = {k: v for k, v in (params or {}).items() if v is not None}
r = c.get(path, params=filtered if filtered else None)
r.raise_for_status()
return r.json()
except Exception:
@@ -334,6 +337,85 @@ def resolve_repo_path(repo: dict, override: str | None = None) -> str:
return host_paths.get(hostname) or repo.get("local_path") or ""
def _infer_slug_from_path(api_base: str, path: str) -> "tuple[str, str] | None":
    """Identify a registered repo from a local checkout path.

    Strategy (in order):

    1. Root-commit fingerprint — ``git rev-list --max-parents=0 HEAD`` produces
       the same SHA-1 on every clone, independent of remote URL or checkout
       path. Looked up via ``GET /repos/by-fingerprint?hash=<sha>``, first
       narrowed by ``remote_url`` when the checkout has an ``origin`` remote.
    2. Remote URL fallback — exact string match against the ``remote_url``
       field via ``GET /repos/by-remote?url=<url>``. Less reliable across
       machines (SSH aliases, HTTP vs HTTPS, etc.) but useful when the
       fingerprint is not yet stored.

    Args:
        api_base: Base URL handed through to ``_api_get``.
        path: Any directory inside the checkout to identify.

    Returns:
        ``(slug, git_root)`` on success, ``None`` if *path* is not inside a
        git work tree or no registered repo matches.
    """
    # Local import, as in the original block: the file's top-level imports
    # are not assumed to include urllib.parse.
    from urllib.parse import quote

    def _git(cwd: str, *args: str) -> str:
        """Return stripped stdout of ``git <args>`` run in *cwd*, or "" on failure."""
        try:
            return subprocess.check_output(
                ["git", *args],
                cwd=cwd, stderr=subprocess.DEVNULL, text=True,
            ).strip()
        except (subprocess.CalledProcessError, FileNotFoundError, OSError):
            return ""

    git_root = _git(path, "rev-parse", "--show-toplevel")
    if not git_root:
        return None

    # Strategy 1 input: root-commit fingerprint (most reliable).
    fingerprint = _git(git_root, "rev-list", "--max-parents=0", "HEAD")
    # Local remote URL, fetched once — used for both disambiguation and fallback.
    remote_url = _git(git_root, "remote", "get-url", "origin")

    if fingerprint:
        # Quote the fingerprint like the remote URL. A plain hex SHA is left
        # untouched, but a history with several root commits yields one SHA
        # per line and the raw newline would produce a malformed request URL.
        # NOTE(review): confirm how stored fingerprints are computed for
        # multi-root repos so client and server agree on the value.
        fp_query = f"/repos/by-fingerprint?hash={quote(fingerprint, safe='')}"

        # Try fingerprint + remote URL for a precise match.
        if remote_url:
            candidates = _api_get(
                api_base,
                f"{fp_query}&remote_url={quote(remote_url, safe='')}",
            )
            if isinstance(candidates, list) and len(candidates) == 1:
                return candidates[0]["slug"], git_root

        # Fingerprint alone (works when repos don't share ancestry).
        candidates = _api_get(api_base, fp_query)
        if isinstance(candidates, list) and candidates:
            if len(candidates) == 1:
                return candidates[0]["slug"], git_root

            # Several repos share this root commit (forks/splits).
            # Prefer a slug that equals a whole path component, so that slug
            # "app" does not win over "app-fork" for ".../app-fork".
            components = set(git_root.replace("\\", "/").split("/"))
            for repo in candidates:
                if repo["slug"] in components:
                    return repo["slug"], git_root
            # Otherwise fall back to the looser substring heuristic; an empty
            # slug is skipped because it would match every path.
            for repo in candidates:
                if repo["slug"] and repo["slug"] in git_root:
                    return repo["slug"], git_root

            # Can't disambiguate — return first match with a warning.
            print(
                f"  WARNING: {len(candidates)} repos share fingerprint {fingerprint[:12]}"
                f"— using '{candidates[0]['slug']}'. "
                "Set remote_url on each repo for accurate matching.",
                file=sys.stderr,
            )
            return candidates[0]["slug"], git_root

    # Strategy 2: remote URL exact match (fallback).
    if remote_url:
        repo = _api_get(api_base, f"/repos/by-remote?url={quote(remote_url, safe='')}")
        if repo and isinstance(repo, dict) and "slug" in repo:
            return repo["slug"], git_root

    return None
def check_repo(api_base: str, repo_slug: str, repo_path_override: str | None = None) -> ConsistencyReport:
"""Run all consistency checks for a registered repo."""
repo = _api_get(api_base, f"/repos/{repo_slug}")
@@ -1008,9 +1090,25 @@ def fix_repo(
"""Run checks then apply all auto-fixable issues. Returns updated report."""
report = check_repo(api_base, repo_slug, repo_path_override)
# Auto-register this machine's path in host_paths so future runs work
# without --repo-path. Idempotent: skipped when already correct.
repo_path = report.repo_path
if repo_path:
repo_record = _api_get(api_base, f"/repos/{repo_slug}")
if repo_record:
hostname = socket.gethostname()
if (repo_record.get("host_paths") or {}).get(hostname) != repo_path:
result = _api_post(
api_base, f"/repos/{repo_slug}/paths/",
{"host": hostname, "path": repo_path},
)
if result and "_error" not in result:
report.fixes_applied.append(
f"host_paths[{hostname}] → {repo_path}"
)
# T02 — pull gate: warn and skip all write operations when local repo is
# behind its remote tracking branch.
repo_path = report.repo_path
if repo_path and _detect_behind_remote(repo_path):
report.add(
severity="WARN", check_id="C-16",
@@ -1417,7 +1515,9 @@ def main() -> None:
group.add_argument("--repo", metavar="SLUG",
help="Registered repo slug (e.g. the-custodian)")
group.add_argument("--all", action="store_true",
help="Run checks against all registered repos with local_path")
help="Run checks against all repos with a resolvable path on this host")
group.add_argument("--here", metavar="PATH", nargs="?", const="",
help="Infer repo slug from git remote URL at PATH (default: CWD)")
parser.add_argument("--fix", action="store_true",
help="Apply auto-fixable issues (status drift, repo mismatch, etc.)")
parser.add_argument("--remote", action="store_true",
@@ -1435,25 +1535,53 @@ def main() -> None:
help="Output JSON instead of human-readable text")
args = parser.parse_args()
import os as _os
no_wb = getattr(args, "no_writeback", False)
do_fix = args.fix or args.remote
# --here: infer slug from git remote URL, then run as single-repo check/fix
if args.here is not None:
search_path = args.here or _os.getcwd()
inferred = _infer_slug_from_path(args.api_base, search_path)
if inferred is None:
print(
f"ERROR: No registered repo with a matching remote_url found at '{search_path}'.",
file=sys.stderr,
)
print(
" Register the repo first: POST /repos/ with remote_url set.",
file=sys.stderr,
)
sys.exit(1)
inferred_slug, git_root = inferred
print(f" Detected: {inferred_slug} ({git_root})")
if do_fix:
reports = [fix_repo(args.api_base, inferred_slug, git_root, no_writeback=no_wb)]
else:
reports = [check_repo(args.api_base, inferred_slug, git_root)]
# --remote --all: smart pull+fix across all repos
if args.remote and args.all:
elif args.remote and args.all:
reports = fix_all_remote(args.api_base, no_writeback=no_wb)
if not reports:
sys.exit(0)
else:
# Resolve repo list
hostname = socket.gethostname()
repo_slugs: list[str] = []
if args.all:
repos = _api_get(args.api_base, "/repos")
if not isinstance(repos, list):
print("ERROR: Could not fetch repos from state-hub API", file=sys.stderr)
sys.exit(1)
repo_slugs = [r["slug"] for r in repos if r.get("local_path")]
repo_slugs = [
r["slug"] for r in repos
if r.get("local_path") or (r.get("host_paths") or {}).get(hostname)
]
if not repo_slugs:
print("No repos with local_path registered.", file=sys.stderr)
print(
f"No repos with a path registered for host '{hostname}'.",
file=sys.stderr,
)
sys.exit(0)
else:
repo_slugs = [args.repo]