generated from coulomb/repo-seed
feat(repos): git-fingerprint-based machine-independent repo identity
Add git_fingerprint (root commit SHA-1) to managed_repos as a stable,
machine-independent identifier — identical across every clone regardless
of checkout path, remote URL, or SSH alias.
- Migration n1i2j3k4l5m6: adds git_fingerprint column + non-unique index
(non-unique to support repos that share ancestry via forks/splits)
- GET /repos/by-fingerprint?hash=<sha>[&remote_url=<url>]: lookup by
fingerprint; optional remote_url disambiguates shared-ancestry repos
- GET /repos/by-remote?url=<url>: fallback lookup by remote URL
- consistency_check.py --here [PATH]: auto-detects repo slug from any
local checkout via fingerprint (falls back to remote URL), then auto-
registers host_paths[hostname] so subsequent runs need no override
- --all now includes repos with host_paths[current_hostname], not just
those with local_path
- fix-consistency-here / check-consistency-here Makefile targets
- Fixed _api_get bug: httpx strips query strings when params={} is passed
- Backfilled fingerprints for 14 repos on this host
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
16
Makefile
16
Makefile
@@ -203,6 +203,22 @@ fix-consistency-remote:
|
||||
$(if $(NO_WRITEBACK),--no-writeback,); \
|
||||
e=$$?; [ $$e -eq 2 ] && exit 0 || exit $$e
|
||||
|
||||
## Infer repo slug from the checkout's git fingerprint (falling back to remote URL) and check: make check-consistency-here [REPO_PATH=/path/to/repo]
|
||||
## Omit REPO_PATH to use the Python script's CWD (i.e. pass an empty --here flag).
|
||||
check-consistency-here:
	# Run the checker against the repo detected at REPO_PATH (or the CWD when
	# REPO_PATH is empty). Exit status 2 from the script is mapped to success;
	# every other status is passed through unchanged.
	uv run python scripts/consistency_check.py \
		--here $(if $(REPO_PATH),"$(REPO_PATH)") \
		$(if $(API_BASE),--api-base "$(API_BASE)"); \
	status=$$?; \
	if [ $$status -eq 2 ]; then exit 0; fi; \
	exit $$status
|
||||
|
||||
## Infer repo slug from the checkout's git fingerprint (falling back to remote URL) and fix: make fix-consistency-here [REPO_PATH=/path/to/repo]
|
||||
fix-consistency-here:
	# Same as check-consistency-here but passes --fix so auto-fixable issues
	# are applied. Exit status 2 from the script is mapped to success; every
	# other status is passed through unchanged.
	uv run python scripts/consistency_check.py \
		--here $(if $(REPO_PATH),"$(REPO_PATH)") \
		--fix \
		$(if $(API_BASE),--api-base "$(API_BASE)"); \
	status=$$?; \
	if [ $$status -eq 2 ]; then exit 0; fi; \
	exit $$status
|
||||
|
||||
## Check all registered repos for ADR-001 consistency
|
||||
check-consistency-all:
|
||||
uv run python scripts/consistency_check.py --all $(if $(API_BASE),--api-base "$(API_BASE)",); \
|
||||
|
||||
@@ -27,6 +27,7 @@ class ManagedRepo(Base, TimestampMixin):
|
||||
topic_id: Mapped[uuid.UUID | None] = mapped_column(
|
||||
UUID(as_uuid=True), ForeignKey("topics.id", ondelete="SET NULL"), nullable=True
|
||||
)
|
||||
git_fingerprint: Mapped[str | None] = mapped_column(String(40), nullable=True, index=True)
|
||||
sbom_source: Mapped[str | None] = mapped_column(Text, nullable=True)
|
||||
last_sbom_at: Mapped[datetime | None] = mapped_column(
|
||||
DateTime(timezone=True), nullable=True
|
||||
|
||||
@@ -65,6 +65,7 @@ async def register_repo(
|
||||
name=body.name,
|
||||
local_path=body.local_path,
|
||||
remote_url=body.remote_url,
|
||||
git_fingerprint=body.git_fingerprint,
|
||||
description=body.description,
|
||||
topic_id=body.topic_id,
|
||||
)
|
||||
@@ -74,6 +75,43 @@ async def register_repo(
|
||||
return repo
|
||||
|
||||
|
||||
@router.get("/by-fingerprint", response_model=list[RepoRead])
async def get_repo_by_fingerprint(
    hash: str,
    remote_url: str | None = None,
    session: AsyncSession = Depends(get_session),
) -> list[ManagedRepo]:
    """Return every repo whose stored ``git_fingerprint`` equals ``hash``.

    The fingerprint is the root-commit SHA-1 as printed by
    ``git rev-list --max-parents=0 HEAD``; it is the same in every clone of a
    repository, so repos with common history (forks, monorepo splits) can
    share one value and more than one row may match.

    When ``remote_url`` is given (and non-empty) the result is additionally
    restricted to rows whose ``remote_url`` is exactly that string, which
    lets callers tell shared-ancestry repos apart.

    An empty list is returned when nothing matches.
    """
    # Collect the filter criteria first; multiple criteria are AND-ed.
    criteria = [ManagedRepo.git_fingerprint == hash]
    if remote_url:
        criteria.append(ManagedRepo.remote_url == remote_url)

    rows = await session.execute(select(ManagedRepo).where(*criteria))
    return list(rows.scalars().all())
|
||||
|
||||
|
||||
@router.get("/by-remote", response_model=RepoRead)
async def get_repo_by_remote_url(
    url: str,
    session: AsyncSession = Depends(get_session),
) -> ManagedRepo:
    """Look up a repo by its git remote URL (fallback; prefer /by-fingerprint).

    The match is an exact string comparison against ``remote_url``.

    Raises:
        HTTPException 404: no repo has this ``remote_url``.
        HTTPException 409: more than one repo has this ``remote_url``, so the
            lookup is ambiguous.
    """
    # Nothing visible here guarantees remote_url is unique, and
    # scalar_one_or_none() would raise MultipleResultsFound (an opaque 500)
    # on duplicates. Fetch at most two rows so ambiguity can be reported
    # explicitly without loading every duplicate.
    result = await session.execute(
        select(ManagedRepo).where(ManagedRepo.remote_url == url).limit(2)
    )
    matches = result.scalars().all()
    if not matches:
        raise HTTPException(status_code=404, detail=f"No repo with remote_url '{url}' found")
    if len(matches) > 1:
        raise HTTPException(
            status_code=409,
            detail=(
                f"Multiple repos share remote_url '{url}'; "
                "use /repos/by-fingerprint to disambiguate"
            ),
        )
    return matches[0]
|
||||
|
||||
|
||||
@router.get("/doi/summary", response_model=list[DoISummaryEntry])
|
||||
async def doi_summary(session: AsyncSession = Depends(get_session)) -> list[DoISummaryEntry]:
|
||||
"""Return DoI tier for all active repos, worst tier first.
|
||||
|
||||
@@ -11,6 +11,7 @@ class RepoCreate(BaseModel):
|
||||
name: str
|
||||
local_path: str | None = None
|
||||
remote_url: str | None = None
|
||||
git_fingerprint: str | None = None
|
||||
description: str | None = None
|
||||
topic_id: uuid.UUID | None = None
|
||||
|
||||
@@ -19,6 +20,7 @@ class RepoUpdate(BaseModel):
|
||||
name: str | None = None
|
||||
local_path: str | None = None
|
||||
remote_url: str | None = None
|
||||
git_fingerprint: str | None = None
|
||||
description: str | None = None
|
||||
topic_id: uuid.UUID | None = None
|
||||
last_state_synced_at: datetime | None = None
|
||||
@@ -40,6 +42,7 @@ class RepoRead(BaseModel):
|
||||
local_path: str | None = None
|
||||
host_paths: dict = {}
|
||||
remote_url: str | None = None
|
||||
git_fingerprint: str | None = None
|
||||
description: str | None = None
|
||||
status: str
|
||||
topic_id: uuid.UUID | None = None
|
||||
|
||||
@@ -0,0 +1,39 @@
|
||||
"""Add git_fingerprint to managed_repos
|
||||
|
||||
Stores the root commit SHA-1 of the git repository — a machine-independent
|
||||
identifier that is identical across every clone regardless of remote URL,
|
||||
checkout path, or protocol. Used by the consistency checker and other tools
|
||||
to match a locally checked-out repo to its state-hub record without relying
|
||||
on local_path or remote_url (both of which vary per machine).
|
||||
|
||||
Revision ID: n1i2j3k4l5m6
|
||||
Revises: m0h1i2j3k4l5
|
||||
Create Date: 2026-03-28
|
||||
"""
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
revision = 'n1i2j3k4l5m6'
|
||||
down_revision = 'm0h1i2j3k4l5'
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Add the nullable ``git_fingerprint`` column and a lookup index on it."""
    table = 'managed_repos'
    fingerprint_column = sa.Column('git_fingerprint', sa.String(40), nullable=True)
    op.add_column(table, fingerprint_column)

    # The index is deliberately non-unique: repos forked from a common
    # ancestor have the same root-commit SHA-1. It exists purely to speed up
    # lookups; callers disambiguate by remote_url when needed.
    op.create_index(
        'ix_managed_repos_git_fingerprint',
        table,
        ['git_fingerprint'],
        unique=False,
    )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Drop the fingerprint index, then the ``git_fingerprint`` column."""
    table = 'managed_repos'
    # Remove the index before the column it covers.
    op.drop_index('ix_managed_repos_git_fingerprint', table_name=table)
    op.drop_column(table, 'git_fingerprint')
|
||||
@@ -25,6 +25,7 @@ Checks:
|
||||
Usage:
|
||||
python scripts/consistency_check.py --repo SLUG [--fix] [--no-writeback] [--json] [--api-base URL]
|
||||
python scripts/consistency_check.py --all [--fix] [--no-writeback] [--json] [--api-base URL]
|
||||
python scripts/consistency_check.py --here [PATH] [--fix] [--no-writeback] [--json] [--api-base URL]
|
||||
|
||||
Exit codes:
|
||||
0 — ok (no FAILs; only WARNs/INFOs)
|
||||
@@ -274,11 +275,13 @@ def _inject_task_id_frontmatter_list(
|
||||
def _api_get(api_base: str, path: str, params: dict | None = None) -> Any:
|
||||
if not _HAS_HTTPX:
|
||||
return None
|
||||
if not path.endswith("/"):
|
||||
# Only append trailing slash to the path component, not to query strings
|
||||
if "?" not in path and not path.endswith("/"):
|
||||
path += "/"
|
||||
try:
|
||||
with _httpx.Client(base_url=api_base, timeout=10.0, follow_redirects=True) as c:
|
||||
r = c.get(path, params={k: v for k, v in (params or {}).items() if v is not None})
|
||||
filtered = {k: v for k, v in (params or {}).items() if v is not None}
|
||||
r = c.get(path, params=filtered if filtered else None)
|
||||
r.raise_for_status()
|
||||
return r.json()
|
||||
except Exception:
|
||||
@@ -334,6 +337,85 @@ def resolve_repo_path(repo: dict, override: str | None = None) -> str:
|
||||
return host_paths.get(hostname) or repo.get("local_path") or ""
|
||||
|
||||
|
||||
def _git_stdout(args: "list[str]", cwd: str) -> str:
    """Run ``git <args>`` in *cwd* and return its stripped stdout.

    Returns ``""`` when git is missing, *cwd* is unusable, or the command
    exits non-zero; stderr is discarded.
    """
    try:
        return subprocess.check_output(
            ["git", *args],
            cwd=cwd, stderr=subprocess.DEVNULL, text=True,
        ).strip()
    except (subprocess.CalledProcessError, FileNotFoundError, OSError):
        return ""


def _infer_slug_from_path(api_base: str, path: str) -> "tuple[str, str] | None":
    """Identify a registered repo from a local checkout path.

    Strategy (in order):
      1. Root-commit fingerprint — ``git rev-list --max-parents=0 HEAD`` produces
         the same SHA-1 on every clone, independent of remote URL or checkout path.
         Looked up via ``GET /repos/by-fingerprint?hash=<sha>``, first narrowed
         by the local ``origin`` URL, then by fingerprint alone.
      2. Remote URL fallback — exact string match against ``remote_url`` field.
         Less reliable across machines (SSH aliases, HTTP vs HTTPS, etc.) but
         useful when fingerprint is not yet stored.

    Args:
        api_base: Base URL of the API, passed through to ``_api_get``.
        path: Any directory inside the checkout to identify.

    Returns:
        ``(slug, git_root)`` on success, ``None`` if *path* is not inside a
        git work tree or no registered repo matches.
    """
    import urllib.parse as _up

    git_root = _git_stdout(["rev-parse", "--show-toplevel"], path)
    if not git_root:
        return None

    # Strategy 1: fingerprint lookup (most reliable).
    # rev-list prints one line per root commit; a history with merged
    # unrelated roots yields several. Use a single SHA so the value is a
    # valid 40-char fingerprint rather than a multi-line blob in the URL.
    # NOTE(review): the last line is chosen here — confirm this matches how
    # fingerprints are computed at registration/backfill time.
    root_commits = _git_stdout(["rev-list", "--max-parents=0", "HEAD"], git_root).split()
    fingerprint = root_commits[-1] if root_commits else ""

    # Get local remote URL once — used for both disambiguation and fallback.
    remote_url = _git_stdout(["remote", "get-url", "origin"], git_root)
    quoted_remote = _up.quote(remote_url, safe='') if remote_url else ""

    if fingerprint:
        # Try fingerprint + remote URL for precise match.
        if remote_url:
            candidates = _api_get(
                api_base,
                f"/repos/by-fingerprint?hash={fingerprint}&remote_url={quoted_remote}",
            )
            if isinstance(candidates, list) and len(candidates) == 1:
                return candidates[0]["slug"], git_root

        # Fingerprint alone (works when repos don't share ancestry).
        candidates = _api_get(api_base, f"/repos/by-fingerprint?hash={fingerprint}")
        if isinstance(candidates, list):
            if len(candidates) == 1:
                return candidates[0]["slug"], git_root
            if len(candidates) > 1:
                # Disambiguate: an exact match on the checkout directory name
                # is the strongest hint, so try it before the looser
                # "slug appears anywhere in the path" test (which can match
                # unrelated parent directories or longer names).
                checkout_dir = git_root.rstrip("/").rsplit("/", 1)[-1]
                for repo in candidates:
                    if repo["slug"] == checkout_dir:
                        return repo["slug"], git_root
                for repo in candidates:
                    if repo["slug"] in git_root:
                        return repo["slug"], git_root
                # Can't disambiguate — return first match with a warning.
                print(
                    f"  WARNING: {len(candidates)} repos share fingerprint {fingerprint[:12]}… "
                    f"— using '{candidates[0]['slug']}'. "
                    "Set remote_url on each repo for accurate matching.",
                    file=sys.stderr,
                )
                return candidates[0]["slug"], git_root

    # Strategy 2: remote URL exact match (fallback).
    if remote_url:
        repo = _api_get(api_base, f"/repos/by-remote?url={quoted_remote}")
        if repo and isinstance(repo, dict) and "slug" in repo:
            return repo["slug"], git_root

    return None
|
||||
|
||||
|
||||
def check_repo(api_base: str, repo_slug: str, repo_path_override: str | None = None) -> ConsistencyReport:
|
||||
"""Run all consistency checks for a registered repo."""
|
||||
repo = _api_get(api_base, f"/repos/{repo_slug}")
|
||||
@@ -1008,9 +1090,25 @@ def fix_repo(
|
||||
"""Run checks then apply all auto-fixable issues. Returns updated report."""
|
||||
report = check_repo(api_base, repo_slug, repo_path_override)
|
||||
|
||||
# Auto-register this machine's path in host_paths so future runs work
|
||||
# without --repo-path. Idempotent: skipped when already correct.
|
||||
repo_path = report.repo_path
|
||||
if repo_path:
|
||||
repo_record = _api_get(api_base, f"/repos/{repo_slug}")
|
||||
if repo_record:
|
||||
hostname = socket.gethostname()
|
||||
if (repo_record.get("host_paths") or {}).get(hostname) != repo_path:
|
||||
result = _api_post(
|
||||
api_base, f"/repos/{repo_slug}/paths/",
|
||||
{"host": hostname, "path": repo_path},
|
||||
)
|
||||
if result and "_error" not in result:
|
||||
report.fixes_applied.append(
|
||||
f"host_paths[{hostname}] → {repo_path}"
|
||||
)
|
||||
|
||||
# T02 — pull gate: warn and skip all write operations when local repo is
|
||||
# behind its remote tracking branch.
|
||||
repo_path = report.repo_path
|
||||
if repo_path and _detect_behind_remote(repo_path):
|
||||
report.add(
|
||||
severity="WARN", check_id="C-16",
|
||||
@@ -1417,7 +1515,9 @@ def main() -> None:
|
||||
group.add_argument("--repo", metavar="SLUG",
|
||||
help="Registered repo slug (e.g. the-custodian)")
|
||||
group.add_argument("--all", action="store_true",
|
||||
help="Run checks against all registered repos with local_path")
|
||||
help="Run checks against all repos with a resolvable path on this host")
|
||||
group.add_argument("--here", metavar="PATH", nargs="?", const="",
|
||||
help="Infer repo slug from git remote URL at PATH (default: CWD)")
|
||||
parser.add_argument("--fix", action="store_true",
|
||||
help="Apply auto-fixable issues (status drift, repo mismatch, etc.)")
|
||||
parser.add_argument("--remote", action="store_true",
|
||||
@@ -1435,25 +1535,53 @@ def main() -> None:
|
||||
help="Output JSON instead of human-readable text")
|
||||
args = parser.parse_args()
|
||||
|
||||
import os as _os
|
||||
no_wb = getattr(args, "no_writeback", False)
|
||||
do_fix = args.fix or args.remote
|
||||
|
||||
# --here: infer slug from the local checkout (git fingerprint, then remote URL), then run as single-repo check/fix
|
||||
if args.here is not None:
|
||||
search_path = args.here or _os.getcwd()
|
||||
inferred = _infer_slug_from_path(args.api_base, search_path)
|
||||
if inferred is None:
|
||||
print(
|
||||
f"ERROR: No registered repo with a matching remote_url found at '{search_path}'.",
|
||||
file=sys.stderr,
|
||||
)
|
||||
print(
|
||||
" Register the repo first: POST /repos/ with remote_url set.",
|
||||
file=sys.stderr,
|
||||
)
|
||||
sys.exit(1)
|
||||
inferred_slug, git_root = inferred
|
||||
print(f" Detected: {inferred_slug} ({git_root})")
|
||||
if do_fix:
|
||||
reports = [fix_repo(args.api_base, inferred_slug, git_root, no_writeback=no_wb)]
|
||||
else:
|
||||
reports = [check_repo(args.api_base, inferred_slug, git_root)]
|
||||
# --remote --all: smart pull+fix across all repos
|
||||
if args.remote and args.all:
|
||||
elif args.remote and args.all:
|
||||
reports = fix_all_remote(args.api_base, no_writeback=no_wb)
|
||||
if not reports:
|
||||
sys.exit(0)
|
||||
else:
|
||||
# Resolve repo list
|
||||
hostname = socket.gethostname()
|
||||
repo_slugs: list[str] = []
|
||||
if args.all:
|
||||
repos = _api_get(args.api_base, "/repos")
|
||||
if not isinstance(repos, list):
|
||||
print("ERROR: Could not fetch repos from state-hub API", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
repo_slugs = [r["slug"] for r in repos if r.get("local_path")]
|
||||
repo_slugs = [
|
||||
r["slug"] for r in repos
|
||||
if r.get("local_path") or (r.get("host_paths") or {}).get(hostname)
|
||||
]
|
||||
if not repo_slugs:
|
||||
print("No repos with local_path registered.", file=sys.stderr)
|
||||
print(
|
||||
f"No repos with a path registered for host '{hostname}'.",
|
||||
file=sys.stderr,
|
||||
)
|
||||
sys.exit(0)
|
||||
else:
|
||||
repo_slugs = [args.repo]
|
||||
|
||||
Reference in New Issue
Block a user