diff --git a/CLAUDE.md b/CLAUDE.md index ebdad5e..a109028 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -122,6 +122,13 @@ Every Claude Code session in this repository must follow this ritual: cd state-hub && make fix-consistency-remote REPO= ``` + **On a machine where the checkout path differs from what's in the DB**, use + `--here` to auto-detect the slug from the git root-commit fingerprint: + ```bash + cd state-hub && make fix-consistency-here REPO_PATH=/path/to/repo + ``` + This also auto-registers `host_paths[hostname]` so subsequent runs need no override. + **Workplan ↔ DB sync rule (prevents ghost workstreams):** When creating a new workstream backed by a workplan file, **always write the file first, then run `make fix-consistency`** — never call `create_workstream()` / diff --git a/state-hub/Makefile b/state-hub/Makefile index a561d91..35f21c9 100644 --- a/state-hub/Makefile +++ b/state-hub/Makefile @@ -203,6 +203,22 @@ fix-consistency-remote: $(if $(NO_WRITEBACK),--no-writeback,); \ e=$$?; [ $$e -eq 2 ] && exit 0 || exit $$e +## Infer repo slug from the git root-commit fingerprint (remote URL as fallback) and check: make check-consistency-here [REPO_PATH=/path/to/repo] +## Omit REPO_PATH to use the Python script's CWD (i.e. pass a bare --here flag with no path). 
+check-consistency-here: + uv run python scripts/consistency_check.py \ + --here $(if $(REPO_PATH),"$(REPO_PATH)",) \ + $(if $(API_BASE),--api-base "$(API_BASE)",); \ + e=$$?; [ $$e -eq 2 ] && exit 0 || exit $$e + +## Infer repo slug from the git root-commit fingerprint (remote URL as fallback) and fix: make fix-consistency-here [REPO_PATH=/path/to/repo] +fix-consistency-here: + uv run python scripts/consistency_check.py \ + --here $(if $(REPO_PATH),"$(REPO_PATH)",) \ + --fix \ + $(if $(API_BASE),--api-base "$(API_BASE)",); \ + e=$$?; [ $$e -eq 2 ] && exit 0 || exit $$e + ## Check all registered repos for ADR-001 consistency check-consistency-all: uv run python scripts/consistency_check.py --all $(if $(API_BASE),--api-base "$(API_BASE)",); \ diff --git a/state-hub/api/models/managed_repo.py b/state-hub/api/models/managed_repo.py index 8ce6027..9e1f1ed 100644 --- a/state-hub/api/models/managed_repo.py +++ b/state-hub/api/models/managed_repo.py @@ -27,6 +27,7 @@ class ManagedRepo(Base, TimestampMixin): topic_id: Mapped[uuid.UUID | None] = mapped_column( UUID(as_uuid=True), ForeignKey("topics.id", ondelete="SET NULL"), nullable=True ) + git_fingerprint: Mapped[str | None] = mapped_column(String(40), nullable=True, index=True) sbom_source: Mapped[str | None] = mapped_column(Text, nullable=True) last_sbom_at: Mapped[datetime | None] = mapped_column( DateTime(timezone=True), nullable=True diff --git a/state-hub/api/routers/repos.py b/state-hub/api/routers/repos.py index 02e39c2..5365c94 100644 --- a/state-hub/api/routers/repos.py +++ b/state-hub/api/routers/repos.py @@ -65,6 +65,7 @@ async def register_repo( name=body.name, local_path=body.local_path, remote_url=body.remote_url, + git_fingerprint=body.git_fingerprint, description=body.description, topic_id=body.topic_id, ) @@ -74,6 +75,43 @@ async def register_repo( return repo +@router.get("/by-fingerprint", response_model=list[RepoRead]) +async def get_repo_by_fingerprint( + hash: str, + remote_url: str | None = None, + session: AsyncSession = 
Depends(get_session), +) -> list[ManagedRepo]: + """Look up repos by git root-commit SHA-1 fingerprint. + + The fingerprint is the output of ``git rev-list --max-parents=0 HEAD`` and + is identical across every clone of the same repository. Repos that share + git history (forks, monorepo splits) will have the same fingerprint. + + Pass ``remote_url`` to narrow results to a specific remote — useful when + multiple repos share the same ancestor commit. + + Returns an empty list if no match is found. + """ + q = select(ManagedRepo).where(ManagedRepo.git_fingerprint == hash) + if remote_url: + q = q.where(ManagedRepo.remote_url == remote_url) + result = await session.execute(q) + return list(result.scalars().all()) + + +@router.get("/by-remote", response_model=RepoRead) +async def get_repo_by_remote_url( + url: str, + session: AsyncSession = Depends(get_session), +) -> ManagedRepo: + """Look up a repo by its git remote URL (fallback; prefer /by-fingerprint).""" + result = await session.execute(select(ManagedRepo).where(ManagedRepo.remote_url == url)) + repo = result.scalar_one_or_none() + if repo is None: + raise HTTPException(status_code=404, detail=f"No repo with remote_url '{url}' found") + return repo + + @router.get("/doi/summary", response_model=list[DoISummaryEntry]) async def doi_summary(session: AsyncSession = Depends(get_session)) -> list[DoISummaryEntry]: """Return DoI tier for all active repos, worst tier first. 
diff --git a/state-hub/api/schemas/managed_repo.py b/state-hub/api/schemas/managed_repo.py index d549bd3..ebda3f8 100644 --- a/state-hub/api/schemas/managed_repo.py +++ b/state-hub/api/schemas/managed_repo.py @@ -11,6 +11,7 @@ class RepoCreate(BaseModel): name: str local_path: str | None = None remote_url: str | None = None + git_fingerprint: str | None = None description: str | None = None topic_id: uuid.UUID | None = None @@ -19,6 +20,7 @@ class RepoUpdate(BaseModel): name: str | None = None local_path: str | None = None remote_url: str | None = None + git_fingerprint: str | None = None description: str | None = None topic_id: uuid.UUID | None = None last_state_synced_at: datetime | None = None @@ -40,6 +42,7 @@ class RepoRead(BaseModel): local_path: str | None = None host_paths: dict = {} remote_url: str | None = None + git_fingerprint: str | None = None description: str | None = None status: str topic_id: uuid.UUID | None = None diff --git a/state-hub/migrations/versions/n1i2j3k4l5m6_add_git_fingerprint_to_repos.py b/state-hub/migrations/versions/n1i2j3k4l5m6_add_git_fingerprint_to_repos.py new file mode 100644 index 0000000..d11dc21 --- /dev/null +++ b/state-hub/migrations/versions/n1i2j3k4l5m6_add_git_fingerprint_to_repos.py @@ -0,0 +1,39 @@ +"""Add git_fingerprint to managed_repos + +Stores the root commit SHA-1 of the git repository — a machine-independent +identifier that is identical across every clone regardless of remote URL, +checkout path, or protocol. Used by the consistency checker and other tools +to match a locally checked-out repo to its state-hub record without relying +on local_path or remote_url (both of which vary per machine). 
+ +Revision ID: n1i2j3k4l5m6 +Revises: m0h1i2j3k4l5 +Create Date: 2026-03-28 +""" +from alembic import op +import sqlalchemy as sa + +revision = 'n1i2j3k4l5m6' +down_revision = 'm0h1i2j3k4l5' +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.add_column( + 'managed_repos', + sa.Column('git_fingerprint', sa.String(40), nullable=True), + ) + # Non-unique index: repos forked from the same ancestor share a root commit + # SHA-1. The index speeds up lookup; disambiguate by remote_url when needed. + op.create_index( + 'ix_managed_repos_git_fingerprint', + 'managed_repos', + ['git_fingerprint'], + unique=False, + ) + + +def downgrade() -> None: + op.drop_index('ix_managed_repos_git_fingerprint', table_name='managed_repos') + op.drop_column('managed_repos', 'git_fingerprint') diff --git a/state-hub/scripts/consistency_check.py b/state-hub/scripts/consistency_check.py index a141f83..0b2b726 100644 --- a/state-hub/scripts/consistency_check.py +++ b/state-hub/scripts/consistency_check.py @@ -25,6 +25,7 @@ Checks: Usage: python scripts/consistency_check.py --repo SLUG [--fix] [--no-writeback] [--json] [--api-base URL] python scripts/consistency_check.py --all [--fix] [--no-writeback] [--json] [--api-base URL] + python scripts/consistency_check.py --here [PATH] [--fix] [--no-writeback] [--json] [--api-base URL] Exit codes: 0 — ok (no FAILs; only WARNs/INFOs) @@ -274,11 +275,13 @@ def _inject_task_id_frontmatter_list( def _api_get(api_base: str, path: str, params: dict | None = None) -> Any: if not _HAS_HTTPX: return None - if not path.endswith("/"): + # Only append trailing slash to the path component, not to query strings + if "?" 
not in path and not path.endswith("/"): path += "/" try: with _httpx.Client(base_url=api_base, timeout=10.0, follow_redirects=True) as c: - r = c.get(path, params={k: v for k, v in (params or {}).items() if v is not None}) + filtered = {k: v for k, v in (params or {}).items() if v is not None} + r = c.get(path, params=filtered if filtered else None) r.raise_for_status() return r.json() except Exception: @@ -334,6 +337,85 @@ def resolve_repo_path(repo: dict, override: str | None = None) -> str: return host_paths.get(hostname) or repo.get("local_path") or "" +def _infer_slug_from_path(api_base: str, path: str) -> "tuple[str, str] | None": + """Identify a registered repo from a local checkout path. + + Strategy (in order): + 1. Root-commit fingerprint — ``git rev-list --max-parents=0 HEAD`` produces + the same SHA-1 on every clone, independent of remote URL or checkout path. + Looked up via ``GET /repos/by-fingerprint?hash=``. + 2. Remote URL fallback — exact string match against ``remote_url`` field. + Less reliable across machines (SSH aliases, HTTP vs HTTPS, etc.) but + useful when fingerprint is not yet stored. + + Returns ``(slug, git_root)`` on success, ``None`` if no match found. 
+ """ + try: + git_root = subprocess.check_output( + ["git", "rev-parse", "--show-toplevel"], + cwd=path, stderr=subprocess.DEVNULL, text=True, + ).strip() + except (subprocess.CalledProcessError, FileNotFoundError, OSError): + return None + + # Strategy 1: fingerprint lookup (most reliable) + try: + fingerprint = subprocess.check_output( + ["git", "rev-list", "--max-parents=0", "HEAD"], + cwd=git_root, stderr=subprocess.DEVNULL, text=True, + ).strip() + except (subprocess.CalledProcessError, FileNotFoundError, OSError): + fingerprint = "" + + # Get local remote URL once — used for both disambiguation and fallback + try: + remote_url = subprocess.check_output( + ["git", "remote", "get-url", "origin"], + cwd=git_root, stderr=subprocess.DEVNULL, text=True, + ).strip() + except (subprocess.CalledProcessError, FileNotFoundError, OSError): + remote_url = "" + + if fingerprint: + # Try fingerprint + remote URL for precise match + if remote_url: + import urllib.parse as _up + candidates = _api_get( + api_base, + f"/repos/by-fingerprint?hash={fingerprint}&remote_url={_up.quote(remote_url, safe='')}", + ) + if isinstance(candidates, list) and len(candidates) == 1: + return candidates[0]["slug"], git_root + + # Fingerprint alone (works when repos don't share ancestry) + candidates = _api_get(api_base, f"/repos/by-fingerprint?hash={fingerprint}") + if isinstance(candidates, list): + if len(candidates) == 1: + return candidates[0]["slug"], git_root + if len(candidates) > 1: + # Disambiguate: prefer the repo whose slug appears in the git_root path + for repo in candidates: + if repo["slug"] in git_root: + return repo["slug"], git_root + # Can't disambiguate — return first match with a warning + print( + f" WARNING: {len(candidates)} repos share fingerprint {fingerprint[:12]}… " + f"— using '{candidates[0]['slug']}'. 
" + "Set remote_url on each repo for accurate matching.", + file=sys.stderr, + ) + return candidates[0]["slug"], git_root + + # Strategy 2: remote URL exact match (fallback) + if remote_url: + import urllib.parse as _up + repo = _api_get(api_base, f"/repos/by-remote?url={_up.quote(remote_url, safe='')}") + if repo and isinstance(repo, dict) and "slug" in repo: + return repo["slug"], git_root + + return None + + def check_repo(api_base: str, repo_slug: str, repo_path_override: str | None = None) -> ConsistencyReport: """Run all consistency checks for a registered repo.""" repo = _api_get(api_base, f"/repos/{repo_slug}") @@ -1008,9 +1090,25 @@ def fix_repo( """Run checks then apply all auto-fixable issues. Returns updated report.""" report = check_repo(api_base, repo_slug, repo_path_override) + # Auto-register this machine's path in host_paths so future runs work + # without --repo-path. Idempotent: skipped when already correct. + repo_path = report.repo_path + if repo_path: + repo_record = _api_get(api_base, f"/repos/{repo_slug}") + if repo_record: + hostname = socket.gethostname() + if (repo_record.get("host_paths") or {}).get(hostname) != repo_path: + result = _api_post( + api_base, f"/repos/{repo_slug}/paths/", + {"host": hostname, "path": repo_path}, + ) + if result and "_error" not in result: + report.fixes_applied.append( + f"host_paths[{hostname}] → {repo_path}" + ) + # T02 — pull gate: warn and skip all write operations when local repo is # behind its remote tracking branch. - repo_path = report.repo_path if repo_path and _detect_behind_remote(repo_path): report.add( severity="WARN", check_id="C-16", @@ -1417,7 +1515,9 @@ def main() -> None: group.add_argument("--repo", metavar="SLUG", help="Registered repo slug (e.g. 
the-custodian)") group.add_argument("--all", action="store_true", - help="Run checks against all registered repos with local_path") + help="Run checks against all repos with a resolvable path on this host") + group.add_argument("--here", metavar="PATH", nargs="?", const="", + help="Infer repo slug from git remote URL at PATH (default: CWD)") parser.add_argument("--fix", action="store_true", help="Apply auto-fixable issues (status drift, repo mismatch, etc.)") parser.add_argument("--remote", action="store_true", @@ -1435,25 +1535,53 @@ def main() -> None: help="Output JSON instead of human-readable text") args = parser.parse_args() + import os as _os no_wb = getattr(args, "no_writeback", False) do_fix = args.fix or args.remote + # --here: infer slug from git remote URL, then run as single-repo check/fix + if args.here is not None: + search_path = args.here or _os.getcwd() + inferred = _infer_slug_from_path(args.api_base, search_path) + if inferred is None: + print( + f"ERROR: No registered repo with a matching remote_url found at '{search_path}'.", + file=sys.stderr, + ) + print( + " Register the repo first: POST /repos/ with remote_url set.", + file=sys.stderr, + ) + sys.exit(1) + inferred_slug, git_root = inferred + print(f" Detected: {inferred_slug} ({git_root})") + if do_fix: + reports = [fix_repo(args.api_base, inferred_slug, git_root, no_writeback=no_wb)] + else: + reports = [check_repo(args.api_base, inferred_slug, git_root)] # --remote --all: smart pull+fix across all repos - if args.remote and args.all: + elif args.remote and args.all: reports = fix_all_remote(args.api_base, no_writeback=no_wb) if not reports: sys.exit(0) else: # Resolve repo list + hostname = socket.gethostname() repo_slugs: list[str] = [] if args.all: repos = _api_get(args.api_base, "/repos") if not isinstance(repos, list): print("ERROR: Could not fetch repos from state-hub API", file=sys.stderr) sys.exit(1) - repo_slugs = [r["slug"] for r in repos if r.get("local_path")] + repo_slugs = [ 
+ r["slug"] for r in repos + if r.get("local_path") or (r.get("host_paths") or {}).get(hostname) + ] if not repo_slugs: - print("No repos with local_path registered.", file=sys.stderr) + print( + f"No repos with a path registered for host '{hostname}'.", + file=sys.stderr, + ) sys.exit(0) else: repo_slugs = [args.repo]