Files
state-hub/api/routers/repos.py
tegwick 1f8ef7f88b feat(repos): git-fingerprint-based machine-independent repo identity
Add git_fingerprint (root commit SHA-1) to managed_repos as a stable,
machine-independent identifier — identical across every clone regardless
of checkout path, remote URL, or SSH alias.

- Migration n1i2j3k4l5m6: adds git_fingerprint column + non-unique index
  (non-unique to support repos that share ancestry via forks/splits)
- GET /repos/by-fingerprint?hash=<sha>[&remote_url=<url>]: lookup by
  fingerprint; optional remote_url disambiguates shared-ancestry repos
- GET /repos/by-remote?url=<url>: fallback lookup by remote URL
- consistency_check.py --here [PATH]: auto-detects repo slug from any
  local checkout via fingerprint (falls back to remote URL), then auto-
  registers host_paths[hostname] so subsequent runs need no override
- --all now includes repos with host_paths[current_hostname], not just
  those with local_path
- fix-consistency-here / check-consistency-here Makefile targets
- Fixed _api_get bug: httpx strips query strings when params={} is passed
- Backfilled fingerprints for 14 repos on this host

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-28 23:55:06 +01:00

469 lines
18 KiB
Python

import asyncio
import uuid
from datetime import datetime, timezone
from fastapi import APIRouter, Depends, HTTPException, status
from sqlalchemy import case, func, select
from sqlalchemy.ext.asyncio import AsyncSession
from api.database import get_session
from api.doi_engine import compute_fingerprint, evaluate as _doi_evaluate
from api.models.doi_cache import DOICache
from api.models.domain import Domain
from api.models.managed_repo import ManagedRepo
from api.models.repo_goal import RepoGoal
from api.models.tpsc import TPSCSnapshot
from api.models.task import Task
from api.models.workstream import Workstream
from api.schemas.doi import DoICriterion, DoIReport, DoISummaryEntry
from api.schemas.managed_repo import (
DispatchTask,
DispatchWorkstream,
RepoCreate,
RepoDispatch,
RepoPathRegister,
RepoRead,
RepoUpdate,
)
router = APIRouter(prefix="/repos", tags=["repos"])
@router.get("/", response_model=list[RepoRead])
async def list_repos(
domain: str | None = None,
session: AsyncSession = Depends(get_session),
) -> list[ManagedRepo]:
q = select(ManagedRepo).order_by(ManagedRepo.name)
if domain:
domain_row = await session.execute(select(Domain).where(Domain.slug == domain))
domain_obj = domain_row.scalar_one_or_none()
if domain_obj is None:
raise HTTPException(status_code=404, detail=f"Domain '{domain}' not found")
q = q.where(ManagedRepo.domain_id == domain_obj.id)
result = await session.execute(q)
return list(result.scalars().all())
@router.post("/", response_model=RepoRead, status_code=status.HTTP_201_CREATED)
async def register_repo(
body: RepoCreate,
session: AsyncSession = Depends(get_session),
) -> ManagedRepo:
domain_row = await session.execute(select(Domain).where(Domain.slug == body.domain_slug))
domain_obj = domain_row.scalar_one_or_none()
if domain_obj is None:
raise HTTPException(status_code=404, detail=f"Domain '{body.domain_slug}' not found")
existing = await session.execute(select(ManagedRepo).where(ManagedRepo.slug == body.slug))
if existing.scalar_one_or_none():
raise HTTPException(status_code=409, detail=f"Repo slug '{body.slug}' already exists")
repo = ManagedRepo(
domain_id=domain_obj.id,
slug=body.slug,
name=body.name,
local_path=body.local_path,
remote_url=body.remote_url,
git_fingerprint=body.git_fingerprint,
description=body.description,
topic_id=body.topic_id,
)
session.add(repo)
await session.commit()
await session.refresh(repo)
return repo
@router.get("/by-fingerprint", response_model=list[RepoRead])
async def get_repo_by_fingerprint(
hash: str,
remote_url: str | None = None,
session: AsyncSession = Depends(get_session),
) -> list[ManagedRepo]:
"""Look up repos by git root-commit SHA-1 fingerprint.
The fingerprint is the output of ``git rev-list --max-parents=0 HEAD`` and
is identical across every clone of the same repository. Repos that share
git history (forks, monorepo splits) will have the same fingerprint.
Pass ``remote_url`` to narrow results to a specific remote — useful when
multiple repos share the same ancestor commit.
Returns an empty list if no match is found.
"""
q = select(ManagedRepo).where(ManagedRepo.git_fingerprint == hash)
if remote_url:
q = q.where(ManagedRepo.remote_url == remote_url)
result = await session.execute(q)
return list(result.scalars().all())
@router.get("/by-remote", response_model=RepoRead)
async def get_repo_by_remote_url(
url: str,
session: AsyncSession = Depends(get_session),
) -> ManagedRepo:
"""Look up a repo by its git remote URL (fallback; prefer /by-fingerprint)."""
result = await session.execute(select(ManagedRepo).where(ManagedRepo.remote_url == url))
repo = result.scalar_one_or_none()
if repo is None:
raise HTTPException(status_code=404, detail=f"No repo with remote_url '{url}' found")
return repo
@router.get("/doi/summary", response_model=list[DoISummaryEntry])
async def doi_summary(session: AsyncSession = Depends(get_session)) -> list[DoISummaryEntry]:
"""Return DoI tier for all active repos, worst tier first.
Results are cached in doi_cache. A repo is only re-evaluated when its
fingerprint changes (repo record updated, new TPSC snapshot, goal change,
or a key file mtime changes on disk).
"""
repos_result = await session.execute(
select(ManagedRepo).where(ManagedRepo.status == "active").order_by(ManagedRepo.name)
)
repos = list(repos_result.scalars().all())
repo_ids = [r.id for r in repos]
id_to_slug = {r.id: r.slug for r in repos}
# ── Bulk DB queries for fingerprint inputs ────────────────────────────────
domains_result = await session.execute(select(Domain))
domain_obj_map = {d.id: d for d in domains_result.scalars().all()}
domain_map = {d.id: d.slug for d in domain_obj_map.values()}
domain_status = {d.slug: d.status for d in domain_obj_map.values()}
# Latest TPSC snapshot timestamp per repo (for fingerprint + C9 count)
tpsc_result = await session.execute(
select(TPSCSnapshot.repo_id,
func.count().label("cnt"),
func.max(TPSCSnapshot.snapshot_at).label("latest"))
.where(TPSCSnapshot.repo_id.in_(repo_ids))
.group_by(TPSCSnapshot.repo_id)
)
tpsc_by_id = {row.repo_id: row for row in tpsc_result}
tpsc_snap_counts = {id_to_slug[rid]: row.cnt for rid, row in tpsc_by_id.items() if rid in id_to_slug}
tpsc_snap_latest = {id_to_slug[rid]: str(row.latest) for rid, row in tpsc_by_id.items() if rid in id_to_slug}
# Latest goal updated_at + active count per repo (for fingerprint + C10)
goals_result = await session.execute(
select(RepoGoal.repo_id,
func.count().label("total"),
func.sum(case((RepoGoal.status == "active", 1), else_=0)).label("active_cnt"),
func.max(RepoGoal.updated_at).label("latest"))
.where(RepoGoal.repo_id.in_(repo_ids))
.group_by(RepoGoal.repo_id)
)
goals_by_id = {row.repo_id: row for row in goals_result}
active_goal_counts = {id_to_slug[rid]: int(row.active_cnt or 0) for rid, row in goals_by_id.items() if rid in id_to_slug}
goals_latest = {id_to_slug[rid]: str(row.latest) for rid, row in goals_by_id.items() if rid in id_to_slug}
# Load existing cache rows
cache_result = await session.execute(
select(DOICache).where(DOICache.repo_id.in_(repo_ids))
)
cache_by_repo_id = {row.repo_id: row for row in cache_result.scalars().all()}
# ─────────────────────────────────────────────────────────────────────────
prefetch = {
"domain_status": domain_status,
"tpsc_snap_counts": tpsc_snap_counts,
"active_goal_counts": active_goal_counts,
}
async def _get_or_refresh(repo: ManagedRepo) -> DoISummaryEntry:
slug = repo.slug
repo_dict = {
"slug": slug,
"domain_slug": domain_map.get(repo.domain_id),
"local_path": repo.local_path,
"remote_url": repo.remote_url,
"host_paths": repo.host_paths or {},
"last_sbom_at": str(repo.last_sbom_at) if repo.last_sbom_at else None,
"updated_at": str(repo.updated_at) if repo.updated_at else "",
}
fp = compute_fingerprint(
repo_dict,
tpsc_snap_latest.get(slug),
goals_latest.get(slug),
)
cached = cache_by_repo_id.get(repo.id)
if cached and cached.fingerprint == fp:
# Cache hit — return stored result
return DoISummaryEntry(
repo_slug=slug,
domain_slug=domain_map.get(repo.domain_id),
tier=cached.tier,
core_pass=cached.core_pass,
standard_pass=cached.standard_pass,
full_pass=cached.full_pass,
checked_at=cached.checked_at.isoformat(),
)
# Cache miss — evaluate and store
report = await _doi_evaluate(repo_dict, skip_consistency=True, prefetch=prefetch)
now = datetime.now(tz=timezone.utc)
if cached:
cached.tier = report.tier
cached.core_pass = report.core_pass
cached.standard_pass = report.standard_pass
cached.full_pass = report.full_pass
cached.criteria = [{"id": c.id, "label": c.label, "tier": c.tier,
"status": c.status, "detail": c.detail}
for c in report.criteria]
cached.fingerprint = fp
cached.checked_at = now
cached.updated_at = now
else:
session.add(DOICache(
repo_id=repo.id,
tier=report.tier,
core_pass=report.core_pass,
standard_pass=report.standard_pass,
full_pass=report.full_pass,
criteria=[{"id": c.id, "label": c.label, "tier": c.tier,
"status": c.status, "detail": c.detail}
for c in report.criteria],
fingerprint=fp,
checked_at=now,
updated_at=now,
))
return DoISummaryEntry(
repo_slug=slug,
domain_slug=domain_map.get(repo.domain_id),
tier=report.tier,
core_pass=report.core_pass,
standard_pass=report.standard_pass,
full_pass=report.full_pass,
checked_at=now.isoformat(),
)
entries: list[DoISummaryEntry] = list(await asyncio.gather(*[_get_or_refresh(r) for r in repos]))
await session.commit()
tier_order = {"none": 0, "core": 1, "standard": 2, "full": 3}
entries.sort(key=lambda e: tier_order.get(e.tier, 0))
return entries
@router.get("/{slug}/doi", response_model=DoIReport)
async def get_repo_doi(
slug: str,
force_refresh: bool = False,
session: AsyncSession = Depends(get_session),
) -> DoIReport:
"""Evaluate the 14 DoI criteria for a single repo (full check including C7/C13).
Results are cached by fingerprint. Pass ?force_refresh=true to bypass the cache.
"""
repo = await _get_repo_by_slug(slug, session)
domain_result = await session.execute(select(Domain).where(Domain.id == repo.domain_id))
domain_obj = domain_result.scalar_one_or_none()
# Fingerprint inputs for this single repo
tpsc_row = (await session.execute(
select(func.count().label("cnt"), func.max(TPSCSnapshot.snapshot_at).label("latest"))
.where(TPSCSnapshot.repo_id == repo.id)
)).one()
goal_row = (await session.execute(
select(func.max(RepoGoal.updated_at).label("latest"))
.where(RepoGoal.repo_id == repo.id)
)).one()
repo_dict = {
"slug": repo.slug,
"domain_slug": domain_obj.slug if domain_obj else None,
"local_path": repo.local_path,
"remote_url": repo.remote_url,
"host_paths": repo.host_paths or {},
"last_sbom_at": str(repo.last_sbom_at) if repo.last_sbom_at else None,
"updated_at": str(repo.updated_at) if repo.updated_at else "",
}
fp = compute_fingerprint(repo_dict, str(tpsc_row.latest) if tpsc_row.latest else None,
str(goal_row.latest) if goal_row.latest else None)
# Check cache (unless force_refresh)
cached = (await session.execute(
select(DOICache).where(DOICache.repo_id == repo.id)
)).scalar_one_or_none()
if not force_refresh and cached and cached.fingerprint == fp and cached.criteria:
return DoIReport(
repo_slug=slug,
tier=cached.tier,
core_pass=cached.core_pass,
standard_pass=cached.standard_pass,
full_pass=cached.full_pass,
checked_at=cached.checked_at.isoformat(),
criteria=[DoICriterion(**c) for c in cached.criteria],
)
# Full evaluation (includes C7/C13 consistency subprocesses)
report = await _doi_evaluate(repo_dict)
now = datetime.now(tz=timezone.utc)
criteria_json = [{"id": c.id, "label": c.label, "tier": c.tier,
"status": c.status, "detail": c.detail} for c in report.criteria]
if cached:
cached.tier = report.tier; cached.core_pass = report.core_pass
cached.standard_pass = report.standard_pass; cached.full_pass = report.full_pass
cached.criteria = criteria_json; cached.fingerprint = fp
cached.checked_at = now; cached.updated_at = now
else:
session.add(DOICache(repo_id=repo.id, tier=report.tier,
core_pass=report.core_pass, standard_pass=report.standard_pass,
full_pass=report.full_pass, criteria=criteria_json,
fingerprint=fp, checked_at=now, updated_at=now))
await session.commit()
return DoIReport(
repo_slug=report.repo_slug, tier=report.tier,
core_pass=report.core_pass, standard_pass=report.standard_pass,
full_pass=report.full_pass, checked_at=report.checked_at,
criteria=[DoICriterion(id=c.id, label=c.label, tier=c.tier,
status=c.status, detail=c.detail) for c in report.criteria],
)
@router.get("/{slug}/", response_model=RepoRead)
async def get_repo(
slug: str,
session: AsyncSession = Depends(get_session),
) -> ManagedRepo:
return await _get_repo_by_slug(slug, session)
@router.patch("/{slug}/", response_model=RepoRead)
async def update_repo(
slug: str,
body: RepoUpdate,
session: AsyncSession = Depends(get_session),
) -> ManagedRepo:
repo = await _get_repo_by_slug(slug, session)
for field, value in body.model_dump(exclude_unset=True).items():
setattr(repo, field, value)
await session.commit()
await session.refresh(repo)
return repo
@router.post("/{slug}/paths/", response_model=RepoRead)
async def register_host_path(
slug: str,
body: RepoPathRegister,
session: AsyncSession = Depends(get_session),
) -> ManagedRepo:
"""Register or update the local path for a specific host.
Merges {"host": path} into host_paths without overwriting other entries.
Use this when a repo lives at a different absolute path on different machines.
"""
repo = await _get_repo_by_slug(slug, session)
updated = dict(repo.host_paths or {})
updated[body.host] = body.path
repo.host_paths = updated
await session.commit()
await session.refresh(repo)
return repo
@router.patch("/{slug}/archive", response_model=RepoRead)
async def archive_repo(
slug: str,
session: AsyncSession = Depends(get_session),
) -> ManagedRepo:
repo = await _get_repo_by_slug(slug, session)
repo.status = "archived"
await session.commit()
await session.refresh(repo)
return repo
@router.get("/{slug}/dispatch", response_model=RepoDispatch)
async def get_repo_dispatch(
slug: str,
session: AsyncSession = Depends(get_session),
) -> RepoDispatch:
"""Return active workstreams, pending tasks, and goal for a repo.
This endpoint is the foundation for autonomous agent sessions: an agent can
call it at session start to discover what work is pending without needing to
read state-hub summary or scan workplan files manually.
"""
repo = await _get_repo_by_slug(slug, session)
# Active goal
goal_result = await session.execute(
select(RepoGoal)
.where(RepoGoal.repo_id == repo.id, RepoGoal.status == "active")
.order_by(RepoGoal.priority)
.limit(1)
)
goal_obj = goal_result.scalar_one_or_none()
active_goal = None
if goal_obj:
active_goal = {
"id": str(goal_obj.id),
"title": goal_obj.title,
"description": goal_obj.description,
"priority": goal_obj.priority,
}
# Active workstreams
ws_result = await session.execute(
select(Workstream)
.where(Workstream.repo_id == repo.id, Workstream.status == "active")
.order_by(Workstream.created_at)
)
workstreams = list(ws_result.scalars().all())
dispatch_workstreams: list[DispatchWorkstream] = []
all_interventions: list[DispatchTask] = []
for ws in workstreams:
task_result = await session.execute(
select(Task)
.where(Task.workstream_id == ws.id, Task.status.in_(["todo", "in_progress"]))
.order_by(Task.created_at)
)
tasks = list(task_result.scalars().all())
pending = [
DispatchTask(
id=t.id,
title=t.title,
priority=t.priority,
status=t.status,
needs_human=t.needs_human,
)
for t in tasks
]
interventions = [t for t in pending if t.needs_human]
all_interventions.extend(interventions)
dispatch_workstreams.append(
DispatchWorkstream(
id=ws.id,
title=ws.title,
status=ws.status,
pending_tasks=pending,
)
)
return RepoDispatch(
repo_slug=slug,
active_goal=active_goal,
active_workstreams=dispatch_workstreams,
human_interventions=all_interventions,
last_state_synced_at=repo.last_state_synced_at,
)
async def _get_repo_by_slug(slug: str, session: AsyncSession) -> ManagedRepo:
result = await session.execute(select(ManagedRepo).where(ManagedRepo.slug == slug))
repo = result.scalar_one_or_none()
if repo is None:
raise HTTPException(status_code=404, detail=f"Repo '{slug}' not found")
return repo