Files
state-hub/api/routers/repos.py

729 lines
26 KiB
Python

import asyncio
import json
import os
import re
import socket
import subprocess
import sys
import uuid
from datetime import datetime, timezone
from pathlib import Path
from fastapi import APIRouter, Depends, HTTPException, Response, status
from sqlalchemy import case, func, select
from sqlalchemy.orm import noload
from sqlalchemy.ext.asyncio import AsyncSession
from api.config import settings
from api.database import get_session
from api.events import EventEnvelope, publish_event
from api.doi_engine import (
compute_fingerprint,
evaluate as _doi_evaluate,
evaluate_scope_health,
resolve_repo_path,
)
from api.models.doi_cache import DOICache
from api.models.domain import Domain
from api.models.interface_change import InterfaceChange
from api.models.managed_repo import ManagedRepo
from api.models.repo_goal import RepoGoal
from api.models.tpsc import TPSCSnapshot
from api.models.task import Task
from api.models.workstream import Workstream
from api.schemas.doi import DoICriterion, DoIReport, DoISummaryEntry
from api.schemas.managed_repo import (
DispatchTask,
DispatchWorkstream,
PendingInterfaceChange,
RepoCreate,
RepoDispatch,
RepoOnboardRequest,
RepoOnboardResult,
RepoPathRegister,
RepoRead,
RepoScopeHealth,
RepoUpdate,
ScopeIssueDetail,
)
# Router for the managed-repo endpoints; every route declared below is served
# under the "/repos" prefix and tagged "repos".
router = APIRouter(prefix="/repos", tags=["repos"])
@router.get("/", response_model=list[RepoRead])
async def list_repos(
    response: Response,
    domain: str | None = None,
    session: AsyncSession = Depends(get_session),
) -> list[ManagedRepo]:
    """List managed repos ordered by name, optionally limited to one domain.

    A ``Cache-Control`` header is set on the response. The ``goals``
    relationship is explicitly not loaded for this listing.

    Raises:
        HTTPException: 404 when ``domain`` is given but no domain has that slug.
    """
    response.headers["Cache-Control"] = "max-age=60, stale-while-revalidate=30"

    stmt = select(ManagedRepo).options(noload(ManagedRepo.goals)).order_by(ManagedRepo.name)

    if domain:
        lookup = await session.execute(select(Domain).where(Domain.slug == domain))
        matched_domain = lookup.scalar_one_or_none()
        if matched_domain is None:
            raise HTTPException(status_code=404, detail=f"Domain '{domain}' not found")
        stmt = stmt.where(ManagedRepo.domain_id == matched_domain.id)

    rows = await session.execute(stmt)
    return list(rows.scalars().all())
# Strong references to in-flight event-publish tasks. The event loop itself only
# holds weak references to tasks, so a task nobody references may be
# garbage-collected before it completes.
_pending_publish_tasks: set[asyncio.Task] = set()


@router.post("/", response_model=RepoRead, status_code=status.HTTP_201_CREATED)
async def register_repo(
    body: RepoCreate,
    session: AsyncSession = Depends(get_session),
) -> ManagedRepo:
    """Create a managed repo in an existing domain and announce it.

    After the row is committed, an ``org.statehub.repo.registered`` event is
    published in the background; the response does not wait for it.

    Raises:
        HTTPException: 404 when ``body.domain_slug`` matches no domain,
            409 when a repo with ``body.slug`` already exists.
    """
    domain_row = await session.execute(select(Domain).where(Domain.slug == body.domain_slug))
    domain_obj = domain_row.scalar_one_or_none()
    if domain_obj is None:
        raise HTTPException(status_code=404, detail=f"Domain '{body.domain_slug}' not found")

    existing = await session.execute(select(ManagedRepo).where(ManagedRepo.slug == body.slug))
    if existing.scalar_one_or_none():
        raise HTTPException(status_code=409, detail=f"Repo slug '{body.slug}' already exists")

    repo = ManagedRepo(
        domain_id=domain_obj.id,
        slug=body.slug,
        name=body.name,
        local_path=body.local_path,
        remote_url=body.remote_url,
        git_fingerprint=body.git_fingerprint,
        description=body.description,
        topic_id=body.topic_id,
    )
    session.add(repo)
    await session.commit()
    await session.refresh(repo)

    subject = "org.statehub.repo.registered"
    envelope = EventEnvelope.new(
        subject,
        attributes={
            "repo_id": str(repo.id),
            "repo_slug": repo.slug,
            "domain_slug": body.domain_slug,
            "remote_url": repo.remote_url,
            "local_path": repo.local_path,
        },
    )

    # Fire-and-forget, but keep a reference until the task finishes so it
    # cannot be collected mid-flight; the done-callback drops the reference.
    publish_task = asyncio.create_task(publish_event(subject, envelope))
    _pending_publish_tasks.add(publish_task)
    publish_task.add_done_callback(_pending_publish_tasks.discard)

    return repo
def _captured_text(value: str | bytes | None) -> str:
    """Normalise captured subprocess output (str, bytes or None) to str."""
    if value is None:
        return ""
    if isinstance(value, bytes):
        return value.decode(errors="replace")
    return value


@router.post("/onboard", response_model=RepoOnboardResult)
async def onboard_repo(body: RepoOnboardRequest) -> RepoOnboardResult:
    """Run the local repo onboarding script for an accessible working copy.

    The dashboard uses this for the "Add Repo" action. The path must be visible
    from the State Hub host, either as a local checkout or through an ops-bridge
    mounted/exposed working copy. Keep the API agent-profile based so future
    native coding agents can gain their own profiles without changing callers.

    Returns:
        The command that was run, its captured stdout/stderr, and the repo
        slug parsed from a ``Repo slug: ...`` line in stdout (``None`` when no
        such line is present).

    Raises:
        HTTPException: 400 when ``project_path`` is not a directory or has no
            ``.git`` entry, 500 when the script exits non-zero, 504 when the
            script does not finish within the timeout.
    """
    project_path = Path(body.project_path).expanduser()
    if not project_path.exists() or not project_path.is_dir():
        raise HTTPException(
            status_code=400,
            detail=f"project_path is not an accessible directory: {body.project_path}",
        )
    if not (project_path / ".git").exists():
        raise HTTPException(
            status_code=400,
            detail=f"project_path does not look like a git working copy: {body.project_path}",
        )

    script = Path(__file__).parent.parent.parent / "scripts" / "register_project.sh"
    cmd = ["bash", str(script), body.domain_slug, str(project_path)]
    if body.agent_profile == "codex":
        cmd.append("--codex")
    if body.additional:
        cmd.append("--additional")

    env = {
        **os.environ,
        "API_BASE": settings.api_base,
        "CUSTODIAN_SKIP_SBOM_PROMPT": "true",
    }

    try:
        # subprocess.run blocks, so it is pushed to a worker thread.
        result = await asyncio.to_thread(
            subprocess.run,
            cmd,
            cwd=str(script.parent.parent),
            env=env,
            stdin=subprocess.DEVNULL,
            capture_output=True,
            text=True,
            timeout=180,
        )
    except subprocess.TimeoutExpired as exc:
        # Previously this escaped as an unhandled exception (opaque 500).
        # Report it in the same structured shape as a script failure.
        raise HTTPException(
            status_code=504,
            detail={
                "message": "Repo onboarding timed out.",
                "command": cmd,
                "stdout": _captured_text(exc.stdout),
                "stderr": _captured_text(exc.stderr),
            },
        ) from exc

    stdout = result.stdout or ""
    stderr = result.stderr or ""
    if result.returncode != 0:
        raise HTTPException(
            status_code=500,
            detail={
                "message": "Repo onboarding failed.",
                "command": cmd,
                "stdout": stdout,
                "stderr": stderr,
            },
        )

    match = re.search(r"Repo slug:\s+([a-z0-9][a-z0-9-]*)", stdout)
    repo_slug = match.group(1) if match else None

    return RepoOnboardResult(
        ok=True,
        repo_slug=repo_slug,
        agent_profile=body.agent_profile,
        command=cmd,
        stdout=stdout,
        stderr=stderr,
    )
@router.get("/by-fingerprint", response_model=list[RepoRead])
async def get_repo_by_fingerprint(
    hash: str,
    remote_url: str | None = None,
    session: AsyncSession = Depends(get_session),
) -> list[ManagedRepo]:
    """Find repos whose git root-commit SHA-1 fingerprint equals ``hash``.

    The fingerprint is what ``git rev-list --max-parents=0 HEAD`` prints; it is
    the same in every clone of a repository, and repos that share git history
    (forks, monorepo splits) share it too.

    Supply ``remote_url`` to restrict the matches to one remote when several
    repos descend from the same root commit.

    An empty list is returned when nothing matches.
    """
    conditions = [ManagedRepo.git_fingerprint == hash]
    if remote_url:
        conditions.append(ManagedRepo.remote_url == remote_url)

    rows = await session.execute(select(ManagedRepo).where(*conditions))
    return list(rows.scalars().all())
@router.get("/by-remote", response_model=RepoRead)
async def get_repo_by_remote_url(
    url: str,
    session: AsyncSession = Depends(get_session),
) -> ManagedRepo:
    """Fetch the repo registered with git remote ``url``.

    This is the fallback lookup; /by-fingerprint is preferred.

    Raises:
        HTTPException: 404 when no repo has that remote URL.
    """
    stmt = select(ManagedRepo).where(ManagedRepo.remote_url == url)
    found = (await session.execute(stmt)).scalar_one_or_none()
    if found is not None:
        return found
    raise HTTPException(status_code=404, detail=f"No repo with remote_url '{url}' found")
@router.get("/doi/summary", response_model=list[DoISummaryEntry])
async def doi_summary(session: AsyncSession = Depends(get_session)) -> list[DoISummaryEntry]:
    """Report the DoI tier of every active repo, worst tier first.

    Evaluations are persisted in doi_cache and reused for as long as the repo's
    fingerprint stays the same; a repo is re-evaluated only when the
    fingerprint changes (repo record updated, new TPSC snapshot, goal change,
    or a key file mtime changes on disk).

    Cache misses are evaluated with ``skip_consistency=True``, and all new or
    updated cache rows are committed once at the end.
    """
    repo_rows = await session.execute(
        select(ManagedRepo).where(ManagedRepo.status == "active").order_by(ManagedRepo.name)
    )
    active_repos = list(repo_rows.scalars().all())
    repo_ids = [repo.id for repo in active_repos]
    slug_by_id = {repo.id: repo.slug for repo in active_repos}

    # ── Bulk lookups feeding the fingerprint and the evaluator prefetch ──────
    domain_rows = await session.execute(select(Domain))
    domains = list(domain_rows.scalars().all())
    domain_map = {d.id: d.slug for d in domains}
    domain_status = {d.slug: d.status for d in domains}

    # Per-repo TPSC snapshot count and newest snapshot timestamp.
    tpsc_rows = await session.execute(
        select(
            TPSCSnapshot.repo_id,
            func.count().label("cnt"),
            func.max(TPSCSnapshot.snapshot_at).label("latest"),
        )
        .where(TPSCSnapshot.repo_id.in_(repo_ids))
        .group_by(TPSCSnapshot.repo_id)
    )
    tpsc_snap_counts: dict = {}
    tpsc_snap_latest: dict = {}
    for row in tpsc_rows:
        if row.repo_id in slug_by_id:
            repo_slug = slug_by_id[row.repo_id]
            tpsc_snap_counts[repo_slug] = row.cnt
            tpsc_snap_latest[repo_slug] = str(row.latest)

    # Per-repo active-goal count and newest goal update timestamp.
    goal_rows = await session.execute(
        select(
            RepoGoal.repo_id,
            func.count().label("total"),
            func.sum(case((RepoGoal.status == "active", 1), else_=0)).label("active_cnt"),
            func.max(RepoGoal.updated_at).label("latest"),
        )
        .where(RepoGoal.repo_id.in_(repo_ids))
        .group_by(RepoGoal.repo_id)
    )
    active_goal_counts: dict = {}
    goals_latest: dict = {}
    for row in goal_rows:
        if row.repo_id in slug_by_id:
            repo_slug = slug_by_id[row.repo_id]
            active_goal_counts[repo_slug] = int(row.active_cnt or 0)
            goals_latest[repo_slug] = str(row.latest)

    # Cache rows already stored for these repos, keyed by repo id.
    cache_rows = await session.execute(
        select(DOICache).where(DOICache.repo_id.in_(repo_ids))
    )
    cache_by_repo_id = {row.repo_id: row for row in cache_rows.scalars().all()}

    prefetch = {
        "domain_status": domain_status,
        "tpsc_snap_counts": tpsc_snap_counts,
        "active_goal_counts": active_goal_counts,
    }

    def _to_entry(repo: ManagedRepo, source, checked_at: str) -> DoISummaryEntry:
        # ``source`` is either a cache row or a fresh report; both expose the
        # tier and the three pass flags under the same attribute names.
        return DoISummaryEntry(
            repo_slug=repo.slug,
            domain_slug=domain_map.get(repo.domain_id),
            tier=source.tier,
            core_pass=source.core_pass,
            standard_pass=source.standard_pass,
            full_pass=source.full_pass,
            checked_at=checked_at,
        )

    async def _get_or_refresh(repo: ManagedRepo) -> DoISummaryEntry:
        repo_dict = _repo_doi_dict(repo, domain_map.get(repo.domain_id))
        fp = compute_fingerprint(
            repo_dict,
            tpsc_snap_latest.get(repo.slug),
            goals_latest.get(repo.slug),
        )

        cached = cache_by_repo_id.get(repo.id)
        if cached and cached.fingerprint == fp:
            # Cache hit: serve the stored result untouched.
            return _to_entry(repo, cached, cached.checked_at.isoformat())

        # Cache miss: evaluate, then update the existing row or stage a new one.
        report = await _doi_evaluate(repo_dict, skip_consistency=True, prefetch=prefetch)
        now = datetime.now(tz=timezone.utc)
        fresh_values = {
            "tier": report.tier,
            "core_pass": report.core_pass,
            "standard_pass": report.standard_pass,
            "full_pass": report.full_pass,
            "criteria": [
                {
                    "id": c.id,
                    "label": c.label,
                    "tier": c.tier,
                    "status": c.status,
                    "detail": c.detail,
                }
                for c in report.criteria
            ],
            "fingerprint": fp,
            "checked_at": now,
            "updated_at": now,
        }
        if cached:
            for attr_name, attr_value in fresh_values.items():
                setattr(cached, attr_name, attr_value)
        else:
            session.add(DOICache(repo_id=repo.id, **fresh_values))

        return _to_entry(repo, report, now.isoformat())

    gathered = await asyncio.gather(*(_get_or_refresh(repo) for repo in active_repos))
    await session.commit()

    tier_order = {"none": 0, "core": 1, "standard": 2, "full": 3}
    entries: list[DoISummaryEntry] = sorted(
        gathered, key=lambda entry: tier_order.get(entry.tier, 0)
    )
    return entries
@router.get("/{slug}/doi", response_model=DoIReport)
async def get_repo_doi(
    slug: str,
    force_refresh: bool = False,
    session: AsyncSession = Depends(get_session),
) -> DoIReport:
    """Evaluate the 14 DoI criteria for a single repo (full check including C7/C13).

    The result is cached per fingerprint; a stored result is served only when
    its fingerprint matches and it carries criteria. Pass ?force_refresh=true
    to skip the cache and re-evaluate.

    Raises:
        HTTPException: 404 when no repo has this slug.
    """
    repo = await _get_repo_by_slug(slug, session)

    domain_lookup = await session.execute(select(Domain).where(Domain.id == repo.domain_id))
    domain_obj = domain_lookup.scalar_one_or_none()
    domain_slug = domain_obj.slug if domain_obj else None

    # Fingerprint inputs: newest TPSC snapshot and newest goal update.
    tpsc_stmt = select(
        func.count().label("cnt"),
        func.max(TPSCSnapshot.snapshot_at).label("latest"),
    ).where(TPSCSnapshot.repo_id == repo.id)
    tpsc_row = (await session.execute(tpsc_stmt)).one()

    goal_stmt = select(func.max(RepoGoal.updated_at).label("latest")).where(
        RepoGoal.repo_id == repo.id
    )
    goal_row = (await session.execute(goal_stmt)).one()

    repo_dict = _repo_doi_dict(repo, domain_slug)
    tpsc_latest = str(tpsc_row.latest) if tpsc_row.latest else None
    goal_latest = str(goal_row.latest) if goal_row.latest else None
    fp = compute_fingerprint(repo_dict, tpsc_latest, goal_latest)

    cache_lookup = await session.execute(select(DOICache).where(DOICache.repo_id == repo.id))
    cached = cache_lookup.scalar_one_or_none()

    cache_usable = cached and cached.fingerprint == fp and cached.criteria
    if not force_refresh and cache_usable:
        return DoIReport(
            repo_slug=slug,
            tier=cached.tier,
            core_pass=cached.core_pass,
            standard_pass=cached.standard_pass,
            full_pass=cached.full_pass,
            checked_at=cached.checked_at.isoformat(),
            criteria=[DoICriterion(**stored) for stored in cached.criteria],
        )

    # Full evaluation (includes C7/C13 consistency subprocesses).
    report = await _doi_evaluate(repo_dict)
    now = datetime.now(tz=timezone.utc)
    fresh_values = {
        "tier": report.tier,
        "core_pass": report.core_pass,
        "standard_pass": report.standard_pass,
        "full_pass": report.full_pass,
        "criteria": [
            {
                "id": c.id,
                "label": c.label,
                "tier": c.tier,
                "status": c.status,
                "detail": c.detail,
            }
            for c in report.criteria
        ],
        "fingerprint": fp,
        "checked_at": now,
        "updated_at": now,
    }
    if cached:
        for attr_name, attr_value in fresh_values.items():
            setattr(cached, attr_name, attr_value)
    else:
        session.add(DOICache(repo_id=repo.id, **fresh_values))
    await session.commit()

    return DoIReport(
        repo_slug=report.repo_slug,
        tier=report.tier,
        core_pass=report.core_pass,
        standard_pass=report.standard_pass,
        full_pass=report.full_pass,
        checked_at=report.checked_at,
        criteria=[
            DoICriterion(
                id=c.id,
                label=c.label,
                tier=c.tier,
                status=c.status,
                detail=c.detail,
            )
            for c in report.criteria
        ],
    )
@router.get("/by-id/{repo_id}", response_model=RepoRead)
async def get_repo_by_id(
    repo_id: uuid.UUID,
    session: AsyncSession = Depends(get_session),
) -> ManagedRepo:
    """Fetch a repo by primary key.

    Raises:
        HTTPException: 404 when no repo has this id.
    """
    found = await session.get(ManagedRepo, repo_id)
    if found is not None:
        return found
    raise HTTPException(status_code=404, detail=f"Repo '{repo_id}' not found")
@router.get("/scope-health", response_model=list[RepoScopeHealth])
async def list_repo_scope_health(
    needs_review: bool | None = None,
    reachable_only: bool = False,
    session: AsyncSession = Depends(get_session),
) -> list[RepoScopeHealth]:
    """Return machine-readable SCOPE.md health for active repos.

    Repo-scoping relies on this to refresh only the repos and SCOPE.md sections
    that need attention, rather than guessing from free-text DoI output.

    Filters:
        needs_review: when not ``None``, keep only entries whose
            ``scope_needs_review`` flag equals this value.
        reachable_only: when true, drop entries whose path could not be
            resolved.
    """
    stmt = (
        select(ManagedRepo, Domain.slug)
        .join(Domain, Domain.id == ManagedRepo.domain_id)
        .where(ManagedRepo.status == "active")
        .order_by(ManagedRepo.slug)
    )
    rows = await session.execute(stmt)

    review_ids = {"C5a", "C5b", "C5c"}
    review_statuses = {"fail", "warn"}

    health: list[RepoScopeHealth] = []
    for repo, domain_slug in rows.all():
        repo_dict = _repo_doi_dict(repo, domain_slug)
        resolved_path = resolve_repo_path(repo_dict)
        issues = [ScopeIssueDetail(**raw) for raw in evaluate_scope_health(repo_dict)]

        # A repo needs review as soon as one C5a/C5b/C5c issue fails or warns.
        flagged = False
        for issue in issues:
            if issue.id in review_ids and issue.status in review_statuses:
                flagged = True
                break

        entry = RepoScopeHealth(
            repo_slug=repo.slug,
            domain_slug=domain_slug,
            local_path=resolved_path or repo.local_path,
            path_available=bool(resolved_path),
            scope_needs_review=flagged,
            scope_issue_details=issues,
        )

        review_matches = needs_review is None or entry.scope_needs_review == needs_review
        reachable_matches = entry.path_available or not reachable_only
        if review_matches and reachable_matches:
            health.append(entry)

    return health
@router.get("/{slug}", response_model=RepoRead)
async def get_repo(
    slug: str,
    session: AsyncSession = Depends(get_session),
) -> ManagedRepo:
    """Fetch one repo by slug; responds 404 when the slug is unknown."""
    repo = await _get_repo_by_slug(slug, session)
    return repo
@router.patch("/{slug}", response_model=RepoRead)
async def update_repo(
    slug: str,
    body: RepoUpdate,
    session: AsyncSession = Depends(get_session),
) -> ManagedRepo:
    """Apply a partial update to a repo.

    Only fields explicitly present in the request body are written; the
    refreshed row is returned. Responds 404 when the slug is unknown.
    """
    repo = await _get_repo_by_slug(slug, session)

    changes = body.model_dump(exclude_unset=True)
    for attr_name in changes:
        setattr(repo, attr_name, changes[attr_name])

    await session.commit()
    await session.refresh(repo)
    return repo
@router.post("/{slug}/paths", response_model=RepoRead)
async def register_host_path(
    slug: str,
    body: RepoPathRegister,
    session: AsyncSession = Depends(get_session),
) -> ManagedRepo:
    """Register or update the local path of a repo for one host.

    The ``{host: path}`` pair is merged into ``host_paths``; entries for other
    hosts are kept. Use this when the repo sits at a different absolute path on
    different machines. Responds 404 when the slug is unknown.
    """
    repo = await _get_repo_by_slug(slug, session)

    # Assign a new dict rather than mutating the stored one in place.
    repo.host_paths = {**(repo.host_paths or {}), body.host: body.path}

    await session.commit()
    await session.refresh(repo)
    return repo
@router.patch("/{slug}/archive", response_model=RepoRead)
async def archive_repo(
    slug: str,
    session: AsyncSession = Depends(get_session),
) -> ManagedRepo:
    """Set a repo's status to "archived" and return the refreshed row.

    Responds 404 when the slug is unknown.
    """
    target = await _get_repo_by_slug(slug, session)
    setattr(target, "status", "archived")
    await session.commit()
    await session.refresh(target)
    return target
@router.get("/{slug}/dispatch", response_model=RepoDispatch)
async def get_repo_dispatch(
    slug: str,
    session: AsyncSession = Depends(get_session),
) -> RepoDispatch:
    """Return active workstreams, pending tasks, and goal for a repo.

    This endpoint is the foundation for autonomous agent sessions: an agent can
    call it at session start to discover what work is pending without needing to
    read state-hub summary or scan workplan files manually.

    The response also carries the tasks flagged ``needs_human``, published
    interface changes that list this repo as affected, and SCOPE.md health.

    Raises:
        HTTPException: 404 when no repo has this slug.
    """
    repo = await _get_repo_by_slug(slug, session)

    # Active goal: first active goal when ordered by ascending priority.
    goal_result = await session.execute(
        select(RepoGoal)
        .where(RepoGoal.repo_id == repo.id, RepoGoal.status == "active")
        .order_by(RepoGoal.priority)
        .limit(1)
    )
    goal_obj = goal_result.scalar_one_or_none()
    active_goal = None
    if goal_obj:
        active_goal = {
            "id": str(goal_obj.id),
            "title": goal_obj.title,
            "description": goal_obj.description,
            "priority": goal_obj.priority,
        }

    # Active workstreams, oldest first.
    ws_result = await session.execute(
        select(Workstream)
        .where(Workstream.repo_id == repo.id, Workstream.status == "active")
        .order_by(Workstream.created_at)
    )
    workstreams = list(ws_result.scalars().all())

    # Pending tasks for all workstreams in a single query (previously one query
    # per workstream). Grouping in Python keeps the created_at order within
    # each workstream.
    pending_by_workstream: dict = {}
    if workstreams:
        task_result = await session.execute(
            select(Task)
            .where(
                Task.workstream_id.in_([ws.id for ws in workstreams]),
                Task.status.in_(["todo", "progress"]),
            )
            .order_by(Task.created_at)
        )
        for t in task_result.scalars().all():
            pending_by_workstream.setdefault(t.workstream_id, []).append(
                DispatchTask(
                    id=t.id,
                    title=t.title,
                    priority=t.priority,
                    status=t.status,
                    needs_human=t.needs_human,
                )
            )

    dispatch_workstreams: list[DispatchWorkstream] = []
    all_interventions: list[DispatchTask] = []
    for ws in workstreams:
        pending = pending_by_workstream.get(ws.id, [])
        all_interventions.extend(t for t in pending if t.needs_human)
        dispatch_workstreams.append(
            DispatchWorkstream(
                id=ws.id,
                title=ws.title,
                status=ws.status,
                pending_tasks=pending,
            )
        )

    # Published interface changes that affect this repo and are not yet resolved.
    ic_result = await session.execute(
        select(InterfaceChange).where(
            InterfaceChange.status == "published",
            InterfaceChange.affected_repo_slugs.contains([slug]),
        ).order_by(InterfaceChange.published_at.desc())
    )
    # NOTE(review): ``ic.repo`` is a relationship access; presumably the model
    # loads it eagerly — confirm, since implicit lazy loads are not available
    # under AsyncSession.
    pending_changes = [
        PendingInterfaceChange(
            id=ic.id,
            title=ic.title,
            change_type=ic.change_type,
            interface_type=ic.interface_type,
            origin_repo_slug=ic.repo.slug,
            affected_paths=ic.affected_paths or [],
            planned_for=ic.planned_for,
            published_at=ic.published_at,
        )
        for ic in ic_result.scalars().all()
    ]

    domain_result = await session.execute(select(Domain).where(Domain.id == repo.domain_id))
    domain_obj = domain_result.scalar_one_or_none()
    scope_issue_details = [
        ScopeIssueDetail(**issue)
        for issue in evaluate_scope_health(
            _repo_doi_dict(repo, domain_obj.slug if domain_obj else None)
        )
    ]
    # Review is needed as soon as one C5a/C5b/C5c issue fails or warns.
    scope_needs_review = any(
        issue.id in {"C5a", "C5b", "C5c"} and issue.status in {"fail", "warn"}
        for issue in scope_issue_details
    )

    return RepoDispatch(
        repo_slug=slug,
        active_goal=active_goal,
        active_workstreams=dispatch_workstreams,
        human_interventions=all_interventions,
        pending_interface_changes=pending_changes,
        scope_needs_review=scope_needs_review,
        scope_issue_details=scope_issue_details,
        last_state_synced_at=repo.last_state_synced_at,
    )
# Upper bound for one consistency_check.py run. Deliberately generous: it only
# exists so a hung script cannot hold a worker thread forever.
_SYNC_TIMEOUT_SECONDS = 300


@router.post("/{slug}/sync")
async def sync_repo_consistency(
    slug: str,
    fix: bool = True,
    session: AsyncSession = Depends(get_session),
) -> dict:
    """Run ADR-001 consistency check (and optional --fix) for a repo via HTTP.

    Intended for non-Claude-Code agents (e.g. Codex) that cannot use MCP tools
    but need to sync workplan file state to the state-hub DB after making changes.

    Returns the raw JSON output from consistency_check.py.
    Query param ?fix=false to run check-only without writing.

    Raises:
        HTTPException: 404 when the slug is unknown, 503 when no existing path
            is registered for this host, 504 when the script exceeds the
            timeout, 500 when its stdout is not valid JSON.
    """
    repo = await _get_repo_by_slug(slug, session)

    # Only the path registered for this machine's hostname is considered.
    hostname = socket.gethostname()
    host_paths = repo.host_paths or {}
    repo_path = host_paths.get(hostname)
    if not repo_path or not Path(repo_path).exists():
        raise HTTPException(
            status_code=503,
            detail=(
                f"No accessible path for repo '{slug}' on host '{hostname}'. "
                f"Register with: POST /repos/{slug}/paths/"
            ),
        )

    script = Path(__file__).parent.parent.parent / "scripts" / "consistency_check.py"
    cmd = [sys.executable, str(script), "--repo", slug, "--json",
           "--api-base", settings.api_base]
    if fix:
        cmd.append("--fix")

    try:
        # subprocess.run blocks, so it is pushed to a worker thread.
        result = await asyncio.to_thread(
            subprocess.run,
            cmd,
            capture_output=True,
            text=True,
            timeout=_SYNC_TIMEOUT_SECONDS,
        )
    except subprocess.TimeoutExpired as exc:
        raise HTTPException(
            status_code=504,
            detail=f"Consistency check timed out after {_SYNC_TIMEOUT_SECONDS}s",
        ) from exc

    try:
        return json.loads(result.stdout)
    except json.JSONDecodeError as exc:
        # Non-JSON output means the script itself failed; surface what it said.
        raise HTTPException(
            status_code=500,
            detail=f"Consistency check failed: {result.stderr or result.stdout or '(no output)'}",
        ) from exc
async def _get_repo_by_slug(slug: str, session: AsyncSession) -> ManagedRepo:
    """Load the repo with the given slug, or raise a 404 HTTPException."""
    stmt = select(ManagedRepo).where(ManagedRepo.slug == slug)
    found = (await session.execute(stmt)).scalar_one_or_none()
    if found is not None:
        return found
    raise HTTPException(status_code=404, detail=f"Repo '{slug}' not found")
def _repo_doi_dict(repo: ManagedRepo, domain_slug: str | None) -> dict:
    """Build the plain-dict view of a repo that the DoI engine functions take.

    Timestamps are stringified; a missing ``last_sbom_at`` becomes ``None``, a
    missing ``updated_at`` becomes ``""``, and a missing ``host_paths`` becomes
    an empty dict.
    """
    sbom_stamp = str(repo.last_sbom_at) if repo.last_sbom_at else None
    updated_stamp = str(repo.updated_at) if repo.updated_at else ""
    return dict(
        slug=repo.slug,
        domain_slug=domain_slug,
        local_path=repo.local_path,
        remote_url=repo.remote_url,
        host_paths=repo.host_paths or {},
        last_sbom_at=sbom_stamp,
        updated_at=updated_stamp,
    )