state-hub/api/routers/repos.py

import asyncio
import uuid
from datetime import datetime, timezone

from fastapi import APIRouter, Depends, HTTPException, status
from sqlalchemy import case, func, select
from sqlalchemy.ext.asyncio import AsyncSession

from api.database import get_session
from api.doi_engine import compute_fingerprint, evaluate as _doi_evaluate
from api.models.doi_cache import DOICache
from api.models.domain import Domain
from api.models.managed_repo import ManagedRepo
from api.models.repo_goal import RepoGoal
from api.models.tpsc import TPSCSnapshot
from api.models.task import Task
from api.models.workstream import Workstream
from api.schemas.doi import DoICriterion, DoIReport, DoISummaryEntry
from api.schemas.managed_repo import (
    DispatchTask,
    DispatchWorkstream,
    RepoCreate,
    RepoDispatch,
    RepoPathRegister,
    RepoRead,
    RepoUpdate,
)

router = APIRouter(prefix="/repos", tags=["repos"])


@router.get("/", response_model=list[RepoRead])
async def list_repos(
    domain: str | None = None,
    session: AsyncSession = Depends(get_session),
) -> list[ManagedRepo]:
    q = select(ManagedRepo).order_by(ManagedRepo.name)
    if domain:
        domain_row = await session.execute(select(Domain).where(Domain.slug == domain))
        domain_obj = domain_row.scalar_one_or_none()
        if domain_obj is None:
            raise HTTPException(status_code=404, detail=f"Domain '{domain}' not found")
        q = q.where(ManagedRepo.domain_id == domain_obj.id)
    result = await session.execute(q)
    return list(result.scalars().all())


@router.post("/", response_model=RepoRead, status_code=status.HTTP_201_CREATED)
async def register_repo(
    body: RepoCreate,
    session: AsyncSession = Depends(get_session),
) -> ManagedRepo:
    domain_row = await session.execute(select(Domain).where(Domain.slug == body.domain_slug))
    domain_obj = domain_row.scalar_one_or_none()
    if domain_obj is None:
        raise HTTPException(status_code=404, detail=f"Domain '{body.domain_slug}' not found")

    existing = await session.execute(select(ManagedRepo).where(ManagedRepo.slug == body.slug))
    if existing.scalar_one_or_none():
        raise HTTPException(status_code=409, detail=f"Repo slug '{body.slug}' already exists")

    repo = ManagedRepo(
        domain_id=domain_obj.id,
        slug=body.slug,
        name=body.name,
        local_path=body.local_path,
        remote_url=body.remote_url,
        git_fingerprint=body.git_fingerprint,
        description=body.description,
        topic_id=body.topic_id,
    )
    session.add(repo)
    await session.commit()
    await session.refresh(repo)
    return repo


@router.get("/by-fingerprint", response_model=list[RepoRead])
async def get_repo_by_fingerprint(
    hash: str,
    remote_url: str | None = None,
    session: AsyncSession = Depends(get_session),
) -> list[ManagedRepo]:
    """Look up repos by git root-commit SHA-1 fingerprint.

    The fingerprint is the output of ``git rev-list --max-parents=0 HEAD`` and
    is identical across every clone of the same repository.  Repos that share
    git history (forks, monorepo splits) will have the same fingerprint.

    Pass ``remote_url`` to narrow results to a specific remote — useful when
    multiple repos share the same ancestor commit.

    Returns an empty list if no match is found.
    """
    q = select(ManagedRepo).where(ManagedRepo.git_fingerprint == hash)
    if remote_url:
        q = q.where(ManagedRepo.remote_url == remote_url)
    result = await session.execute(q)
    return list(result.scalars().all())


@router.get("/by-remote", response_model=RepoRead)
async def get_repo_by_remote_url(
    url: str,
    session: AsyncSession = Depends(get_session),
) -> ManagedRepo:
    """Look up a repo by its git remote URL (fallback; prefer /by-fingerprint)."""
    result = await session.execute(select(ManagedRepo).where(ManagedRepo.remote_url == url))
    repo = result.scalar_one_or_none()
    if repo is None:
        raise HTTPException(status_code=404, detail=f"No repo with remote_url '{url}' found")
    return repo


@router.get("/doi/summary", response_model=list[DoISummaryEntry])
async def doi_summary(session: AsyncSession = Depends(get_session)) -> list[DoISummaryEntry]:
    """Return DoI tier for all active repos, worst tier first.

    Results are cached in doi_cache. A repo is only re-evaluated when its
    fingerprint changes (repo record updated, new TPSC snapshot, goal change,
    or a key file mtime changes on disk).
    """
    repos_result = await session.execute(
        select(ManagedRepo).where(ManagedRepo.status == "active").order_by(ManagedRepo.name)
    )
    repos = list(repos_result.scalars().all())
    repo_ids = [r.id for r in repos]
    id_to_slug = {r.id: r.slug for r in repos}

    # ── Bulk DB queries for fingerprint inputs ────────────────────────────────
    domains_result = await session.execute(select(Domain))
    domain_obj_map = {d.id: d for d in domains_result.scalars().all()}
    domain_map = {d.id: d.slug for d in domain_obj_map.values()}
    domain_status = {d.slug: d.status for d in domain_obj_map.values()}

    # Latest TPSC snapshot timestamp per repo (for fingerprint + C9 count)
    tpsc_result = await session.execute(
        select(TPSCSnapshot.repo_id,
               func.count().label("cnt"),
               func.max(TPSCSnapshot.snapshot_at).label("latest"))
        .where(TPSCSnapshot.repo_id.in_(repo_ids))
        .group_by(TPSCSnapshot.repo_id)
    )
    tpsc_by_id = {row.repo_id: row for row in tpsc_result}
    tpsc_snap_counts  = {id_to_slug[rid]: row.cnt    for rid, row in tpsc_by_id.items() if rid in id_to_slug}
    tpsc_snap_latest  = {id_to_slug[rid]: str(row.latest) for rid, row in tpsc_by_id.items() if rid in id_to_slug}

    # Latest goal updated_at + active count per repo (for fingerprint + C10)
    goals_result = await session.execute(
        select(RepoGoal.repo_id,
               func.count().label("total"),
               func.sum(case((RepoGoal.status == "active", 1), else_=0)).label("active_cnt"),
               func.max(RepoGoal.updated_at).label("latest"))
        .where(RepoGoal.repo_id.in_(repo_ids))
        .group_by(RepoGoal.repo_id)
    )
    goals_by_id = {row.repo_id: row for row in goals_result}
    active_goal_counts = {id_to_slug[rid]: int(row.active_cnt or 0) for rid, row in goals_by_id.items() if rid in id_to_slug}
    goals_latest       = {id_to_slug[rid]: str(row.latest) for rid, row in goals_by_id.items() if rid in id_to_slug}

    # Load existing cache rows
    cache_result = await session.execute(
        select(DOICache).where(DOICache.repo_id.in_(repo_ids))
    )
    cache_by_repo_id = {row.repo_id: row for row in cache_result.scalars().all()}
    # ─────────────────────────────────────────────────────────────────────────

    prefetch = {
        "domain_status":      domain_status,
        "tpsc_snap_counts":   tpsc_snap_counts,
        "active_goal_counts": active_goal_counts,
    }

    async def _get_or_refresh(repo: ManagedRepo) -> DoISummaryEntry:
        slug = repo.slug
        repo_dict = {
            "slug": slug,
            "domain_slug": domain_map.get(repo.domain_id),
            "local_path": repo.local_path,
            "remote_url": repo.remote_url,
            "host_paths": repo.host_paths or {},
            "last_sbom_at": str(repo.last_sbom_at) if repo.last_sbom_at else None,
            "updated_at": str(repo.updated_at) if repo.updated_at else "",
        }
        fp = compute_fingerprint(
            repo_dict,
            tpsc_snap_latest.get(slug),
            goals_latest.get(slug),
        )

        cached = cache_by_repo_id.get(repo.id)
        if cached and cached.fingerprint == fp:
            # Cache hit — return stored result
            return DoISummaryEntry(
                repo_slug=slug,
                domain_slug=domain_map.get(repo.domain_id),
                tier=cached.tier,
                core_pass=cached.core_pass,
                standard_pass=cached.standard_pass,
                full_pass=cached.full_pass,
                checked_at=cached.checked_at.isoformat(),
            )

        # Cache miss — evaluate and store
        report = await _doi_evaluate(repo_dict, skip_consistency=True, prefetch=prefetch)
        now = datetime.now(tz=timezone.utc)
        if cached:
            cached.tier          = report.tier
            cached.core_pass     = report.core_pass
            cached.standard_pass = report.standard_pass
            cached.full_pass     = report.full_pass
            cached.criteria      = [{"id": c.id, "label": c.label, "tier": c.tier,
                                      "status": c.status, "detail": c.detail}
                                     for c in report.criteria]
            cached.fingerprint   = fp
            cached.checked_at    = now
            cached.updated_at    = now
        else:
            session.add(DOICache(
                repo_id=repo.id,
                tier=report.tier,
                core_pass=report.core_pass,
                standard_pass=report.standard_pass,
                full_pass=report.full_pass,
                criteria=[{"id": c.id, "label": c.label, "tier": c.tier,
                           "status": c.status, "detail": c.detail}
                          for c in report.criteria],
                fingerprint=fp,
                checked_at=now,
                updated_at=now,
            ))

        return DoISummaryEntry(
            repo_slug=slug,
            domain_slug=domain_map.get(repo.domain_id),
            tier=report.tier,
            core_pass=report.core_pass,
            standard_pass=report.standard_pass,
            full_pass=report.full_pass,
            checked_at=now.isoformat(),
        )

    entries: list[DoISummaryEntry] = list(await asyncio.gather(*[_get_or_refresh(r) for r in repos]))
    await session.commit()

    tier_order = {"none": 0, "core": 1, "standard": 2, "full": 3}
    entries.sort(key=lambda e: tier_order.get(e.tier, 0))
    return entries


@router.get("/{slug}/doi", response_model=DoIReport)
async def get_repo_doi(
    slug: str,
    force_refresh: bool = False,
    session: AsyncSession = Depends(get_session),
) -> DoIReport:
    """Evaluate the 14 DoI criteria for a single repo (full check including C7/C13).

    Results are cached by fingerprint. Pass ?force_refresh=true to bypass the cache.
    """
    repo = await _get_repo_by_slug(slug, session)
    domain_result = await session.execute(select(Domain).where(Domain.id == repo.domain_id))
    domain_obj = domain_result.scalar_one_or_none()

    # Fingerprint inputs for this single repo
    tpsc_row = (await session.execute(
        select(func.count().label("cnt"), func.max(TPSCSnapshot.snapshot_at).label("latest"))
        .where(TPSCSnapshot.repo_id == repo.id)
    )).one()
    goal_row = (await session.execute(
        select(func.max(RepoGoal.updated_at).label("latest"))
        .where(RepoGoal.repo_id == repo.id)
    )).one()

    repo_dict = {
        "slug": repo.slug,
        "domain_slug": domain_obj.slug if domain_obj else None,
        "local_path": repo.local_path,
        "remote_url": repo.remote_url,
        "host_paths": repo.host_paths or {},
        "last_sbom_at": str(repo.last_sbom_at) if repo.last_sbom_at else None,
        "updated_at": str(repo.updated_at) if repo.updated_at else "",
    }
    fp = compute_fingerprint(repo_dict, str(tpsc_row.latest) if tpsc_row.latest else None,
                             str(goal_row.latest) if goal_row.latest else None)

    # Check cache (unless force_refresh)
    cached = (await session.execute(
        select(DOICache).where(DOICache.repo_id == repo.id)
    )).scalar_one_or_none()

    if not force_refresh and cached and cached.fingerprint == fp and cached.criteria:
        return DoIReport(
            repo_slug=slug,
            tier=cached.tier,
            core_pass=cached.core_pass,
            standard_pass=cached.standard_pass,
            full_pass=cached.full_pass,
            checked_at=cached.checked_at.isoformat(),
            criteria=[DoICriterion(**c) for c in cached.criteria],
        )

    # Full evaluation (includes C7/C13 consistency subprocesses)
    report = await _doi_evaluate(repo_dict)
    now = datetime.now(tz=timezone.utc)
    criteria_json = [{"id": c.id, "label": c.label, "tier": c.tier,
                      "status": c.status, "detail": c.detail} for c in report.criteria]
    if cached:
        cached.tier = report.tier; cached.core_pass = report.core_pass
        cached.standard_pass = report.standard_pass; cached.full_pass = report.full_pass
        cached.criteria = criteria_json; cached.fingerprint = fp
        cached.checked_at = now; cached.updated_at = now
    else:
        session.add(DOICache(repo_id=repo.id, tier=report.tier,
            core_pass=report.core_pass, standard_pass=report.standard_pass,
            full_pass=report.full_pass, criteria=criteria_json,
            fingerprint=fp, checked_at=now, updated_at=now))
    await session.commit()

    return DoIReport(
        repo_slug=report.repo_slug, tier=report.tier,
        core_pass=report.core_pass, standard_pass=report.standard_pass,
        full_pass=report.full_pass, checked_at=report.checked_at,
        criteria=[DoICriterion(id=c.id, label=c.label, tier=c.tier,
                               status=c.status, detail=c.detail) for c in report.criteria],
    )


@router.get("/by-id/{repo_id}", response_model=RepoRead)
async def get_repo_by_id(
    repo_id: uuid.UUID,
    session: AsyncSession = Depends(get_session),
) -> ManagedRepo:
    repo = await session.get(ManagedRepo, repo_id)
    if repo is None:
        raise HTTPException(status_code=404, detail=f"Repo '{repo_id}' not found")
    return repo


@router.get("/{slug}/", response_model=RepoRead)
async def get_repo(
    slug: str,
    session: AsyncSession = Depends(get_session),
) -> ManagedRepo:
    return await _get_repo_by_slug(slug, session)


@router.patch("/{slug}/", response_model=RepoRead)
async def update_repo(
    slug: str,
    body: RepoUpdate,
    session: AsyncSession = Depends(get_session),
) -> ManagedRepo:
    repo = await _get_repo_by_slug(slug, session)
    for field, value in body.model_dump(exclude_unset=True).items():
        setattr(repo, field, value)
    await session.commit()
    await session.refresh(repo)
    return repo


@router.post("/{slug}/paths/", response_model=RepoRead)
async def register_host_path(
    slug: str,
    body: RepoPathRegister,
    session: AsyncSession = Depends(get_session),
) -> ManagedRepo:
    """Register or update the local path for a specific host.

    Merges {"host": path} into host_paths without overwriting other entries.
    Use this when a repo lives at a different absolute path on different machines.
    """
    repo = await _get_repo_by_slug(slug, session)
    updated = dict(repo.host_paths or {})
    updated[body.host] = body.path
    repo.host_paths = updated
    await session.commit()
    await session.refresh(repo)
    return repo


@router.patch("/{slug}/archive", response_model=RepoRead)
async def archive_repo(
    slug: str,
    session: AsyncSession = Depends(get_session),
) -> ManagedRepo:
    repo = await _get_repo_by_slug(slug, session)
    repo.status = "archived"
    await session.commit()
    await session.refresh(repo)
    return repo


@router.get("/{slug}/dispatch", response_model=RepoDispatch)
async def get_repo_dispatch(
    slug: str,
    session: AsyncSession = Depends(get_session),
) -> RepoDispatch:
    """Return active workstreams, pending tasks, and goal for a repo.

    This endpoint is the foundation for autonomous agent sessions: an agent can
    call it at session start to discover what work is pending without needing to
    read state-hub summary or scan workplan files manually.
    """
    repo = await _get_repo_by_slug(slug, session)

    # Active goal
    goal_result = await session.execute(
        select(RepoGoal)
        .where(RepoGoal.repo_id == repo.id, RepoGoal.status == "active")
        .order_by(RepoGoal.priority)
        .limit(1)
    )
    goal_obj = goal_result.scalar_one_or_none()
    active_goal = None
    if goal_obj:
        active_goal = {
            "id": str(goal_obj.id),
            "title": goal_obj.title,
            "description": goal_obj.description,
            "priority": goal_obj.priority,
        }

    # Active workstreams
    ws_result = await session.execute(
        select(Workstream)
        .where(Workstream.repo_id == repo.id, Workstream.status == "active")
        .order_by(Workstream.created_at)
    )
    workstreams = list(ws_result.scalars().all())

    dispatch_workstreams: list[DispatchWorkstream] = []
    all_interventions: list[DispatchTask] = []

    for ws in workstreams:
        task_result = await session.execute(
            select(Task)
            .where(Task.workstream_id == ws.id, Task.status.in_(["todo", "in_progress"]))
            .order_by(Task.created_at)
        )
        tasks = list(task_result.scalars().all())

        pending = [
            DispatchTask(
                id=t.id,
                title=t.title,
                priority=t.priority,
                status=t.status,
                needs_human=t.needs_human,
            )
            for t in tasks
        ]
        interventions = [t for t in pending if t.needs_human]
        all_interventions.extend(interventions)

        dispatch_workstreams.append(
            DispatchWorkstream(
                id=ws.id,
                title=ws.title,
                status=ws.status,
                pending_tasks=pending,
            )
        )

    return RepoDispatch(
        repo_slug=slug,
        active_goal=active_goal,
        active_workstreams=dispatch_workstreams,
        human_interventions=all_interventions,
        last_state_synced_at=repo.last_state_synced_at,
    )


async def _get_repo_by_slug(slug: str, session: AsyncSession) -> ManagedRepo:
    result = await session.execute(select(ManagedRepo).where(ManagedRepo.slug == slug))
    repo = result.scalar_one_or_none()
    if repo is None:
        raise HTTPException(status_code=404, detail=f"Repo '{slug}' not found")
    return repo