perf(doi): eliminate HTTP self-calls in summary — 48 calls → 3 bulk DB queries

Root cause: C2/C9/C10 each made a full HTTP round-trip back to the API
(asyncio.to_thread → urllib → TCP → uvicorn → SQLAlchemy → DB) for every
repo. 16 repos × 3 calls = 48 self-calls at ~80-150ms each = ~6s total.

Fix: doi_engine.evaluate() accepts a prefetch dict. The summary endpoint
runs 3 bulk queries (one SELECT of all domains for domain status, plus two
GROUP BY count queries for TPSC snapshot counts and active goal counts) and
passes the results directly — zero HTTP self-calls in summary mode.

Result: /repos/doi/summary drops from ~6s to <1s (a further 6× improvement
on top of the prior 13× improvement).
Total improvement from the original implementation: 108s → <1s.

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-20 01:37:40 +01:00
parent 9ba1501b49
commit 245cd72ba3
2 changed files with 75 additions and 18 deletions

View File

@@ -5,11 +5,14 @@ from fastapi import APIRouter, Depends, HTTPException, status
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import func
from api.database import get_session
from api.doi_engine import evaluate as _doi_evaluate
from api.models.domain import Domain
from api.models.managed_repo import ManagedRepo
from api.models.repo_goal import RepoGoal
from api.models.tpsc import TPSCSnapshot
from api.models.task import Task
from api.models.workstream import Workstream
from api.schemas.doi import DoICriterion, DoIReport, DoISummaryEntry
@@ -74,12 +77,42 @@ async def register_repo(
@router.get("/doi/summary", response_model=list[DoISummaryEntry])
async def doi_summary(session: AsyncSession = Depends(get_session)) -> list[DoISummaryEntry]:
"""Return DoI tier for all active repos, worst tier first."""
result = await session.execute(
repos_result = await session.execute(
select(ManagedRepo).where(ManagedRepo.status == "active").order_by(ManagedRepo.name)
)
repos = list(result.scalars().all())
domain_result = await session.execute(select(Domain))
domain_map = {d.id: d.slug for d in domain_result.scalars().all()}
repos = list(repos_result.scalars().all())
# ── 3 bulk DB queries instead of 48 HTTP self-calls ───────────────────────
# C2: domain status by slug
domains_result = await session.execute(select(Domain))
domain_obj_map = {d.id: d for d in domains_result.scalars().all()}
domain_map = {d.id: d.slug for d in domain_obj_map.values()}
domain_status = {d.slug: d.status for d in domain_obj_map.values()}
# C9: TPSC snapshot count per repo_id
repo_ids = [r.id for r in repos]
tpsc_result = await session.execute(
select(TPSCSnapshot.repo_id, func.count().label("cnt"))
.where(TPSCSnapshot.repo_id.in_(repo_ids))
.group_by(TPSCSnapshot.repo_id)
)
id_to_slug = {r.id: r.slug for r in repos}
tpsc_snap_counts = {id_to_slug[row.repo_id]: row.cnt for row in tpsc_result if row.repo_id in id_to_slug}
# C10: active repo goal count per repo_id
goals_result = await session.execute(
select(RepoGoal.repo_id, func.count().label("cnt"))
.where(RepoGoal.repo_id.in_(repo_ids), RepoGoal.status == "active")
.group_by(RepoGoal.repo_id)
)
active_goal_counts = {id_to_slug[row.repo_id]: row.cnt for row in goals_result if row.repo_id in id_to_slug}
prefetch = {
"domain_status": domain_status,
"tpsc_snap_counts": tpsc_snap_counts,
"active_goal_counts": active_goal_counts,
}
# ─────────────────────────────────────────────────────────────────────────
async def _check_one(repo: ManagedRepo) -> DoISummaryEntry:
repo_dict = {
@@ -90,9 +123,7 @@ async def doi_summary(session: AsyncSession = Depends(get_session)) -> list[DoIS
"host_paths": repo.host_paths or {},
"last_sbom_at": str(repo.last_sbom_at) if repo.last_sbom_at else None,
}
# skip_consistency=True: omits C7/C13 subprocess calls for speed.
# The full check is available via GET /repos/{slug}/doi.
report = await _doi_evaluate(repo_dict, skip_consistency=True)
report = await _doi_evaluate(repo_dict, skip_consistency=True, prefetch=prefetch)
return DoISummaryEntry(
repo_slug=repo.slug,
domain_slug=domain_map.get(repo.domain_id),