perf(doi): eliminate HTTP self-calls in summary — 48 calls → 3 bulk DB queries

Root cause: C2/C9/C10 each made a full HTTP round-trip back to the API
(asyncio.to_thread → urllib → TCP → uvicorn → SQLAlchemy → DB) for every
repo. 16 repos × 3 calls = 48 self-calls at ~80-150ms each = ~6s total.

Fix: doi_engine.evaluate() accepts a prefetch dict. The summary endpoint
runs 3 bulk GROUP BY queries (domain status, TPSC snapshot counts, active
goal counts) and passes results directly — zero HTTP self-calls in summary
mode.

Result: /repos/doi/summary 6s → <1s (6× improvement on top of prior 13×).
Total improvement from original: 108s → <1s.

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-20 01:37:40 +01:00
parent 9ba1501b49
commit 245cd72ba3
2 changed files with 75 additions and 18 deletions

View File

@@ -114,7 +114,21 @@ async def evaluate(
repo: dict, repo: dict,
api_base: str = "http://127.0.0.1:8000", api_base: str = "http://127.0.0.1:8000",
skip_consistency: bool = False, skip_consistency: bool = False,
prefetch: dict | None = None,
) -> DoIReport: ) -> DoIReport:
"""Evaluate all 14 DoI criteria for a repo.
Args:
repo: Repo dict (slug, domain_slug, local_path, remote_url, host_paths, last_sbom_at).
api_base: API base URL — only used when prefetch is absent.
skip_consistency: Skip C7/C13 subprocess calls (used in summary mode).
prefetch: Optional pre-fetched bulk data to avoid HTTP self-calls:
{
"domain_status": {"custodian": "active", ...}, # slug → status
"tpsc_snap_counts": {"llm-connect": 1, ...}, # repo_slug → count
"active_goal_counts": {"llm-connect": 0, ...}, # repo_slug → count
}
"""
slug = repo.get("slug", "unknown") slug = repo.get("slug", "unknown")
results: list[CriterionResult] = [] results: list[CriterionResult] = []
@@ -133,11 +147,15 @@ async def evaluate(
if not domain_slug: if not domain_slug:
_r("C2", "Domain assigned", "core", "fail", "No domain_slug on repo record") _r("C2", "Domain assigned", "core", "fail", "No domain_slug on repo record")
else: else:
domain = await _get(api_base, f"/domains/{domain_slug}/") if prefetch and "domain_status" in prefetch:
if domain and domain.get("status") == "active": dom_status = prefetch["domain_status"].get(domain_slug)
else:
d = await _get(api_base, f"/domains/{domain_slug}/")
dom_status = d.get("status") if d else None
if dom_status == "active":
_r("C2", "Domain assigned", "core", "pass", f"domain: {domain_slug}") _r("C2", "Domain assigned", "core", "pass", f"domain: {domain_slug}")
elif domain: elif dom_status:
_r("C2", "Domain assigned", "core", "warn", f"Domain '{domain_slug}' status: {domain.get('status')}") _r("C2", "Domain assigned", "core", "warn", f"Domain '{domain_slug}' status: {dom_status}")
else: else:
_r("C2", "Domain assigned", "core", "fail", f"Domain '{domain_slug}' not found") _r("C2", "Domain assigned", "core", "fail", f"Domain '{domain_slug}' not found")
@@ -196,12 +214,17 @@ async def evaluate(
# C9: TPSC declared (tpsc.yaml present + snapshot exists) # C9: TPSC declared (tpsc.yaml present + snapshot exists)
tpsc_file_ok = repo_path and (Path(repo_path) / "tpsc.yaml").exists() tpsc_file_ok = repo_path and (Path(repo_path) / "tpsc.yaml").exists()
tpsc_snaps = await _get(api_base, "/tpsc/snapshots/", {"repo_slug": slug}) or [] if prefetch and "tpsc_snap_counts" in prefetch:
has_snap = len(tpsc_snaps) > 0 has_snap = (prefetch["tpsc_snap_counts"].get(slug, 0) > 0)
snap_count = prefetch["tpsc_snap_counts"].get(slug, 0)
else:
tpsc_snaps = await _get(api_base, "/tpsc/snapshots/", {"repo_slug": slug}) or []
has_snap = len(tpsc_snaps) > 0
snap_count = len(tpsc_snaps)
if not repo_path: if not repo_path:
_r("C9", "TPSC declared", "standard", "skip", "Local path unavailable") _r("C9", "TPSC declared", "standard", "skip", "Local path unavailable")
elif tpsc_file_ok and has_snap: elif tpsc_file_ok and has_snap:
_r("C9", "TPSC declared", "standard", "pass", f"{len(tpsc_snaps)} snapshot(s)") _r("C9", "TPSC declared", "standard", "pass", f"{snap_count} snapshot(s)")
elif tpsc_file_ok and not has_snap: elif tpsc_file_ok and not has_snap:
_r("C9", "TPSC declared", "standard", "warn", "tpsc.yaml exists but not yet ingested — run make ingest-tpsc") _r("C9", "TPSC declared", "standard", "warn", "tpsc.yaml exists but not yet ingested — run make ingest-tpsc")
elif not tpsc_file_ok: elif not tpsc_file_ok:
@@ -210,10 +233,13 @@ async def evaluate(
# ── Tier 3: Full ───────────────────────────────────────────────────────── # ── Tier 3: Full ─────────────────────────────────────────────────────────
# C10: active repo goal # C10: active repo goal
goals = await _get(api_base, "/repo-goals/", {"repo_slug": slug}) or [] if prefetch and "active_goal_counts" in prefetch:
active_goals = [g for g in goals if g.get("status") == "active"] active_goal_count = prefetch["active_goal_counts"].get(slug, 0)
if active_goals: else:
_r("C10", "Active repo goal", "full", "pass", f"{len(active_goals)} active goal(s)") goals = await _get(api_base, "/repo-goals/", {"repo_slug": slug}) or []
active_goal_count = sum(1 for g in goals if g.get("status") == "active")
if active_goal_count > 0:
_r("C10", "Active repo goal", "full", "pass", f"{active_goal_count} active goal(s)")
else: else:
_r("C10", "Active repo goal", "full", "fail", "No active repo goal — create one with create_repo_goal()") _r("C10", "Active repo goal", "full", "fail", "No active repo goal — create one with create_repo_goal()")

View File

@@ -5,11 +5,14 @@ from fastapi import APIRouter, Depends, HTTPException, status
from sqlalchemy import select from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy import func
from api.database import get_session from api.database import get_session
from api.doi_engine import evaluate as _doi_evaluate from api.doi_engine import evaluate as _doi_evaluate
from api.models.domain import Domain from api.models.domain import Domain
from api.models.managed_repo import ManagedRepo from api.models.managed_repo import ManagedRepo
from api.models.repo_goal import RepoGoal from api.models.repo_goal import RepoGoal
from api.models.tpsc import TPSCSnapshot
from api.models.task import Task from api.models.task import Task
from api.models.workstream import Workstream from api.models.workstream import Workstream
from api.schemas.doi import DoICriterion, DoIReport, DoISummaryEntry from api.schemas.doi import DoICriterion, DoIReport, DoISummaryEntry
@@ -74,12 +77,42 @@ async def register_repo(
@router.get("/doi/summary", response_model=list[DoISummaryEntry]) @router.get("/doi/summary", response_model=list[DoISummaryEntry])
async def doi_summary(session: AsyncSession = Depends(get_session)) -> list[DoISummaryEntry]: async def doi_summary(session: AsyncSession = Depends(get_session)) -> list[DoISummaryEntry]:
"""Return DoI tier for all active repos, worst tier first.""" """Return DoI tier for all active repos, worst tier first."""
result = await session.execute( repos_result = await session.execute(
select(ManagedRepo).where(ManagedRepo.status == "active").order_by(ManagedRepo.name) select(ManagedRepo).where(ManagedRepo.status == "active").order_by(ManagedRepo.name)
) )
repos = list(result.scalars().all()) repos = list(repos_result.scalars().all())
domain_result = await session.execute(select(Domain))
domain_map = {d.id: d.slug for d in domain_result.scalars().all()} # ── 3 bulk DB queries instead of 48 HTTP self-calls ───────────────────────
# C2: domain status by slug
domains_result = await session.execute(select(Domain))
domain_obj_map = {d.id: d for d in domains_result.scalars().all()}
domain_map = {d.id: d.slug for d in domain_obj_map.values()}
domain_status = {d.slug: d.status for d in domain_obj_map.values()}
# C9: TPSC snapshot count per repo_id
repo_ids = [r.id for r in repos]
tpsc_result = await session.execute(
select(TPSCSnapshot.repo_id, func.count().label("cnt"))
.where(TPSCSnapshot.repo_id.in_(repo_ids))
.group_by(TPSCSnapshot.repo_id)
)
id_to_slug = {r.id: r.slug for r in repos}
tpsc_snap_counts = {id_to_slug[row.repo_id]: row.cnt for row in tpsc_result if row.repo_id in id_to_slug}
# C10: active repo goal count per repo_id
goals_result = await session.execute(
select(RepoGoal.repo_id, func.count().label("cnt"))
.where(RepoGoal.repo_id.in_(repo_ids), RepoGoal.status == "active")
.group_by(RepoGoal.repo_id)
)
active_goal_counts = {id_to_slug[row.repo_id]: row.cnt for row in goals_result if row.repo_id in id_to_slug}
prefetch = {
"domain_status": domain_status,
"tpsc_snap_counts": tpsc_snap_counts,
"active_goal_counts": active_goal_counts,
}
# ─────────────────────────────────────────────────────────────────────────
async def _check_one(repo: ManagedRepo) -> DoISummaryEntry: async def _check_one(repo: ManagedRepo) -> DoISummaryEntry:
repo_dict = { repo_dict = {
@@ -90,9 +123,7 @@ async def doi_summary(session: AsyncSession = Depends(get_session)) -> list[DoIS
"host_paths": repo.host_paths or {}, "host_paths": repo.host_paths or {},
"last_sbom_at": str(repo.last_sbom_at) if repo.last_sbom_at else None, "last_sbom_at": str(repo.last_sbom_at) if repo.last_sbom_at else None,
} }
# skip_consistency=True: omits C7/C13 subprocess calls for speed. report = await _doi_evaluate(repo_dict, skip_consistency=True, prefetch=prefetch)
# The full check is available via GET /repos/{slug}/doi.
report = await _doi_evaluate(repo_dict, skip_consistency=True)
return DoISummaryEntry( return DoISummaryEntry(
repo_slug=repo.slug, repo_slug=repo.slug,
domain_slug=domain_map.get(repo.domain_id), domain_slug=domain_map.get(repo.domain_id),