perf(doi): eliminate HTTP self-calls in summary — 48 calls → 3 bulk DB queries

Root cause: criteria C2 (domain assigned), C9 (TPSC declared) and C10
(active repo goal) each made a full HTTP round-trip back to the API
(asyncio.to_thread → urllib → TCP → uvicorn → SQLAlchemy → DB) for every
repo. 16 repos × 3 calls = 48 self-calls at ~80-150ms each = ~6s total.

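Fix: doi_engine.evaluate() now accepts an optional prefetch dict. The
summary endpoint runs 3 bulk GROUP BY queries (domain status, TPSC
snapshot counts, active goal counts) and passes the results in directly,
so C2/C9/C10 make zero HTTP self-calls in summary mode. When prefetch (or
one of its keys) is absent, evaluate() falls back to the per-repo HTTP
call for that criterion.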

Result: /repos/doi/summary 6s → <1s (>6× improvement on top of the prior
13×). Total improvement from the original: 108s → <1s.

Co-Authored-By: Claude Sonnet 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-03-20 01:37:40 +01:00
parent 9ba1501b49
commit 245cd72ba3
2 changed files with 75 additions and 18 deletions

View File

@@ -114,7 +114,21 @@ async def evaluate(
repo: dict,
api_base: str = "http://127.0.0.1:8000",
skip_consistency: bool = False,
prefetch: dict | None = None,
) -> DoIReport:
"""Evaluate all 14 DoI criteria for a repo.
Args:
repo: Repo dict (slug, domain_slug, local_path, remote_url, host_paths, last_sbom_at).
api_base: API base URL — only used when prefetch is absent.
skip_consistency: Skip C7/C13 subprocess calls (used in summary mode).
prefetch: Optional pre-fetched bulk data to avoid HTTP self-calls:
{
"domain_status": {"custodian": "active", ...}, # slug → status
"tpsc_snap_counts": {"llm-connect": 1, ...}, # repo_slug → count
"active_goal_counts": {"llm-connect": 0, ...}, # repo_slug → count
}
"""
slug = repo.get("slug", "unknown")
results: list[CriterionResult] = []
@@ -133,11 +147,15 @@ async def evaluate(
if not domain_slug:
_r("C2", "Domain assigned", "core", "fail", "No domain_slug on repo record")
else:
domain = await _get(api_base, f"/domains/{domain_slug}/")
if domain and domain.get("status") == "active":
if prefetch and "domain_status" in prefetch:
dom_status = prefetch["domain_status"].get(domain_slug)
else:
d = await _get(api_base, f"/domains/{domain_slug}/")
dom_status = d.get("status") if d else None
if dom_status == "active":
_r("C2", "Domain assigned", "core", "pass", f"domain: {domain_slug}")
elif domain:
_r("C2", "Domain assigned", "core", "warn", f"Domain '{domain_slug}' status: {domain.get('status')}")
elif dom_status:
_r("C2", "Domain assigned", "core", "warn", f"Domain '{domain_slug}' status: {dom_status}")
else:
_r("C2", "Domain assigned", "core", "fail", f"Domain '{domain_slug}' not found")
@@ -196,12 +214,17 @@ async def evaluate(
# C9: TPSC declared (tpsc.yaml present + snapshot exists)
tpsc_file_ok = repo_path and (Path(repo_path) / "tpsc.yaml").exists()
tpsc_snaps = await _get(api_base, "/tpsc/snapshots/", {"repo_slug": slug}) or []
has_snap = len(tpsc_snaps) > 0
if prefetch and "tpsc_snap_counts" in prefetch:
has_snap = (prefetch["tpsc_snap_counts"].get(slug, 0) > 0)
snap_count = prefetch["tpsc_snap_counts"].get(slug, 0)
else:
tpsc_snaps = await _get(api_base, "/tpsc/snapshots/", {"repo_slug": slug}) or []
has_snap = len(tpsc_snaps) > 0
snap_count = len(tpsc_snaps)
if not repo_path:
_r("C9", "TPSC declared", "standard", "skip", "Local path unavailable")
elif tpsc_file_ok and has_snap:
_r("C9", "TPSC declared", "standard", "pass", f"{len(tpsc_snaps)} snapshot(s)")
_r("C9", "TPSC declared", "standard", "pass", f"{snap_count} snapshot(s)")
elif tpsc_file_ok and not has_snap:
_r("C9", "TPSC declared", "standard", "warn", "tpsc.yaml exists but not yet ingested — run make ingest-tpsc")
elif not tpsc_file_ok:
@@ -210,10 +233,13 @@ async def evaluate(
# ── Tier 3: Full ─────────────────────────────────────────────────────────
# C10: active repo goal
goals = await _get(api_base, "/repo-goals/", {"repo_slug": slug}) or []
active_goals = [g for g in goals if g.get("status") == "active"]
if active_goals:
_r("C10", "Active repo goal", "full", "pass", f"{len(active_goals)} active goal(s)")
if prefetch and "active_goal_counts" in prefetch:
active_goal_count = prefetch["active_goal_counts"].get(slug, 0)
else:
goals = await _get(api_base, "/repo-goals/", {"repo_slug": slug}) or []
active_goal_count = sum(1 for g in goals if g.get("status") == "active")
if active_goal_count > 0:
_r("C10", "Active repo goal", "full", "pass", f"{active_goal_count} active goal(s)")
else:
_r("C10", "Active repo goal", "full", "fail", "No active repo goal — create one with create_repo_goal()")