feat(api): dashboard poll optimisation — T1, T2, T3

T1: Cache-Control max-age=60 on /topics/, /repos/, /domains/ list endpoints so repeated dashboard polls within a minute are served from browser cache. T2: ETag middleware (md5 hash) on all JSON GET responses with conditional-GET (304 Not Modified) support; If-None-Match and ETag added to CORS headers. ETag registered inside CORS so 304s automatically carry CORS headers. T3: GET /state/deps — lightweight dep-graph endpoint returning open workstreams with depends_on/blocks edges only, skipping the 10-table full-summary query. Prerequisite for T4 (switching workstreams.md and dependencies.md off /state/summary). Workplan: CUST-WP-0039-dashboard-poll-optimization.md Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-11 17:26:30 +02:00
parent 6f3a46dd07
commit 512c0a73ed
6 changed files with 340 additions and 4 deletions
--- a/state-hub/api/main.py
+++ b/state-hub/api/main.py
@@ -1,8 +1,12 @@
+import hashlib
 import os
 from contextlib import asynccontextmanager

 from fastapi import FastAPI
 from fastapi.middleware.cors import CORSMiddleware
+from starlette.middleware.base import BaseHTTPMiddleware
+from starlette.requests import Request
+from starlette.responses import Response as StarletteResponse

 from api.database import engine
 from api.routers import decisions, extension_points, progress, state, tasks, technical_debt, topics, workstreams, workstream_dependencies
@@ -12,6 +16,40 @@ from api.routers import interface_changes
 from api.routers import flows


+class ETagMiddleware(BaseHTTPMiddleware):
+    """Add ETag + conditional-GET (304) support to successful JSON GET responses.
+
+    The body of every ``200`` JSON response to a GET is buffered and hashed
+    (MD5, used as a checksum only) to form a strong ETag. When the request's
+    ``If-None-Match`` names that tag, an empty ``304`` is returned instead of
+    the body. Non-GET requests, non-200 responses and non-JSON responses pass
+    through untouched.
+    """
+
+    # Header fields a 304 must repeat when the 200 would have carried them
+    # (RFC 9110, section 15.4.5). ETag is always included separately.
+    _REVALIDATION_HEADERS = frozenset(
+        {"cache-control", "content-location", "date", "expires", "vary"}
+    )
+
+    @staticmethod
+    def _if_none_match_hits(header_value: str | None, etag: str) -> bool:
+        """Return True when an ``If-None-Match`` value selects *etag*.
+
+        Handles the wildcard ``*``, comma-separated lists, and weak validators
+        (``W/"..."``): If-None-Match uses weak comparison, and intermediaries
+        that re-encode the body commonly downgrade strong tags to weak ones.
+        """
+        if not header_value:
+            return False
+        if header_value.strip() == "*":
+            return True
+        return any(
+            candidate.strip().removeprefix("W/") == etag
+            for candidate in header_value.split(",")
+        )
+
+    async def dispatch(self, request: Request, call_next):
+        """Run the downstream app, then tag or short-circuit its response.
+
+        Args:
+            request: The incoming request.
+            call_next: Callable that forwards the request downstream.
+
+        Returns:
+            The untouched downstream response, a bodyless 304, or a buffered
+            copy of the downstream response carrying an ``ETag`` header.
+        """
+        response = await call_next(request)
+        # Conditional GET only applies to a successful representation: error
+        # payloads and non-GET methods are returned as-is, without buffering,
+        # so a JSON error can never be turned into a 304.
+        if request.method != "GET" or response.status_code != 200:
+            return response
+        if "application/json" not in response.headers.get("content-type", ""):
+            return response
+
+        # The downstream body is only available as an async iterator, so it
+        # has to be drained completely before it can be hashed.
+        body = b"".join([chunk async for chunk in response.body_iterator])
+        etag = '"' + hashlib.md5(body, usedforsecurity=False).hexdigest() + '"'
+
+        # Content-Length is dropped here; the replacement response sets its own.
+        headers = {k: v for k, v in response.headers.items() if k.lower() != "content-length"}
+        headers["ETag"] = etag
+        if not any(k.lower() == "cache-control" for k in headers):
+            headers["Cache-Control"] = "no-cache"
+
+        if self._if_none_match_hits(request.headers.get("if-none-match"), etag):
+            # Mirror the caching headers of the would-be 200. A hard-coded
+            # "no-cache" here would overwrite an endpoint's own max-age policy
+            # in the client's cache on every revalidation.
+            return StarletteResponse(
+                status_code=304,
+                headers={
+                    k: v
+                    for k, v in headers.items()
+                    if k == "ETag" or k.lower() in self._REVALIDATION_HEADERS
+                },
+            )
+
+        return StarletteResponse(
+            content=body,
+            status_code=response.status_code,
+            headers=headers,
+            media_type=response.media_type,
+        )
+
+
@asynccontextmanager
 async def lifespan(app: FastAPI):
    yield
@@ -28,11 +66,13 @@ app = FastAPI(
 _cors_env = os.getenv("CORS_ORIGINS", "http://localhost:3000,http://127.0.0.1:3000")
 _cors_origins = [o.strip() for o in _cors_env.split(",") if o.strip()]

+# Registration order matters: Starlette wraps each newly added middleware
+# around the ones added before it, so CORSMiddleware (added last) is the
+# outermost layer and also decorates the 304s produced by ETagMiddleware.
+# If-None-Match is allowed as a request header and ETag is exposed so that
+# cross-origin dashboard code can send and read them.
+app.add_middleware(ETagMiddleware)
 app.add_middleware(
     CORSMiddleware,
     allow_origins=_cors_origins,
     allow_methods=["GET", "POST", "PATCH", "DELETE", "PUT"],
-    allow_headers=["Content-Type"],
+    allow_headers=["Content-Type", "If-None-Match"],
+    expose_headers=["ETag"],
 )

 app.include_router(domains.router)
--- a/state-hub/api/routers/domains.py
+++ b/state-hub/api/routers/domains.py
@@ -1,6 +1,6 @@
 import uuid

-from fastapi import APIRouter, Depends, HTTPException, Query, status
+from fastapi import APIRouter, Depends, HTTPException, Query, Response, status
 from sqlalchemy import func, select
 from sqlalchemy.ext.asyncio import AsyncSession

@@ -18,9 +18,11 @@ router = APIRouter(prefix="/domains", tags=["domains"])

@router.get("/", response_model=list[DomainRead])
 async def list_domains(
+    response: Response,
    status: str | None = Query(None, description="active | archived | all"),
    session: AsyncSession = Depends(get_session),
 ) -> list[Domain]:
+    response.headers["Cache-Control"] = "max-age=60, stale-while-revalidate=30"
    q = select(Domain).order_by(Domain.name)
    if status and status != "all":
        q = q.where(Domain.status == status)
--- a/state-hub/api/routers/repos.py
+++ b/state-hub/api/routers/repos.py
@@ -9,7 +9,7 @@ import uuid
 from datetime import datetime, timezone
 from pathlib import Path

-from fastapi import APIRouter, Depends, HTTPException, status
+from fastapi import APIRouter, Depends, HTTPException, Response, status
 from sqlalchemy import case, func, select
 from sqlalchemy.ext.asyncio import AsyncSession

@@ -50,9 +50,11 @@ router = APIRouter(prefix="/repos", tags=["repos"])

@router.get("/", response_model=list[RepoRead])
 async def list_repos(
+    response: Response,
    domain: str | None = None,
    session: AsyncSession = Depends(get_session),
 ) -> list[ManagedRepo]:
+    response.headers["Cache-Control"] = "max-age=60, stale-while-revalidate=30"
    q = select(ManagedRepo).order_by(ManagedRepo.name)
    if domain:
        domain_row = await session.execute(select(Domain).where(Domain.slug == domain))
--- a/state-hub/api/routers/state.py
+++ b/state-hub/api/routers/state.py
@@ -379,6 +379,87 @@ async def _build_domain_summaries(session: AsyncSession) -> list[DomainSummary]:
    ]


+@router.get("/deps", response_model=list[WorkstreamWithDeps])
+async def get_deps(session: AsyncSession = Depends(get_session)) -> list[WorkstreamWithDeps]:
+    """Return open workstreams together with their dependency edges.
+
+    A cheaper alternative to ``open_workstreams`` in /state/summary for pages
+    (workstreams.md, dependencies.md) that only need the dependency graph:
+    it loads active/blocked workstreams, the dependency rows touching them,
+    and just enough extra workstreams/tasks to label each edge. Task counts
+    are not computed here and keep their schema defaults.
+
+    Returns:
+        One ``WorkstreamWithDeps`` per open workstream, ordered by due date
+        (nulls last) then creation time, each carrying ``depends_on`` and
+        ``blocks`` stub lists.
+    """
+    open_query = (
+        select(Workstream)
+        .options(noload("*"))
+        .where(Workstream.status.in_(["active", "blocked"]))
+        .order_by(Workstream.due_date.asc().nullslast(), Workstream.created_at)
+    )
+    open_ws = list((await session.execute(open_query)).scalars().all())
+    open_ids = [ws.id for ws in open_ws]
+
+    # Every dependency row that starts or ends at an open workstream.
+    edges = []
+    if open_ids:
+        touches_open = (
+            WorkstreamDependency.from_workstream_id.in_(open_ids)
+        ) | (
+            WorkstreamDependency.to_workstream_id.in_(open_ids)
+        )
+        edge_result = await session.execute(select(WorkstreamDependency).where(touches_open))
+        edges = list(edge_result.scalars().all())
+
+    # Ids referenced by those edges; the far end may not be an open workstream.
+    referenced_ws_ids: set = {e.from_workstream_id for e in edges}
+    referenced_ws_ids |= {e.to_workstream_id for e in edges if e.to_workstream_id}
+    referenced_task_ids: set = {e.to_task_id for e in edges if e.to_task_id}
+
+    ws_by_id: dict = {ws.id: ws for ws in open_ws}
+    missing_ids = referenced_ws_ids.difference(ws_by_id)
+    if missing_ids:
+        missing_result = await session.execute(
+            select(Workstream).options(noload("*")).where(Workstream.id.in_(missing_ids))
+        )
+        ws_by_id.update({ws.id: ws for ws in missing_result.scalars()})
+
+    tasks_by_id: dict = {}
+    if referenced_task_ids:
+        task_result = await session.execute(
+            select(Task).options(noload("*")).where(Task.id.in_(referenced_task_ids))
+        )
+        tasks_by_id = {task.id: task for task in task_result.scalars().all()}
+
+    # Only open workstreams get edge lists; edges are appended in row order.
+    depends_on: dict = {ws_id: [] for ws_id in open_ids}
+    blocks: dict = {ws_id: [] for ws_id in open_ids}
+    for edge in edges:
+        source_id = edge.from_workstream_id
+        target_id = edge.to_workstream_id
+        target_task_id = edge.to_task_id
+        source_is_open = source_id in depends_on
+
+        if source_is_open and target_id and target_id in ws_by_id:
+            target = ws_by_id[target_id]
+            depends_on[source_id].append(WorkstreamDepStub(
+                dep_id=edge.id, target_type="workstream", relationship_type=edge.relationship_type,
+                workstream_id=target_id, workstream_slug=target.slug,
+                workstream_title=target.title, description=edge.description,
+            ))
+        if source_is_open and target_task_id and target_task_id in tasks_by_id:
+            depends_on[source_id].append(WorkstreamDepStub(
+                dep_id=edge.id, target_type="task", relationship_type=edge.relationship_type,
+                task_id=target_task_id, task_title=tasks_by_id[target_task_id].title,
+                description=edge.description,
+            ))
+        # Reverse view of the same edge: the target is blocked-on by the source.
+        if target_id and target_id in blocks and source_id in ws_by_id:
+            source = ws_by_id[source_id]
+            blocks[target_id].append(WorkstreamDepStub(
+                dep_id=edge.id, target_type="workstream", relationship_type=edge.relationship_type,
+                workstream_id=source_id, workstream_slug=source.slug,
+                workstream_title=source.title, description=edge.description,
+            ))
+
+    result = []
+    for ws in open_ws:
+        base_fields = WorkstreamRead.model_validate(ws).model_dump()
+        result.append(
+            WorkstreamWithDeps(**base_fields, depends_on=depends_on[ws.id], blocks=blocks[ws.id])
+        )
+    return result
+
+
 _PRIORITY_RANK = {
    TaskPriority.critical: 0,
    TaskPriority.high: 1,
--- a/state-hub/api/routers/topics.py
+++ b/state-hub/api/routers/topics.py
@@ -1,6 +1,6 @@
 import uuid

-from fastapi import APIRouter, Depends, HTTPException, status
+from fastapi import APIRouter, Depends, HTTPException, Response, status
 from sqlalchemy import select
 from sqlalchemy.ext.asyncio import AsyncSession

@@ -23,9 +23,11 @@ async def _resolve_domain_id(domain_slug: str, session: AsyncSession) -> uuid.UU

@router.get("/", response_model=list[TopicRead])
 async def list_topics(
+    response: Response,
    status: TopicStatus | None = None,
    session: AsyncSession = Depends(get_session),
 ) -> list[Topic]:
+    response.headers["Cache-Control"] = "max-age=60, stale-while-revalidate=30"
    q = select(Topic)
    if status:
        q = q.where(Topic.status == status)
--- a/workplans/CUST-WP-0039-dashboard-poll-optimization.md
+++ b/workplans/CUST-WP-0039-dashboard-poll-optimization.md
@@ -0,0 +1,209 @@
+---
+id: CUST-WP-0039
+type: workplan
+title: "Dashboard Poll Optimization"
+domain: custodian
+status: todo
+owner: custodian
+topic_slug: custodian
+created: "2026-05-11"
+updated: "2026-05-11"
+state_hub_workstream_id: "d5ffb008-a517-4b8b-86ce-093fcc285fb3"
+---
+
+# Dashboard Poll Optimization
+
+## Problem
+
+With `uvicorn --reload` watching `.venv/` now fixed (CUST-WP-0039 precursor), the
+remaining sustained load on the API worker comes from the dashboard polling pattern:
+
+- **24 pages**, 14 with active polling loops (POLL_HEAVY = 60 s, POLL = 15 s)
+- **`index.md` alone** runs 4 independent polling loops firing 11 API calls per cycle across 8 endpoints:
+  `/state/summary`, `/sbom/snapshots/`, `/progress/`, `/workstreams/`, `/tasks/?limit=2000`,
+  `/topics/`, `/repos/`, `/workstreams/workplan-index`
+- **`workstreams.md` and `dependencies.md`** each call `/state/summary` (the most
+  expensive endpoint — queries 10+ tables) every 60 s just to extract dependency
+  edges from `open_workstreams[].depends_on`
+- **Reference data** (`/topics/`, `/repos/`) is fetched independently by 10+ pages
+  every 60 s with no caching; these datasets change rarely
+- **Background tabs** still poll at 120 s (`POLL_HIDDEN`) — they could pause entirely
+
+## Goals
+
+Reduce API request rate and per-request cost when the dashboard is open, without
+degrading UX or data freshness for the pages the user is actively viewing.
+
+## Out of scope
+
+- SSE / WebSocket push (would require significant API rework)
+- Observable data loaders / static build mode (different deployment model)
+- BroadcastChannel cross-tab sharing (nice-to-have, not in this workplan)
+
+---
+
+## Tasks
+
+### T1 — Add Cache-Control headers to reference endpoints
+
+```task
+id: CUST-WP-0039-T1
+status: todo
+priority: high
+state_hub_task_id: "b36713d8-d1d5-43c5-86c3-e22f72b68d62"
+```
+
+Add `Cache-Control: max-age=60, stale-while-revalidate=30` to the list responses
+for `/topics/`, `/repos/`, and `/domains/`. These datasets change only when a human
+explicitly creates/renames a domain or registers a repo — never on their own.
+
+Browser-level caching means that when 10 pages all fetch `/topics/` within a 60 s
+window, only the first request hits the API; the rest are served from cache.
+
+**Implementation:** Add a FastAPI middleware or a response-header dependency in
+`api/routers/topics.py`, `repos.py`, and `domains.py` list endpoints. Use
+`from fastapi.responses import Response` + `response.headers["Cache-Control"]`, or
+a shared `cache_headers` dependency.
+
+---
+
+### T2 — Add ETag support to high-frequency list endpoints
+
+```task
+id: CUST-WP-0039-T2
+status: todo
+priority: high
+state_hub_task_id: "75f1c2cd-0baf-4747-8c67-1dbfa81bde41"
+```
+
+Add `ETag` (content hash of the response body) and handle `If-None-Match` for
+`/workstreams/`, `/tasks/`, and `/state/summary`. When the data hasn't changed the
+API returns `304 Not Modified` with no body — roughly 95% smaller than a full
+response.
+
+**Implementation:**
+- Add a FastAPI middleware (in `api/main.py`) that intercepts JSON list responses,
+  computes `md5(body)`, sets `ETag: "<hash>"`, and returns 304 if the request
+  carries a matching `If-None-Match` header.
+- No client changes needed — the browser's HTTP cache revalidates with
+  `If-None-Match` automatically when the response includes `Cache-Control: no-cache`
+  (which forces revalidation but allows 304); `fetch()` still sees a normal 200.
+
+---
+
+### T3 — Add lightweight `/state/deps` endpoint
+
+```task
+id: CUST-WP-0039-T3
+status: todo
+priority: high
+state_hub_task_id: "cb7608d3-5dad-4b51-9b91-080539f7aa65"
+```
+
+`workstreams.md` and `dependencies.md` call `/state/summary` (a ~10-table query)
+only to extract `open_workstreams[].{id, depends_on, blocks}`. Add a dedicated
+endpoint that returns just this:
+
+```json
+GET /state/deps
+→ [{"id": "...", "title": "...", "status": "...", "depends_on": [...], "blocks": [...]}]
+```
+
+Query: `SELECT id, title, status FROM workstreams WHERE status IN ('active','blocked')`
+plus the dependency join — roughly 1/10th the work of the full summary.
+
+**Implementation:** New route in `api/routers/state.py` (or a new `deps.py`).
+Schema: `WorkstreamDepStub` already exists in `api/schemas/workstream_dependency.py`
+— reuse or extend it.
+
+---
+
+### T4 — Replace `/state/summary` in workstreams.md and dependencies.md
+
+```task
+id: CUST-WP-0039-T4
+status: todo
+priority: medium
+depends_on: [CUST-WP-0039-T3]
+state_hub_task_id: "b80dce9c-b1ef-4606-9460-5100d6f58bce"
+```
+
+Switch `workstreams.md` and `dependencies.md` to use the new `/state/deps` endpoint
+instead of the full `/state/summary`. Both pages construct a dep-edge map from
+`open_workstreams[].depends_on`; `/state/deps` provides exactly that.
+
+Changes:
+- `dashboard/src/workstreams.md`: replace `apiFetch("/state/summary", ...)` with
+  `apiFetch("/state/deps")`, update the variable extraction (`openWs = depsData`)
+- `dashboard/src/dependencies.md`: same substitution, update edge-building loop
+
+---
+
+### T5 — Consolidate index.md's 4 polling loops into 1
+
+```task
+id: CUST-WP-0039-T5
+status: todo
+priority: medium
+state_hub_task_id: "7c2d5e01-9de5-48ad-aa0b-a37cf5332ad9"
+```
+
+`index.md` runs 4 independent `while(true)` generators (`summaryState`,
+`sbomSnapState`, `regsState`, `wsChartState`) that each sleep 60 s independently.
+They were split because different sections needed different data, but they all use
+POLL_HEAVY and can be unified into a single loop with one `Promise.all` that fetches
+all 8 endpoints together.
+
+Benefits:
+- 4 timers → 1: simpler, predictable, backoff applies uniformly
+- Fetch batching: all 8 requests fire simultaneously, most finish within the same
+  server round-trip window
+- Simpler failure handling: one `failures` counter, one backoff
+
+Approach: single `pageState` generator that yields a flat object with all fields
+(summary, snapshots, milestones, wsAll). Destructure at the use sites.
+
+---
+
+### T6 — Full visibility-based polling pause in config.js
+
+```task
+id: CUST-WP-0039-T6
+status: todo
+priority: low
+state_hub_task_id: "31b6a353-040a-4f87-b2f1-1deab5cf6191"
+```
+
+`pollDelay()` currently extends the interval to `POLL_HIDDEN = 120 s` when the tab
+is hidden. Change this to pause polling entirely while hidden and resume immediately
+on `visibilitychange`.
+
+**Implementation:**
+
+```js
+// config.js — replace pollDelay() with:
+/**
+ * Wait before the next poll. Sleeps for `base` when there is no `document`
+ * or the tab is visible; otherwise sets no timer and resolves on the next
+ * `visibilitychange` event.
+ * NOTE(review): assumes `sleep(ms)` is already in scope in config.js.
+ */
+export async function waitForVisible(base) {
+  if (typeof document === "undefined") return sleep(base);
+  if (document.visibilityState === "visible") return sleep(base);
+  return new Promise(resolve => {
+    // One-shot listener: it removes itself, then resolves the promise.
+    const handler = () => { document.removeEventListener("visibilitychange", handler); resolve(); };
+    document.addEventListener("visibilitychange", handler);
+  });
+}
+```
+
+Pages replace `await sleep(pollDelay(...))` with `await waitForVisible(base)`.
+When the user switches back to the tab, the next poll fires immediately rather
+than waiting up to 120 s for the `POLL_HIDDEN` interval to elapse.
+
+---
+
+## Expected impact
+
+| Change | Expected effect |
+|--------|------------------|
+| T1 (cache headers) | ~70% drop in /topics, /repos, /domains hits |
+| T2 (ETags) | ~80% payload reduction for unchanged list responses |
+| T3+T4 (deps endpoint) | 2 full summary calls removed per 60 s cycle |
+| T5 (consolidate index) | 4 loops → 1, reduces timer jitter and staggered load |
+| T6 (visibility pause) | Eliminates all background-tab traffic entirely |