Optimize dashboard overview loading

2026-06-06 00:42:00 +02:00
parent a412998c96
commit b340489d96
14 changed files with 990 additions and 88 deletions
--- a/api/main.py
+++ b/api/main.py
@@ -1,5 +1,6 @@
 import hashlib
 import os
+import time
 from contextlib import asynccontextmanager

 from fastapi import FastAPI
@@ -26,26 +27,37 @@ class ETagMiddleware(BaseHTTPMiddleware):
    """Add ETag + conditional-GET (304) support to all JSON GET responses."""

    async def dispatch(self, request: Request, call_next):
+        started = time.perf_counter()
        response = await call_next(request)
        if request.method != "GET":
+            response.headers["X-StateHub-Elapsed-Ms"] = f"{(time.perf_counter() - started) * 1000:.1f}"
            return response
        if "application/json" not in response.headers.get("content-type", ""):
+            response.headers["X-StateHub-Elapsed-Ms"] = f"{(time.perf_counter() - started) * 1000:.1f}"
            return response

        body_parts = []
        async for chunk in response.body_iterator:
            body_parts.append(chunk)
        body = b"".join(body_parts)
+        elapsed_ms = f"{(time.perf_counter() - started) * 1000:.1f}"

        etag = '"' + hashlib.md5(body, usedforsecurity=False).hexdigest() + '"'
        if request.headers.get("if-none-match") == etag:
            return StarletteResponse(
                status_code=304,
-                headers={"ETag": etag, "Cache-Control": "no-cache"},
+                headers={
+                    "ETag": etag,
+                    "Cache-Control": "no-cache",
+                    "X-StateHub-Elapsed-Ms": elapsed_ms,
+                    "X-StateHub-Response-Bytes": "0",
+                },
            )

        headers = {k: v for k, v in response.headers.items() if k.lower() != "content-length"}
        headers["ETag"] = etag
+        headers["X-StateHub-Elapsed-Ms"] = elapsed_ms
+        headers["X-StateHub-Response-Bytes"] = str(len(body))
        if not any(k.lower() == "cache-control" for k in headers):
            headers["Cache-Control"] = "no-cache"
        return StarletteResponse(
@@ -84,7 +96,7 @@ app.add_middleware(
    allow_origins=_cors_origins,
    allow_methods=["GET", "POST", "PATCH", "DELETE", "PUT"],
    allow_headers=["Content-Type", "If-None-Match"],
-    expose_headers=["ETag"],
+    expose_headers=["ETag", "X-StateHub-Elapsed-Ms", "X-StateHub-Response-Bytes", "X-StateHub-Cache"],
 )

 app.include_router(domains.router)
--- a/api/routers/state.py
+++ b/api/routers/state.py
@@ -1,7 +1,7 @@
 import time
 from datetime import datetime, timedelta, timezone

-from fastapi import APIRouter, Depends, Request
+from fastapi import APIRouter, Depends, Request, Response
 from fastapi.responses import JSONResponse
 from sqlalchemy import func, select, text
 from sqlalchemy.ext.asyncio import AsyncSession
@@ -17,6 +17,7 @@ from api.models.extension_point import ExtensionPoint
 from api.models.managed_repo import ManagedRepo
 from api.models.progress_event import ProgressEvent
 from api.models.sbom_entry import SBOMEntry
+from api.models.sbom_snapshot import SBOMSnapshot
 from api.models.task import Task, TaskPriority, TaskStatus
 from api.models.technical_debt import TechnicalDebt
 from api.models.topic import Topic, TopicStatus
@@ -26,6 +27,9 @@ from api.schemas.decision import DecisionRead
 from api.schemas.domain import DomainSummary
 from api.schemas.progress_event import ProgressEventRead
 from api.schemas.state import (
+    DashboardOverview,
+    DashboardSourceMeta,
+    DashboardWorkplanRow,
    DecisionTotals,
    NextStep,
    StateSummary,
@@ -38,6 +42,7 @@ from api.schemas.task import TaskRead
 from api.schemas.topic import TopicRead, TopicWithWorkstreams
 from api.schemas.workstream import WorkstreamRead, WorkstreamWithTaskCounts, WorkstreamWithDeps
 from api.schemas.workstream_dependency import WorkstreamDepStub
+from api.routers.workstreams import _workplan_index
 from api.task_status import TERMINAL_TASK_STATUSES, status_value
 from api.workplan_status import (
    CLOSED_WORKSTREAM_STATUSES,
@@ -51,17 +56,25 @@ router = APIRouter(prefix="/state", tags=["state"])
 _SUMMARY_CACHE: StateSummary | None = None
 _SUMMARY_CACHE_AT: float = 0.0
 _SUMMARY_TTL = 15.0
+_OVERVIEW_CACHE: DashboardOverview | None = None  # last overview built by get_overview; module-level, shared by all requests
+_OVERVIEW_CACHE_AT: float = 0.0  # time.monotonic() reading taken when _OVERVIEW_CACHE was stored
+_OVERVIEW_TTL = 10.0  # seconds a cached overview is served before being rebuilt


@router.get("/summary", response_model=StateSummary)
 async def get_summary(
    request: Request,
+    response: Response,
    session: AsyncSession = Depends(get_session),
 ) -> StateSummary:
    global _SUMMARY_CACHE, _SUMMARY_CACHE_AT
    no_cache = "no-cache" in request.headers.get("cache-control", "")
    if not no_cache and _SUMMARY_CACHE is not None and (time.monotonic() - _SUMMARY_CACHE_AT) < _SUMMARY_TTL:
+        response.headers["X-StateHub-Cache"] = "hit"
+        response.headers["Cache-Control"] = "max-age=15, stale-while-revalidate=30"
        return _SUMMARY_CACHE
+    response.headers["X-StateHub-Cache"] = "miss"
+    response.headers["Cache-Control"] = "max-age=15, stale-while-revalidate=30"
    # Run all queries sequentially on one session.
    # AsyncSession does not support concurrent operations (no gather on same session).

@@ -362,6 +375,309 @@ async def get_summary(
    return result


+@router.get("/overview", response_model=DashboardOverview)
+async def get_overview(
+    request: Request,
+    response: Response,
+    session: AsyncSession = Depends(get_session),
+) -> DashboardOverview:
+    """Bounded dashboard overview read model.
+
+    This is intentionally narrower than /state/summary. The dashboard overview
+    needs counts, recent rows, and chart-ready workplan rows; it does not need
+    full task or workplan lists transferred to the browser on every poll.
+    """
+    global _OVERVIEW_CACHE, _OVERVIEW_CACHE_AT
+    no_cache = "no-cache" in request.headers.get("cache-control", "")  # NOTE(review): the dashboard fetches this route with cache: "reload", which per the Fetch spec adds Cache-Control: no-cache — confirm the in-process cache is ever hit from the browser
+    if not no_cache and _OVERVIEW_CACHE is not None and (time.monotonic() - _OVERVIEW_CACHE_AT) < _OVERVIEW_TTL:  # serve the cached model while it is younger than the TTL
+        response.headers["X-StateHub-Cache"] = "hit"
+        response.headers["Cache-Control"] = "max-age=10, stale-while-revalidate=30"  # max-age is hard-coded; keep it in step with _OVERVIEW_TTL
+        return _OVERVIEW_CACHE
+
+    response.headers["X-StateHub-Cache"] = "miss"
+    response.headers["Cache-Control"] = "max-age=10, stale-while-revalidate=30"  # same caching policy as the hit path
+    result = await _build_dashboard_overview(session)
+    _OVERVIEW_CACHE = result  # a no-cache request still refreshes the shared cache for later callers
+    _OVERVIEW_CACHE_AT = time.monotonic()
+    return result
+
+
+async def _build_dashboard_overview(session: AsyncSession) -> DashboardOverview:  # runs every query one after another on the given session, then assembles the response model
+    topics_rows = await session.execute(
+        select(Topic)
+        .options(
+            selectinload(Topic.domain),
+            noload(Topic.workstreams),
+            noload(Topic.decisions),
+            noload(Topic.progress_events),
+        )
+        .where(Topic.status != TopicStatus.archived)  # archived topics are left out of the topic list
+        .order_by(Topic.created_at)
+    )
+    topics = list(topics_rows.scalars().all())
+    topic_map = {topic.id: topic for topic in topics}
+
+    workstream_rows = await session.execute(
+        select(Workstream)
+        .options(noload("*"))  # relationships are not loaded; only column data is used below
+        .order_by(
+            Workstream.planning_priority.asc().nullslast(),
+            Workstream.planning_order.asc().nullslast(),
+            Workstream.updated_at.desc(),
+        )
+    )
+    workstreams_all = list(workstream_rows.scalars().all())
+
+    topic_workstreams: dict = {t.id: [] for t in topics}
+    for w in sorted(workstreams_all, key=lambda item: item.created_at):  # per-topic lists are ordered by creation time, not by the planning order used above
+        if w.topic_id not in topic_workstreams:  # workstream's topic is not in the non-archived topic set
+            continue
+        topic_workstreams[w.topic_id].append({
+            "id": w.id,
+            "slug": w.slug,
+            "title": w.title,
+            "status": w.status,
+            "owner": w.owner,
+            "due_date": w.due_date,
+        })
+
+    repo_rows = await session.execute(
+        select(ManagedRepo.id, ManagedRepo.slug, Domain.slug)
+        .join(Domain, Domain.id == ManagedRepo.domain_id)
+        .order_by(ManagedRepo.slug)
+    )
+    repo_map = {
+        repo_id: {"slug": repo_slug, "domain_slug": domain_slug}
+        for repo_id, repo_slug, domain_slug in repo_rows
+    }
+
+    task_counts_by_ws: dict = {}
+    task_statuses_per_ws: dict = {}
+    task_totals_by_status: dict[str, int] = {}
+    for ws_id, task_status, count in await session.execute(  # one grouped query feeds per-workstream counts, per-workstream status lists, and global totals
+        select(Task.workstream_id, Task.status, func.count()).group_by(Task.workstream_id, Task.status)
+    ):
+        status = status_value(task_status)
+        task_counts_by_ws.setdefault(ws_id, {"done": 0, "progress": 0, "wait": 0, "todo": 0, "total": 0})
+        task_counts_by_ws[ws_id]["total"] += count
+        if status in {"done", "progress", "wait", "todo"}:  # any other status only contributes to "total"
+            task_counts_by_ws[ws_id][status] += count
+        task_statuses_per_ws.setdefault(ws_id, []).extend([status] * count)  # expanded to one entry per task for the flow-engine input below
+        task_totals_by_status[status] = task_totals_by_status.get(status, 0) + count
+
+    open_ws = [
+        w for w in workstreams_all
+        if normalize_workstream_status(w.status) in OPEN_WORKSTREAM_STATUSES
+    ]
+    open_ws_ids = [w.id for w in open_ws]
+    dep_rows = []
+    if open_ws_ids:  # skip the dependency query entirely when nothing is open
+        dep_result = await session.execute(
+            select(WorkstreamDependency).where(
+                (WorkstreamDependency.from_workstream_id.in_(open_ws_ids))
+                | (WorkstreamDependency.to_workstream_id.in_(open_ws_ids))
+            )
+        )
+        dep_rows = list(dep_result.scalars().all())
+
+    ws_lookup = {w.id: w for w in workstreams_all}
+    workstream_flow = load_flow("workstream")
+    flow_engine = FlowEngine()
+    effective_status: dict = {}
+    for w in open_ws:  # scans all dep_rows once per open workstream
+        flow_obj = {
+            "status": w.status,
+            "workstation": w.status,
+            "tasks": [{"status": status} for status in task_statuses_per_ws.get(w.id, [])],
+            "dependencies": [
+                {"workstation": normalize_workstream_status(ws_lookup[d.to_workstream_id].status)}
+                for d in dep_rows
+                if d.from_workstream_id == w.id and d.to_workstream_id and d.to_workstream_id in ws_lookup  # only outgoing dependencies whose target workstream was loaded
+            ],
+        }
+        flow_result = flow_engine.evaluate(flow_obj, workstream_flow)
+        effective_status[w.id] = "blocked" if flow_result.exit_blocked else normalize_workstream_status(w.status)  # an exit-blocked flow result overrides the stored status
+
+    topic_counts = {r[0]: r[1] for r in await session.execute(
+        select(Topic.status, func.count()).group_by(Topic.status)
+    )}
+    ws_counts = {r[0]: r[1] for r in await session.execute(
+        select(Workstream.status, func.count()).group_by(Workstream.status)
+    )}
+    dec_counts = {r[0]: r[1] for r in await session.execute(
+        select(Decision.status, func.count()).group_by(Decision.status)
+    )}
+
+    totals = Totals(
+        topics=TopicTotals(
+            active=topic_counts.get(TopicStatus.active, 0),
+            paused=topic_counts.get(TopicStatus.paused, 0),
+            archived=topic_counts.get(TopicStatus.archived, 0),
+            total=sum(topic_counts.values()),
+        ),
+        workstreams=WorkstreamTotals(
+            proposed=ws_counts.get("proposed", 0),
+            ready=ws_counts.get("ready", 0) + ws_counts.get("todo", 0),
+            active=sum(1 for status in effective_status.values() if status == "active"),  # active/blocked come from the flow-evaluated statuses, not the raw grouped counts
+            blocked=sum(1 for status in effective_status.values() if status == "blocked"),
+            backlog=ws_counts.get("backlog", 0),
+            finished=(
+                ws_counts.get("finished", 0)
+                + ws_counts.get("completed", 0)
+                + ws_counts.get("accepted", 0)
+            ),
+            archived=ws_counts.get("archived", 0),
+            total=sum(ws_counts.values()),
+        ),
+        tasks=TaskTotals(
+            wait=task_totals_by_status.get("wait", 0),
+            todo=task_totals_by_status.get("todo", 0),
+            progress=task_totals_by_status.get("progress", 0),
+            done=task_totals_by_status.get("done", 0),
+            cancel=task_totals_by_status.get("cancel", 0),
+            total=sum(task_totals_by_status.values()),
+        ),
+        decisions=DecisionTotals(
+            open=dec_counts.get(DecisionStatus.open, 0),
+            resolved=dec_counts.get(DecisionStatus.resolved, 0),
+            escalated=dec_counts.get(DecisionStatus.escalated, 0),
+            superseded=dec_counts.get(DecisionStatus.superseded, 0),
+            total=sum(dec_counts.values()),
+        ),
+    )
+
+    blocking_rows = await session.execute(
+        select(Decision)
+        .where(Decision.decision_type == DecisionType.pending)
+        .where(Decision.status.in_([DecisionStatus.open, DecisionStatus.escalated]))
+        .order_by(Decision.deadline.asc().nullslast(), Decision.created_at)  # earliest deadline first; decisions without a deadline last
+    )
+    blocking = list(blocking_rows.scalars().all())
+
+    waiting_rows = await session.execute(
+        select(Task).options(noload("*")).where(Task.status == TaskStatus.wait).order_by(Task.created_at)  # no LIMIT: every task in wait status is returned
+    )
+    waiting = list(waiting_rows.scalars().all())
+
+    recent_rows = await session.execute(
+        select(ProgressEvent).options(noload("*")).order_by(ProgressEvent.created_at.desc()).limit(20)  # newest 20 events of any type
+    )
+    recent = list(recent_rows.scalars().all())
+
+    milestone_rows = await session.execute(
+        select(ProgressEvent)
+        .options(noload("*"))
+        .where(ProgressEvent.event_type == "milestone")
+        .where(ProgressEvent.summary.like("Project registered with State Hub:%"))  # prefix match selects registration milestones only
+        .order_by(ProgressEvent.created_at.desc())
+        .limit(500)
+    )
+    registration_milestones = list(milestone_rows.scalars().all())
+
+    contrib_type_counts = {r[0].value: r[1] for r in await session.execute(
+        select(Contribution.type, func.count()).group_by(Contribution.type)
+    )}
+    contrib_status_counts = {r[0].value: r[1] for r in await session.execute(
+        select(Contribution.status, func.count()).group_by(Contribution.status)
+    )}
+    contribution_counts = {**contrib_type_counts, **contrib_status_counts}  # one flat dict; a status key overwrites a type key with the same name
+
+    _COPYLEFT_PATS = ("GPL", "AGPL", "LGPL", "EUPL", "CDDL", "MPL")  # matched as substrings of the upper-cased licence string
+    all_direct_prod_rows = await session.execute(
+        select(SBOMEntry.license_spdx)
+        .where(SBOMEntry.is_direct.is_(True))
+        .where(SBOMEntry.is_dev.is_(False))
+    )
+    licence_risk_count = sum(
+        1 for (lic,) in all_direct_prod_rows.all()
+        if lic and any(pat in lic.upper() for pat in _COPYLEFT_PATS)  # entries without a licence string are not counted
+    )
+
+    snapshot_count, package_total = (await session.execute(  # a single aggregate row over all snapshots
+        select(
+            func.count(SBOMSnapshot.id),
+            func.coalesce(func.sum(SBOMSnapshot.entry_count), 0),
+        )
+    )).one()
+
+    open_cap_req_count = (await session.execute(
+        select(func.count()).select_from(CapabilityRequest).where(
+            CapabilityRequest.status.in_(["requested", "accepted", "in_progress", "ready_for_review"])
+        )
+    )).scalar() or 0
+
+    sources: dict[str, DashboardSourceMeta] = {}
+    try:
+        workplan_index = await _workplan_index(refresh=False, session=session)  # refresh=False: an existing cache is returned as-is, even when stale
+        workplan_map = workplan_index.get("workstreams", {})
+        index_meta = workplan_index.get("_meta", {})
+        sources["workplan_index"] = DashboardSourceMeta(
+            ok=not bool(index_meta.get("last_error")),
+            stale=bool(index_meta.get("stale")),
+            cache_age_seconds=index_meta.get("cache_age_seconds"),
+            refresh_in_progress=bool(index_meta.get("refresh_in_progress")),
+            error=index_meta.get("last_error"),
+        )
+    except Exception as exc:  # an index failure degrades to rows without workplan metadata instead of failing the endpoint
+        workplan_map = {}
+        sources["workplan_index"] = DashboardSourceMeta(ok=False, error=str(exc))
+
+    workplan_rows: list[DashboardWorkplanRow] = []
+    for w in workstreams_all:  # one row per workstream, in the planning order of the query above
+        repo = repo_map.get(w.repo_id)
+        topic = topic_map.get(w.topic_id)
+        workplan = workplan_map.get(str(w.id), {})  # the index is looked up by the stringified workstream id
+        counts = task_counts_by_ws.get(w.id, {"done": 0, "progress": 0, "wait": 0, "todo": 0, "total": 0})
+        workplan_rows.append(DashboardWorkplanRow(
+            id=w.id,
+            title=w.title,
+            status=normalize_workstream_status(w.status),
+            domain=repo["domain_slug"] if repo else (topic.domain_slug if topic else "unknown"),  # repo's domain first, then the topic's, then "unknown"
+            repo_label=repo["slug"] if repo else workplan.get("repo_slug", "unassigned"),
+            workplan_filename=workplan.get("filename"),
+            workplan_relative_path=workplan.get("relative_path"),
+            workplan_archived=bool(workplan.get("archived", False)),
+            health_labels=workplan.get("health_labels", []),
+            href=f"./workstreams/{w.id}",
+            done=counts.get("done", 0),
+            progress=counts.get("progress", 0),
+            wait=counts.get("wait", 0),
+            todo=counts.get("todo", 0),
+            total=counts.get("total", 0),
+            created_at=w.created_at,
+            updated_at=w.updated_at,
+        ))
+
+    return DashboardOverview(
+        generated_at=datetime.now(tz=timezone.utc),
+        totals=totals,
+        topics=[
+            TopicWithWorkstreams(
+                **TopicRead.model_validate(t).model_dump(),
+                workstreams=topic_workstreams.get(t.id, []),
+            )
+            for t in topics
+        ],
+        blocking_decisions=[DecisionRead.model_validate(d) for d in blocking],
+        waiting_tasks=[TaskRead.model_validate(t) for t in waiting],
+        blocked_tasks=[TaskRead.model_validate(t) for t in waiting],  # NOTE(review): built from the same wait-status rows as waiting_tasks — confirm this duplication is intended
+        recent_progress=[ProgressEventRead.model_validate(e) for e in recent],
+        next_steps=await _derive_next_steps(session),
+        contribution_counts=contribution_counts,
+        licence_risk_count=licence_risk_count,
+        open_capability_requests=open_cap_req_count,
+        sbom_snapshot_count=int(snapshot_count or 0),
+        sbom_package_total=int(package_total or 0),
+        registration_milestones=[ProgressEventRead.model_validate(e) for e in registration_milestones],
+        workplan_rows=workplan_rows,
+        sources=sources,
+        diagnostics={
+            "workplan_row_count": len(workplan_rows),
+            "task_count_strategy": "grouped",
+        },
+    )
+
+
 async def _build_domain_summaries(session: AsyncSession) -> list[DomainSummary]:
    """Compute per-domain stats for the state summary."""
    domains_rows = await session.execute(
--- a/api/routers/tasks.py
+++ b/api/routers/tasks.py
@@ -9,7 +9,7 @@ from api.database import get_session
 from api.models.task import Task, TaskStatus
 from api.models.token_event import TokenEvent
 from api.models.workstream import Workstream
-from api.schemas.task import TaskCreate, TaskRead, TaskUpdate
+from api.schemas.task import TaskCountRead, TaskCreate, TaskRead, TaskUpdate
 from api.services.lifecycle import status_value, transition_task_status
 from api.task_status import normalize_task_status

@@ -24,6 +24,8 @@ async def list_tasks(
    needs_human: bool | None = Query(None),
    priority: str | None = None,
    due_date_before: date | None = None,
+    limit: int | None = Query(None, ge=1, le=5000),
+    offset: int = Query(0, ge=0),
    session: AsyncSession = Depends(get_session),
 ) -> list[Task]:
    q = select(Task)
@@ -40,10 +42,32 @@ async def list_tasks(
    if due_date_before is not None:
        q = q.where(Task.due_date <= due_date_before)
    q = q.order_by(Task.created_at)
+    if offset:
+        q = q.offset(offset)
+    if limit is not None:
+        q = q.limit(limit)
    result = await session.execute(q)
    return list(result.scalars().all())


+@router.get("/counts", response_model=list[TaskCountRead])
+async def count_tasks(  # returns one row per (workstream_id, status) group with its task count
+    workstream_id: uuid.UUID | None = None,
+    status: str | None = None,  # shadows the module-level "status" name inside this function
+    session: AsyncSession = Depends(get_session),
+) -> list[TaskCountRead]:
+    q = select(Task.workstream_id, Task.status, func.count()).group_by(Task.workstream_id, Task.status)
+    if workstream_id:  # optional filter: restrict to a single workstream
+        q = q.where(Task.workstream_id == workstream_id)
+    if status:  # optional filter: restrict to a single (normalized) status
+        q = q.where(Task.status == TaskStatus(normalize_task_status(status)))  # NOTE(review): presumably raises ValueError for an unknown status, which would surface as a 500 — confirm
+    rows = await session.execute(q)
+    return [
+        TaskCountRead(workstream_id=ws_id, status=task_status, count=count)
+        for ws_id, task_status, count in rows
+    ]
+
+
@router.post("/", response_model=TaskRead, status_code=status.HTTP_201_CREATED)
 async def create_task(
    body: TaskCreate,
--- a/api/routers/workstreams.py
+++ b/api/routers/workstreams.py
@@ -3,6 +3,7 @@ import logging
 import uuid
 import socket
 import time
+from datetime import datetime, timezone
 from pathlib import Path
 from typing import Any

@@ -40,6 +41,8 @@ workplan_router = APIRouter(prefix="/workplans", tags=["workplans"])
 _INDEX_CACHE: dict[str, Any] | None = None
 _INDEX_CACHE_AT: float = 0.0
 _INDEX_TTL = 30.0
+_INDEX_REFRESH_TASK: asyncio.Task | None = None  # most recently started background refresh task, if any
+_INDEX_LAST_ERROR: str | None = None  # message of the last failed background refresh; reset to None on a successful build

 _LEGACY_OWNER = "state-hub.api"
 _COMPLETED_WORKSTREAM_EVENT = "org.statehub.workstream.completed"
@@ -170,16 +173,7 @@ async def _list_workstreams(
    return list(result.scalars().all())


-async def _workplan_index(
-    *,
-    refresh: bool,
-    session: AsyncSession,
-) -> dict[str, Any]:
-    """Map file-backed workplan ids to their local workplan filenames."""
-    global _INDEX_CACHE, _INDEX_CACHE_AT
-    if not refresh and _INDEX_CACHE is not None and (time.monotonic() - _INDEX_CACHE_AT) < _INDEX_TTL:
-        return _INDEX_CACHE
-
+async def _build_workplan_index(session: AsyncSession) -> dict[str, Any]:
    result = await session.execute(
        select(ManagedRepo).where(ManagedRepo.status == "active").order_by(ManagedRepo.slug)
    )
@@ -218,8 +212,78 @@ async def _workplan_index(
                    "needs_review": bool(review and review.needs_review),
                    "health_labels": ["needs_review"] if review and review.needs_review else [],
                }
-    _INDEX_CACHE = {"workplans": index, "workstreams": index}
+    return {"workplans": index, "workstreams": index}
+
+
+def _index_with_meta(*, stale: bool, refresh_in_progress: bool) -> dict[str, Any]:  # returns a copy of the cached index (or an empty one) with a freshly computed "_meta"
+    age = time.monotonic() - _INDEX_CACHE_AT if _INDEX_CACHE_AT else None  # None while _INDEX_CACHE_AT is still its initial 0.0
+    return {
+        **(_INDEX_CACHE or {"workplans": {}, "workstreams": {}}),  # shallow copy; the "_meta" key below replaces any cached "_meta"
+        "_meta": {
+            "generated_at": _INDEX_CACHE.get("_meta", {}).get("generated_at") if _INDEX_CACHE else None,  # keep the timestamp recorded when the cache was built
+            "stale": stale,
+            "cache_age_seconds": round(age, 3) if age is not None else None,
+            "refresh_in_progress": refresh_in_progress,
+            "last_error": _INDEX_LAST_ERROR,
+        },
+    }
+
+
+async def _refresh_workplan_index_background() -> None:  # rebuilds the index on its own session and swaps it into the module cache
+    global _INDEX_CACHE, _INDEX_CACHE_AT, _INDEX_LAST_ERROR
+    from api.database import async_session_factory  # NOTE(review): imported locally, presumably to avoid an import cycle — confirm
+
+    try:
+        async with async_session_factory() as session:  # dedicated session, independent of any request
+            index = await _build_workplan_index(session)
+        index["_meta"] = {
+            "generated_at": datetime.now(timezone.utc).isoformat(),
+            "stale": False,
+            "cache_age_seconds": 0.0,
+            "refresh_in_progress": False,
+            "last_error": None,
+        }
+        _INDEX_CACHE = index
+        _INDEX_CACHE_AT = time.monotonic()
+        _INDEX_LAST_ERROR = None
+    except Exception as exc:  # the previous cache stays in place; the failure is exposed through "_meta.last_error"
+        _INDEX_LAST_ERROR = str(exc)
+
+
+def _ensure_index_refresh_started() -> None:  # starts a background refresh unless one is already running
+    global _INDEX_REFRESH_TASK
+    if _INDEX_REFRESH_TASK is not None and not _INDEX_REFRESH_TASK.done():  # at most one refresh task in flight
+        return
+    _INDEX_REFRESH_TASK = asyncio.create_task(_refresh_workplan_index_background())  # the module-level reference keeps the task alive; create_task needs a running event loop
+
+
+async def _workplan_index(
+    *,
+    refresh: bool,  # True forces a synchronous rebuild on the caller's session
+    session: AsyncSession,
+) -> dict[str, Any]:
+    """Map file-backed workplan ids to their local workplan filenames."""
+    global _INDEX_CACHE, _INDEX_CACHE_AT, _INDEX_LAST_ERROR
+    cache_age = time.monotonic() - _INDEX_CACHE_AT if _INDEX_CACHE_AT else None
+    if not refresh and _INDEX_CACHE is not None and cache_age is not None and cache_age < _INDEX_TTL:  # fresh cache: answer without touching the database
+        refresh_running = _INDEX_REFRESH_TASK is not None and not _INDEX_REFRESH_TASK.done()
+        return _index_with_meta(stale=False, refresh_in_progress=refresh_running)
+
+    if not refresh and _INDEX_CACHE is not None:  # stale cache: serve it immediately and refresh in the background
+        _ensure_index_refresh_started()
+        return _index_with_meta(stale=True, refresh_in_progress=True)
+
+    index = await _build_workplan_index(session)  # no cache yet, or refresh=True: build inline
+    index["_meta"] = {
+        "generated_at": datetime.now(timezone.utc).isoformat(),
+        "stale": False,
+        "cache_age_seconds": 0.0,
+        "refresh_in_progress": False,
+        "last_error": None,
+    }
+    _INDEX_CACHE = index
    _INDEX_CACHE_AT = time.monotonic()
+    _INDEX_LAST_ERROR = None  # a successful inline build clears any earlier background failure
    return _INDEX_CACHE


--- a/api/schemas/state.py
+++ b/api/schemas/state.py
@@ -1,5 +1,6 @@
 import uuid
 from datetime import datetime
+from typing import Any

 from pydantic import BaseModel

@@ -84,3 +85,51 @@ class StateSummary(BaseModel):
    contribution_counts: dict[str, int] = {}
    licence_risk_count: int = 0
    open_capability_requests: int = 0
+
+
+class DashboardWorkplanRow(BaseModel):  # one chart-ready row per workstream in the dashboard overview
+    id: uuid.UUID
+    title: str
+    status: str  # filled with the normalized workstream status by the overview builder
+    domain: str = "unknown"  # domain slug of the repo, else of the topic
+    repo_label: str = "unassigned"  # managed-repo slug, else the workplan index's repo_slug
+    workplan_filename: str | None = None  # from the workplan index; None when the workstream has no index entry
+    workplan_relative_path: str | None = None
+    workplan_archived: bool = False
+    health_labels: list[str] = []  # e.g. ["needs_review"], as provided by the workplan index
+    href: str  # relative dashboard link of the form ./workstreams/<id>
+    done: int = 0  # task counts per status for this workstream
+    progress: int = 0
+    wait: int = 0
+    todo: int = 0
+    total: int = 0  # all tasks, including statuses not broken out above
+    created_at: datetime
+    updated_at: datetime
+
+
+class DashboardSourceMeta(BaseModel):  # health/freshness of one data source behind the overview
+    ok: bool = True  # False when the source reported an error
+    stale: bool = False  # True when the data was served from an expired cache
+    cache_age_seconds: float | None = None  # None when no cache age is available
+    refresh_in_progress: bool = False
+    error: str | None = None  # error message, when there is one
+
+
+class DashboardOverview(BaseModel):  # response model of GET /state/overview
+    generated_at: datetime  # when this model was built (UTC); a cached response keeps its original value
+    totals: Totals
+    topics: list[TopicWithWorkstreams]  # non-archived topics with their workstream stubs
+    blocking_decisions: list[DecisionRead]  # pending decisions that are open or escalated
+    waiting_tasks: list[TaskRead]  # tasks in wait status
+    blocked_tasks: list[TaskRead] = []  # NOTE(review): the overview builder fills this with the same rows as waiting_tasks — confirm
+    recent_progress: list[ProgressEventRead]  # newest 20 progress events
+    next_steps: list[NextStep] = []
+    contribution_counts: dict[str, int] = {}  # contribution type and status counts merged into one dict
+    licence_risk_count: int = 0  # direct non-dev SBOM entries whose licence matches a copyleft pattern
+    open_capability_requests: int = 0
+    sbom_snapshot_count: int = 0
+    sbom_package_total: int = 0  # sum of entry_count over all SBOM snapshots
+    registration_milestones: list[ProgressEventRead] = []  # at most 500, newest first
+    workplan_rows: list[DashboardWorkplanRow] = []
+    sources: dict[str, DashboardSourceMeta] = {}  # keyed by source name, e.g. "workplan_index"
+    diagnostics: dict[str, Any] = {}  # free-form debugging values such as the workplan row count
--- a/api/schemas/task.py
+++ b/api/schemas/task.py
@@ -93,3 +93,9 @@ class TaskRead(TaskStatusMixin):
    parent_task_id: uuid.UUID | None = None
    created_at: datetime
    updated_at: datetime
+
+
+class TaskCountRead(TaskStatusMixin):  # one row of GET /tasks/counts
+    workstream_id: uuid.UUID
+    status: TaskStatus
+    count: int  # number of tasks in this (workstream_id, status) group
--- a/dashboard/src/components/config.js
+++ b/dashboard/src/components/config.js
@@ -89,11 +89,23 @@ export async function waitForVisible(ms) {
 export async function apiFetch(path, options = {}) {
  const url = path.startsWith("http") ? path : `${API}${path}`;
  const timeout = options.timeout ?? FETCH_TIMEOUT;
-  const {timeout: _timeout, ...fetchOptions} = options;
+  const {timeout: _timeout, cache = "no-store", ...fetchOptions} = options;
  const ctrl = new AbortController();
-  const timer = setTimeout(() => ctrl.abort(), timeout);
+  let timedOut = false;
+  const timer = setTimeout(() => {
+    timedOut = true;
+    ctrl.abort();
+  }, timeout);
  try {
-    return await fetch(url, {cache: "no-store", ...fetchOptions, signal: ctrl.signal});
+    return await fetch(url, {cache, ...fetchOptions, signal: ctrl.signal});
+  } catch (error) {
+    if (timedOut || error?.name === "AbortError") {
+      const message = `Request timed out after ${Math.round(timeout / 1000)}s: ${url}`;
+      const timeoutError = new Error(message);
+      timeoutError.name = "TimeoutError";
+      throw timeoutError;
+    }
+    throw error;
  } finally {
    clearTimeout(timer);
  }
--- a/dashboard/src/docs/live-data.md
+++ b/dashboard/src/docs/live-data.md
@@ -10,7 +10,10 @@ All dashboard pages poll the State Hub API automatically. No manual refresh is e

 ## Poll interval

-Every page fetches fresh data from `http://127.0.0.1:8000` every **15 seconds** using an async generator loop. The previous data stays visible while the next request is in flight, so the UI never goes blank.
+Most live pages fetch fresh data from `http://127.0.0.1:8000` every **15 seconds**
+using an async generator loop. The overview page uses a heavier bounded read
+model and refreshes every **60 seconds**. The previous data stays visible while
+the next request is in flight, so the UI never goes blank.

 ---

@@ -21,6 +24,7 @@ The **●** dot in the top-right corner of each page shows the current connectio
 | Indicator | Meaning |
 |---|---|
 | **● Live · updated HH:MM:SS** | Last poll succeeded — data is current as of that time |
+| **● Stale · last successful update HH:MM:SS** | Last refresh failed, but cached page data is still visible |
 | **● Offline — run: `make api`** | API is unreachable — the dot turns red |

 The timestamp updates on every successful poll. If you see a time that is more than about two poll intervals in the past (~30 seconds on most pages, ~2 minutes on Overview), the poll is stalled (browser tab backgrounded or network issue) — reloading the page resets the loop.
@@ -48,7 +52,7 @@ make api      # db + migrate + uvicorn (restarts if already running)

 | Page | Endpoints |
 |---|---|
-| Overview | `/state/summary` |
+| Overview | `/state/overview`, `/decisions/?decision_type=pending` |
 | Workplans | `/workplans/`, `/topics/`, `/state/summary` |
 | Decisions | `/decisions/?limit=500`, `/topics/` |
 | Progress | `/progress/?limit=500` |
@@ -57,4 +61,4 @@ All endpoints are read-only GET requests. The dashboard never writes to the API.

 ---

-*Poll interval: 15 s. Data is refreshed in the background — the page never reloads itself.*
+*Poll interval: 15 s for most pages, 60 s for Overview. Data is refreshed in the background — the page never reloads itself.*
--- a/dashboard/src/docs/overview.md
+++ b/dashboard/src/docs/overview.md
@@ -82,9 +82,13 @@ and summary.

 ## Data source

-Polls `GET /state/summary` every **15 seconds**. The workstream chart also polls
-`GET /workplans/`, `GET /tasks/?limit=2000`, `GET /topics/`, `GET /repos/`,
-and `GET /workplans/index` for repository grouping, task counts, and
-workplan filename tooltips. Blocking decisions are fetched separately via
-`GET /decisions/?decision_type=pending` and only re-fetched after a successful
-resolve action — this prevents the inline form from being wiped on every poll.
+Polls `GET /state/overview` every **60 seconds**. This endpoint is a bounded
+dashboard read model: it returns summary totals, recent activity, registration
+milestones, SBOM totals, and chart-ready workplan rows with task counts already
+aggregated server-side.
+
+The page keeps the last successful overview response visible if a refresh times
+out, and marks the view stale instead of clearing the dashboard. Blocking
+decisions are fetched separately via `GET /decisions/?decision_type=pending`
+and only re-fetched after a successful resolve action — this prevents the inline
+form from being wiped on every poll.
--- a/dashboard/src/index.md
+++ b/dashboard/src/index.md
@@ -14,11 +14,15 @@ import {
 ```

 ```js
-// Single polling loop — fetches all data in one Promise.all batch, backs off uniformly.
+// Single polling loop — loads one bounded overview read model and keeps
+// last-known-good data visible if a refresh times out.
 const pageState = (async function*() {
  let failures = 0;
+  let lastGood = null;
  while (true) {
-    let summary = {}, snapshots = [], totalPkgs = 0, milestones = [], wsAll = [], ok = false;
+    let nextState = lastGood
+      ? {...lastGood, ok: false, stale: true, error: null}
+      : {summary: {}, snapshots: [], snapshotCount: 0, totalPkgs: 0, milestones: [], wsAll: [], ok: false, stale: false, error: null, sources: {}, ts: new Date()};
    try {
      const loadJson = async (name, path, options = {}) => {
        const response = await apiFetch(path, options);
@@ -26,67 +30,71 @@ const pageState = (async function*() {
        return response.json();
      };

-      const [
-        summaryData,
-        snapList,
-        allEvents,
-        wsList,
-        taskList,
-        topicList,
-        repoList,
-        workplanIndex,
-      ] = await Promise.all([
-        loadJson("summary", "/state/summary", {timeout: 20_000}),
-        loadJson("sbom snapshots", "/sbom/snapshots/"),
-        loadJson("milestones", "/progress/?event_type=milestone&limit=500"),
-        loadJson("workplans", "/workplans/"),
-        loadJson("tasks", "/tasks/?limit=2000"),
-        loadJson("topics", "/topics/"),
-        loadJson("repos", "/repos/"),
-        loadJson("workplan index", "/workplans/index").catch(() => ({workplans: {}, workstreams: {}})),
-      ]);
+      const overview = await loadJson("overview", "/state/overview", {timeout: 20_000, cache: "reload"});

-      ok = true;
-      summary   = summaryData;
-      snapshots = snapList;
-      totalPkgs = snapshots.reduce((s, sn) => s + (sn.entry_count ?? 0), 0);
-      milestones = allEvents.filter(e => e.summary?.startsWith("Project registered with State Hub:"));
-      const workplanMap = workplanIndex.workstreams ?? {};
-      const topicMap = Object.fromEntries(topicList.map(t => [t.id, t]));
-      const repoMap  = Object.fromEntries(repoList.map(r => [r.id, r]));
-      const counts = {};
-      for (const t of taskList) {
-        const wid = t.workstream_id;
-        if (!counts[wid]) counts[wid] = {done: 0, progress: 0, wait: 0, todo: 0, total: 0};
-        counts[wid].total++;
-        if      (t.status === "done")     counts[wid].done++;
-        else if (t.status === "progress") counts[wid].progress++;
-        else if (t.status === "wait")     counts[wid].wait++;
-        else if (t.status === "todo")     counts[wid].todo++;
-      }
-      wsAll = wsList.map(w => {
-        const repo = repoMap[w.repo_id];
-        const topic = topicMap[w.topic_id];
-        const workplan = workplanMap[w.id] ?? {};
-        return {
+      const summaryData = {
+        generated_at: overview.generated_at,
+        totals: overview.totals ?? {},
+        topics: overview.topics ?? [],
+        blocking_decisions: overview.blocking_decisions ?? [],
+        waiting_tasks: overview.waiting_tasks ?? [],
+        blocked_tasks: overview.blocked_tasks ?? overview.waiting_tasks ?? [],
+        recent_progress: overview.recent_progress ?? [],
+        next_steps: overview.next_steps ?? [],
+        contribution_counts: overview.contribution_counts ?? {},
+        licence_risk_count: overview.licence_risk_count ?? 0,
+        open_capability_requests: overview.open_capability_requests ?? 0,
+      };
+
+      nextState = {
+        summary: summaryData,
+        snapshots: [],
+        snapshotCount: overview.sbom_snapshot_count ?? 0,
+        totalPkgs: overview.sbom_package_total ?? 0,
+        milestones: overview.registration_milestones ?? [],
+        wsAll: (overview.workplan_rows ?? []).map(w => ({
          ...w,
          status: normalizeWorkstreamStatus(w.status),
-          domain: repo?.domain_slug ?? topic?.domain_slug ?? "unknown",
-          repo_label: repo?.slug ?? workplan.repo_slug ?? "unassigned",
-          workplan_filename: workplan.filename ?? null,
-          workplan_relative_path: workplan.relative_path ?? null,
-          workplan_archived: workplan.archived ?? false,
-          health_labels: workplan.health_labels ?? [],
-          href: `./workstreams/${w.id}`,
-          ...(counts[w.id] ?? {done: 0, progress: 0, wait: 0, todo: 0, total: 0}),
-        };
-      });
+        })),
+        ok: true,
+        stale: false,
+        error: null,
+        sources: overview.sources ?? {},
+        ts: new Date(),
+      };
+      lastGood = nextState;
    } catch (e) {
-      summary = {error: `Dashboard data load failed: ${e?.message ?? String(e)}`};
+      const message = `Dashboard refresh failed: ${e?.message ?? String(e)}`;
+      if (lastGood) {
+        nextState = {
+          ...lastGood,
+          ok: false,
+          stale: true,
+          error: `${message}; showing last successful data from ${lastGood.ts?.toLocaleTimeString?.() ?? "previous refresh"}`,
+          summary: {
+            ...(lastGood.summary ?? {}),
+            error: `${message}; showing last successful data from ${lastGood.ts?.toLocaleTimeString?.() ?? "previous refresh"}`,
+          },
+        };
+      } else {
+        nextState = {
+          summary: {error: message},
+          snapshots: [],
+          snapshotCount: 0,
+          totalPkgs: 0,
+          milestones: [],
+          wsAll: [],
+          ok: false,
+          stale: false,
+          error: message,
+          sources: {},
+          ts: new Date(),
+        };
+      }
    }
-    failures = ok ? 0 : failures + 1;
-    yield {summary, snapshots, totalPkgs, milestones, wsAll, ok, ts: new Date()};
-    await waitForVisible(pollDelay({ok, base: POLL_HEAVY, failures}));
+    failures = nextState.ok ? 0 : failures + 1;
+    yield nextState;
+    await waitForVisible(pollDelay({ok: nextState.ok, base: POLL_HEAVY, failures}));
  }
 })();
 ```
@@ -94,6 +102,7 @@ const pageState = (async function*() {
 ```js
 const summary   = pageState.summary   ?? {};
 const _ok       = pageState.ok        ?? false;
+const _stale    = pageState.stale     ?? false;
 const _ts       = pageState.ts;
 const totals    = summary.totals      ?? {};
 const ws        = totals.workstreams  ?? {};
@@ -107,7 +116,7 @@ const wsAll     = pageState.wsAll     ?? [];
 // Kept separate from the main poll so in-progress form inputs aren't wiped every 60 s.
 const blockingDecisions = Mutable([]);
 const refreshDecisions = async () => {
-  const r = await fetch(`${API}/decisions/?decision_type=pending`).catch(() => null);
+  const r = await apiFetch("/decisions/?decision_type=pending", {timeout: 12_000}).catch(() => null);
  const all = r?.ok ? await r.json() : [];
  blockingDecisions.value = all.filter(d => ["open", "escalated"].includes(d.status));
 };
@@ -121,9 +130,11 @@ import {injectTocTop} from "./components/toc-sidebar.js";
 import {withDocHelp}   from "./components/doc-overlay.js";

 const _liveEl = html`<div class="live-indicator">
-  <span style="color:${_ok ? 'var(--theme-foreground-focus)' : 'red'}">●</span>
+  <span style="color:${_ok ? 'var(--theme-foreground-focus)' : _stale ? 'orange' : 'red'}">●</span>
  ${_ok
    ? `Live · updated ${_ts?.toLocaleTimeString()}`
+    : _stale
+    ? `Stale · last successful update ${_ts?.toLocaleTimeString()}`
    : html`<span style="color:red">Offline — run: <code>cd ~/state-hub && make api</code></span>`}
 </div>`;
 withDocHelp(_liveEl, "/docs/live-data");
@@ -346,6 +357,7 @@ const licenceRisk   = summary.licence_risk_count   ?? 0;
 const totalContribs = ["br","fr","ep","upr"].reduce((s, t) => s + (contribCounts[t] ?? 0), 0);
 const needsFollowUp = (contribCounts["submitted"] ?? 0) + (contribCounts["acknowledged"] ?? 0);
 const sbomSnaps     = pageState.snapshots ?? [];
+const sbomSnapCount = pageState.snapshotCount ?? sbomSnaps.length;
 const totalPkgs     = pageState.totalPkgs ?? 0;

 display(html`<div class="grid grid-cols-3" style="gap:1rem;margin-bottom:1.5rem">
@@ -362,7 +374,7 @@ display(html`<div class="grid grid-cols-3" style="gap:1rem;margin-bottom:1.5rem"
  <a class="card card-link ${licenceRisk > 0 ? 'warn' : ''}" href="./sbom">
    <h3>SBOM</h3>
    <p class="big-num">${totalPkgs.toLocaleString()}</p>
-    <small>${sbomSnaps.length} repo${sbomSnaps.length !== 1 ? "s" : ""} tracked · ${licenceRisk > 0 ? html`<span style="color:red">${licenceRisk} copyleft risks</span>` : html`<span style="color:green">✓ no copyleft</span>`}</small>
+    <small>${sbomSnapCount} snapshot${sbomSnapCount !== 1 ? "s" : ""} tracked · ${licenceRisk > 0 ? html`<span style="color:red">${licenceRisk} copyleft risks</span>` : html`<span style="color:green">✓ no copyleft</span>`}</small>
  </a>
 </div>`);
 ```
--- a/scripts/smoke_dashboard_load.sh
+++ b/scripts/smoke_dashboard_load.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+API_PORT="${API_PORT:-8012}"
+DASHBOARD_PORT="${DASHBOARD_PORT:-3012}"
+API_BASE="http://127.0.0.1:${API_PORT}"
+DASHBOARD_URL="http://127.0.0.1:${DASHBOARD_PORT}/?api_base=${API_BASE}"
+
+API_LOG="${API_LOG:-/tmp/statehub-api-${API_PORT}.log}"
+DASHBOARD_LOG="${DASHBOARD_LOG:-/tmp/statehub-dashboard-${DASHBOARD_PORT}.log}"
+OVERVIEW_JSON="${OVERVIEW_JSON:-/tmp/statehub-overview-${API_PORT}.json}"
+OVERVIEW_HEADERS="${OVERVIEW_HEADERS:-/tmp/statehub-overview-${API_PORT}.headers}"
+DASHBOARD_HTML="${DASHBOARD_HTML:-/tmp/statehub-dashboard-${DASHBOARD_PORT}.html}"
+
+rm -f "$API_LOG" "$DASHBOARD_LOG" "$OVERVIEW_JSON" "$OVERVIEW_HEADERS" "$DASHBOARD_HTML"
+
+.venv/bin/python -m uvicorn api.main:app --host 127.0.0.1 --port "$API_PORT" \
+  > "$API_LOG" 2>&1 &
+api_pid=$!
+
+(cd dashboard && exec npm run dev -- --host 127.0.0.1 --port "$DASHBOARD_PORT" \
+  > "$DASHBOARD_LOG" 2>&1) &
+dashboard_pid=$!  # exec above makes this npm's own PID (not a wrapper subshell's), so cleanup signals npm directly instead of orphaning it
+
+cleanup() {
+  kill "$api_pid" "$dashboard_pid" 2>/dev/null || true
+}
+trap cleanup EXIT
+
+wait_for_url() {  # usage: wait_for_url LABEL URL OUTPUT_FILE [ATTEMPTS=40]; returns 0 once curl succeeds, 1 if it never does
+  local label="$1"
+  local url="$2"
+  local output="$3"
+  local attempts="${4:-40}"
+  local i
+  for i in $(seq 1 "$attempts"); do
+    if curl -fsS --max-time 20 "$url" -o "$output" >/dev/null 2>&1; then  # cap each probe (same 20 s budget the dashboard gives the overview) so a hung server cannot stall the loop
+      return 0
+    fi
+    sleep 1
+  done
+  echo "$label did not become ready: $url" >&2
+  return 1
+}
+
+if ! wait_for_url "API overview" "${API_BASE}/state/overview" "$OVERVIEW_JSON"; then
+  echo "API log:" >&2
+  tail -80 "$API_LOG" >&2 || true
+  exit 1
+fi
+
+if ! wait_for_url "Dashboard" "$DASHBOARD_URL" "$DASHBOARD_HTML"; then
+  echo "Dashboard log:" >&2
+  tail -80 "$DASHBOARD_LOG" >&2 || true
+  exit 1
+fi
+
+curl -sS -D "$OVERVIEW_HEADERS" -o "$OVERVIEW_JSON" \
+  -w "overview %{http_code} %{time_total} %{size_download}\n" \
+  "${API_BASE}/state/overview"
+printf "dashboard 200 %s\n" "$DASHBOARD_URL"
+wc -c "$OVERVIEW_JSON"
+grep -i "x-statehub" "$OVERVIEW_HEADERS" || true
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -56,8 +56,12 @@ def _truncate(_schema):
    # Reset in-process TTL caches so stale data from a previous test can't bleed through.
    _state_router._SUMMARY_CACHE = None
    _state_router._SUMMARY_CACHE_AT = 0.0
+    _state_router._OVERVIEW_CACHE = None
+    _state_router._OVERVIEW_CACHE_AT = 0.0
    _ws_router._INDEX_CACHE = None
    _ws_router._INDEX_CACHE_AT = 0.0
+    _ws_router._INDEX_REFRESH_TASK = None
+    _ws_router._INDEX_LAST_ERROR = None

    yield
    engine = sqlalchemy.create_engine(_SYNC_URL)
--- a/tests/test_routers_core.py
+++ b/tests/test_routers_core.py
@@ -226,6 +226,34 @@ class TestTasks:
        assert "High prio" in titles
        assert "Low prio" not in titles

+    async def test_list_pagination_and_counts(self, client):  # GET /tasks/ honours limit/offset; GET /tasks/counts reports per-workstream, per-status counts
+        await _create_domain(client)
+        topic = await _create_topic(client)
+        ws = await _create_workstream(client, topic["id"])
+        first = await _create_task(client, ws["id"], title="First")
+        second = await _create_task(client, ws["id"], title="Second")
+        third = await _create_task(client, ws["id"], title="Third")
+        await client.patch(f"/tasks/{second['id']}", json={"status": "progress"})
+        await client.patch(f"/tasks/{third['id']}", json={"status": "wait", "blocking_reason": "blocked"})  # "first" is left untouched; the todo == 1 assertion below assumes new tasks start as "todo"
+
+        r = await client.get("/tasks/?limit=2")
+        assert r.status_code == 200
+        body = r.json()
+        assert len(body) == 2
+        assert body[0]["id"] == first["id"]  # assumes the default ordering returns tasks in creation order — TODO confirm against the tasks router
+        assert body[1]["id"] == second["id"]
+
+        r = await client.get("/tasks/?limit=1&offset=2")  # skip the first two rows, take one
+        assert r.status_code == 200
+        assert [task["id"] for task in r.json()] == [third["id"]]
+
+        r = await client.get(f"/tasks/counts?workstream_id={ws['id']}")
+        assert r.status_code == 200
+        counts = {(row["workstream_id"], row["status"]): row["count"] for row in r.json()}  # index rows by (workstream_id, status) for direct lookup
+        assert counts[(ws["id"], "todo")] == 1
+        assert counts[(ws["id"], "progress")] == 1
+        assert counts[(ws["id"], "wait")] == 1
+
    @pytest.mark.parametrize("initial_status", ["proposed", "ready", "backlog"])
    async def test_task_start_activates_planning_workstream(self, client, initial_status):
        await _create_domain(client)
@@ -358,6 +386,34 @@ class TestStateSummary:
        assert summaries[blocked_ws["id"]]["blocked_reasons"][0]["id"] == "dependencies.all_complete"
        assert body["totals"]["workstreams"]["blocked"] == 1

+    async def test_overview_returns_chart_ready_rows(self, client):  # GET /state/overview returns per-workstream rows with labels and task counts, and reports miss/hit via x-statehub-cache
+        await _create_domain(client)
+        topic = await _create_topic(client)
+        repo = await _create_repo(client)
+        ws = await _create_workstream(client, topic["id"], repo_id=repo["id"])
+        first = await _create_task(client, ws["id"], title="Todo")  # left in its initial status; counted under "todo" below
+        second = await _create_task(client, ws["id"], title="Done")
+        await client.patch(f"/tasks/{second['id']}", json={"status": "done", "suppress_token_event": True})
+
+        r = await client.get("/state/overview")
+        assert r.status_code == 200
+        assert r.headers["x-statehub-cache"] == "miss"  # a cold cache is expected on the first request of the test
+        body = r.json()
+
+        rows = {row["id"]: row for row in body["workplan_rows"]}  # index chart rows by workstream id
+        assert ws["id"] in rows
+        assert rows[ws["id"]]["repo_label"] == "test-repo"  # presumably the _create_repo helper's default slug — verify against the helper
+        assert rows[ws["id"]]["domain"] == "testdomain"  # presumably the _create_domain helper's default slug — verify against the helper
+        assert rows[ws["id"]]["todo"] == 1
+        assert rows[ws["id"]]["done"] == 1
+        assert rows[ws["id"]]["total"] == 2
+        assert body["totals"]["tasks"]["total"] == 2
+        assert body["diagnostics"]["task_count_strategy"] == "grouped"
+
+        r = await client.get("/state/overview")  # identical second request, no writes in between
+        assert r.status_code == 200
+        assert r.headers["x-statehub-cache"] == "hit"
+

 class TestFlowEndpoints:
    async def test_list_flow_definitions(self, client):
--- a/workplans/STATE-WP-0056-dashboard-loading-robustness.md
+++ b/workplans/STATE-WP-0056-dashboard-loading-robustness.md
@@ -0,0 +1,276 @@
+---
+id: STATE-WP-0056
+type: workplan
+title: "Dashboard Loading Robustness and Efficiency"
+domain: custodian
+repo: state-hub
+status: finished
+owner: codex
+topic_slug: custodian
+created: "2026-06-05"
+updated: "2026-06-05"
+state_hub_workstream_id: "28f9569c-937b-4b79-b46c-f6b1f83c09c3"
+---
+
+# Dashboard Loading Robustness and Efficiency
+
+## Summary
+
+Make the State Hub dashboard overview page faster and more resilient under
+normal polling. The current overview performs a broad concurrent fan-out of
+full-list API calls and treats most request failures as whole-page failures.
+This can surface frequent `Dashboard data load failed: The operation was
+aborted.` warnings when one call crosses the frontend timeout, even if the API
+eventually returns successfully.
+
+This work should reduce request count, payload size, and backend contention;
+preserve useful last-known data during partial failures; and give operators
+clearer diagnostics when a section is stale or unavailable.
+
+## Current Findings
+
+Inspection on 2026-06-05 found:
+
+- `dashboard/src/index.md` loads overview data with one eight-request
+  `Promise.all` batch.
+- `dashboard/src/components/config.js` aborts most `apiFetch` calls after
+  `12_000` ms.
+- A dashboard-style concurrent timing run produced several calls at or above the
+  default timeout: `/sbom/snapshots/`, `/repos/`, and `/workplans/index`.
+- The same endpoints can be much faster when called alone, which points to
+  contention and over-fetching rather than one permanently slow endpoint.
+- The overview calls `/tasks/?limit=2000`, but the tasks API currently ignores
+  `limit` and returns every task. In the observed run that response was roughly
+  2.1 MB just to compute per-workplan task counts.
+- `/state/summary` has a short in-process cache, but a cache miss still runs a
+  large amount of sequential database and Python-side aggregation work.
+- `/workplans/index` scans active repository workplan files and parses
+  frontmatter. It is cached, but concurrent dashboard loads can still wait on
+  the same expensive rebuild pattern.
+- Several API routes set cache headers, but the shared dashboard fetch helper
+  forces `cache: "no-store"` for every request.
+
+## Out of Scope
+
+- Replacing Observable Framework.
+- Redesigning the dashboard information architecture.
+- Adding authentication, authorization, or multi-user session handling.
+- Changing workplan file conventions.
+- Moving State Hub to a different database or deployment substrate.
+
+## T01 — Add Focused Dashboard Load Instrumentation
+
+```task
+id: STATE-WP-0056-T01
+status: done
+priority: high
+state_hub_task_id: "e5208053-0db1-4842-a221-c5289422677a"
+```
+
+Add enough timing and error visibility to confirm which overview calls are slow,
+aborted, or oversized during normal use.
+
+Implementation notes:
+
+- Add lightweight server-side timing logs or response headers for overview-hot
+  endpoints: `/state/summary`, `/workplans/`, `/tasks/`, `/topics/`, `/repos/`,
+  `/sbom/snapshots/`, `/progress/`, and `/workplans/index`.
+- Include request path, status, elapsed time, response size when practical, and
+  whether a cached result was used.
+- Keep instrumentation local and low-noise; avoid logging full payloads or
+  secrets.
+- Add a small dashboard diagnostic surface or console logging that distinguishes
+  timeout aborts from HTTP errors and network failures.
+- Capture before/after timing notes in this workplan or a progress event.
+
+Done when a normal dashboard refresh can be diagnosed without manually timing
+each endpoint from a shell.
+
+## T02 — Make Overview Polling Partially Resilient
+
+```task
+id: STATE-WP-0056-T02
+status: done
+priority: high
+state_hub_task_id: "2cdd960d-ba86-48d1-a7c6-e83671cd0e69"
+```
+
+Change the overview data loader so one slow or failed secondary request does
+not mark the whole dashboard as failed.
+
+Implementation notes:
+
+- Replace fail-fast `Promise.all` behavior in `dashboard/src/index.md` with a
+  per-resource result model, for example `Promise.allSettled`.
+- Keep last-known-good data for each section while a refresh is degraded.
+- Treat optional resources such as SBOM snapshots, registration milestones, and
+  workplan file metadata independently from core summary/workplan status data.
+- Display section-level stale/error indicators instead of one global warning
+  whenever possible.
+- Keep exponential backoff for repeated failures, but do not discard usable
+  data just because one request timed out.
+- Make abort errors user-readable, for example "timed out after 12s" instead of
+  only "The operation was aborted."
+
+Done when an SBOM, repo-list, or workplan-index timeout leaves the rest of the
+overview usable and visibly stale rather than failed.
+
+## T03 — Respect Pagination and Add Task Count Aggregates
+
+```task
+id: STATE-WP-0056-T03
+status: done
+priority: high
+state_hub_task_id: "78484226-9ccc-460c-a2b3-750b3204caa3"
+```
+
+Stop returning all tasks for overview count calculations.
+
+Implementation notes:
+
+- Add `limit` and `offset` support to `GET /tasks/`, preserving existing filter
+  behavior and sensible limits.
+- Add a lightweight aggregate endpoint for task counts by workplan and status,
+  for example `GET /tasks/counts?group_by=workstream,status`, or add an
+  overview-specific aggregate route.
+- Prefer SQL `GROUP BY` over transferring every task to the browser.
+- Update `dashboard/src/index.md`, `dashboard/src/tasks.md`,
+  `dashboard/src/interventions.md`, and workplan detail pages as needed so list
+  views still receive the rows they need.
+- Add tests for pagination compatibility and aggregate counts.
+
+Done when the overview no longer fetches the full task table to draw the
+workplan chart.
+
+## T04 — Build a Lightweight Overview Read Endpoint
+
+```task
+id: STATE-WP-0056-T04
+status: done
+priority: high
+state_hub_task_id: "2cf47a12-e8aa-49ca-963c-1f0d2933c344"
+```
+
+Create a dashboard-specific read model that returns exactly the data needed by
+the overview page in one bounded response.
+
+Implementation notes:
+
+- Add an endpoint such as `GET /state/overview` or
+  `GET /state/dashboard-overview`.
+- Include summary totals, recent progress needed by the page, blocking decision
+  counts, waiting-task counts, SBOM snapshot totals, registration milestones,
+  and workplan chart rows with repo/domain labels and task counts.
+- Keep response fields stable and documented in dashboard reference docs.
+- Reuse existing summary helpers where they are efficient, but avoid serializing
+  large full-list payloads that the overview does not display directly.
+- Add cache headers and a short in-process cache with explicit invalidation
+  rules where appropriate.
+- Update `dashboard/src/index.md` to prefer this endpoint and remove redundant
+  overview-only fetches.
+
+Done when the overview's steady-state refresh is one bounded API call plus only
+truly interactive secondary calls.
+
+## T05 — Add Stale-While-Refresh for File-Backed Workplan Index
+
+```task
+id: STATE-WP-0056-T05
+status: done
+priority: medium
+state_hub_task_id: "0c88c1a2-588b-41f8-bc1c-f94c8b4b0d1a"
+```
+
+Make `/workplans/index` resilient when repository filesystem scans are slow.
+
+Implementation notes:
+
+- Add singleflight behavior so concurrent requests share one in-progress
+  rebuild instead of starting or waiting on redundant scans.
+- Return stale cached data quickly while a background refresh runs when the
+  cache is expired but still available.
+- Include metadata such as `generated_at`, `stale`, `cache_age_seconds`, and
+  optionally `refresh_in_progress`.
+- Consider reading only frontmatter rather than whole markdown files if this
+  can be done cleanly.
+- Keep `refresh=true` as an explicit operator escape hatch.
+- Add tests for cache hit, stale return, and forced refresh behavior.
+
+Done when a slow filesystem scan cannot block normal dashboard refreshes for
+longer than the frontend timeout if cached data exists.
+
+## T06 — Use Browser and HTTP Caching Selectively
+
+```task
+id: STATE-WP-0056-T06
+status: done
+priority: medium
+state_hub_task_id: "811f02ff-2e92-4c82-8b8a-e3d39a450b02"
+```
+
+Let stable lookup requests benefit from cache headers instead of forcing every
+dashboard request to bypass caches.
+
+Implementation notes:
+
+- Extend `apiFetch` so callers can choose cache mode.
+- Keep `no-store` for volatile mutation-sensitive resources.
+- Use default browser caching or `reload` only where route cache headers are
+  already intentional, such as repo/topic lookup data.
+- Review current route cache headers and align them with dashboard polling
+  needs.
+- Avoid stale cached data for controls that immediately follow a mutation.
+
+Done when stable overview lookup data no longer bypasses useful cache headers
+by default.
+
+## T07 — Optimize `/state/summary` Cache Misses
+
+```task
+id: STATE-WP-0056-T07
+status: done
+priority: medium
+state_hub_task_id: "633f4cc6-ffeb-4086-9858-d239f50a9686"
+```
+
+Reduce the cost of a cold or expired `/state/summary` request.
+
+Implementation notes:
+
+- Profile the current sequential query groups in `api/routers/state.py`.
+- Move Python-side counts and scans into SQL where straightforward.
+- Remove unused work from the summary path, such as dead intermediate query
+  results.
+- Cache derived sections independently when their freshness requirements differ.
+- Add indexes only after profiling shows a query plan needs them.
+- Keep summary response compatibility for existing consumers and MCP smoke
+  tests.
+
+Done when a summary cache miss stays comfortably below the frontend timeout
+under the current local data volume.
+
+## T08 — Verify Under Dashboard-Style Load
+
+```task
+id: STATE-WP-0056-T08
+status: done
+priority: high
+state_hub_task_id: "353fb25a-5306-416b-8d6d-9b201e6fac87"
+```
+
+Prove the dashboard no longer produces frequent abort warnings under realistic
+refresh behavior.
+
+Implementation notes:
+
+- Add or document a repeatable script that performs dashboard-style concurrent
+  endpoint timing before and after the changes.
+- Run API tests and dashboard component tests.
+- Open the dashboard locally and verify that initial load, refresh, hidden-tab
+  pause/resume, and partial API failure states behave correctly.
+- Confirm payload sizes are lower than the baseline for the overview page.
+- Update `dashboard/src/docs/overview.md` and `dashboard/src/docs/live-data.md`
+  with the new data-loading model.
+
+Done when repeated dashboard refreshes do not show the global aborted-operation
+warning during normal local operation, and degraded sections recover cleanly.