diff --git a/api/main.py b/api/main.py index 552e68a..d693183 100644 --- a/api/main.py +++ b/api/main.py @@ -1,5 +1,6 @@ import hashlib import os +import time from contextlib import asynccontextmanager from fastapi import FastAPI @@ -26,26 +27,37 @@ class ETagMiddleware(BaseHTTPMiddleware): """Add ETag + conditional-GET (304) support to all JSON GET responses.""" async def dispatch(self, request: Request, call_next): + started = time.perf_counter() response = await call_next(request) if request.method != "GET": + response.headers["X-StateHub-Elapsed-Ms"] = f"{(time.perf_counter() - started) * 1000:.1f}" return response if "application/json" not in response.headers.get("content-type", ""): + response.headers["X-StateHub-Elapsed-Ms"] = f"{(time.perf_counter() - started) * 1000:.1f}" return response body_parts = [] async for chunk in response.body_iterator: body_parts.append(chunk) body = b"".join(body_parts) + elapsed_ms = f"{(time.perf_counter() - started) * 1000:.1f}" etag = '"' + hashlib.md5(body, usedforsecurity=False).hexdigest() + '"' if request.headers.get("if-none-match") == etag: return StarletteResponse( status_code=304, - headers={"ETag": etag, "Cache-Control": "no-cache"}, + headers={ + "ETag": etag, + "Cache-Control": "no-cache", + "X-StateHub-Elapsed-Ms": elapsed_ms, + "X-StateHub-Response-Bytes": "0", + }, ) headers = {k: v for k, v in response.headers.items() if k.lower() != "content-length"} headers["ETag"] = etag + headers["X-StateHub-Elapsed-Ms"] = elapsed_ms + headers["X-StateHub-Response-Bytes"] = str(len(body)) if not any(k.lower() == "cache-control" for k in headers): headers["Cache-Control"] = "no-cache" return StarletteResponse( @@ -84,7 +96,7 @@ app.add_middleware( allow_origins=_cors_origins, allow_methods=["GET", "POST", "PATCH", "DELETE", "PUT"], allow_headers=["Content-Type", "If-None-Match"], - expose_headers=["ETag"], + expose_headers=["ETag", "X-StateHub-Elapsed-Ms", "X-StateHub-Response-Bytes", "X-StateHub-Cache"], ) 
app.include_router(domains.router) diff --git a/api/routers/state.py b/api/routers/state.py index a214e87..6d2a1c1 100644 --- a/api/routers/state.py +++ b/api/routers/state.py @@ -1,7 +1,7 @@ import time from datetime import datetime, timedelta, timezone -from fastapi import APIRouter, Depends, Request +from fastapi import APIRouter, Depends, Request, Response from fastapi.responses import JSONResponse from sqlalchemy import func, select, text from sqlalchemy.ext.asyncio import AsyncSession @@ -17,6 +17,7 @@ from api.models.extension_point import ExtensionPoint from api.models.managed_repo import ManagedRepo from api.models.progress_event import ProgressEvent from api.models.sbom_entry import SBOMEntry +from api.models.sbom_snapshot import SBOMSnapshot from api.models.task import Task, TaskPriority, TaskStatus from api.models.technical_debt import TechnicalDebt from api.models.topic import Topic, TopicStatus @@ -26,6 +27,9 @@ from api.schemas.decision import DecisionRead from api.schemas.domain import DomainSummary from api.schemas.progress_event import ProgressEventRead from api.schemas.state import ( + DashboardOverview, + DashboardSourceMeta, + DashboardWorkplanRow, DecisionTotals, NextStep, StateSummary, @@ -38,6 +42,7 @@ from api.schemas.task import TaskRead from api.schemas.topic import TopicRead, TopicWithWorkstreams from api.schemas.workstream import WorkstreamRead, WorkstreamWithTaskCounts, WorkstreamWithDeps from api.schemas.workstream_dependency import WorkstreamDepStub +from api.routers.workstreams import _workplan_index from api.task_status import TERMINAL_TASK_STATUSES, status_value from api.workplan_status import ( CLOSED_WORKSTREAM_STATUSES, @@ -51,17 +56,25 @@ router = APIRouter(prefix="/state", tags=["state"]) _SUMMARY_CACHE: StateSummary | None = None _SUMMARY_CACHE_AT: float = 0.0 _SUMMARY_TTL = 15.0 +_OVERVIEW_CACHE: DashboardOverview | None = None +_OVERVIEW_CACHE_AT: float = 0.0 +_OVERVIEW_TTL = 10.0 @router.get("/summary", 
response_model=StateSummary) async def get_summary( request: Request, + response: Response, session: AsyncSession = Depends(get_session), ) -> StateSummary: global _SUMMARY_CACHE, _SUMMARY_CACHE_AT no_cache = "no-cache" in request.headers.get("cache-control", "") if not no_cache and _SUMMARY_CACHE is not None and (time.monotonic() - _SUMMARY_CACHE_AT) < _SUMMARY_TTL: + response.headers["X-StateHub-Cache"] = "hit" + response.headers["Cache-Control"] = "max-age=15, stale-while-revalidate=30" return _SUMMARY_CACHE + response.headers["X-StateHub-Cache"] = "miss" + response.headers["Cache-Control"] = "max-age=15, stale-while-revalidate=30" # Run all queries sequentially on one session. # AsyncSession does not support concurrent operations (no gather on same session). @@ -362,6 +375,309 @@ async def get_summary( return result +@router.get("/overview", response_model=DashboardOverview) +async def get_overview( + request: Request, + response: Response, + session: AsyncSession = Depends(get_session), +) -> DashboardOverview: + """Bounded dashboard overview read model. + + This is intentionally narrower than /state/summary. The dashboard overview + needs counts, recent rows, and chart-ready workplan rows; it does not need + full task or workplan lists transferred to the browser on every poll. 
+ """ + global _OVERVIEW_CACHE, _OVERVIEW_CACHE_AT + no_cache = "no-cache" in request.headers.get("cache-control", "") + if not no_cache and _OVERVIEW_CACHE is not None and (time.monotonic() - _OVERVIEW_CACHE_AT) < _OVERVIEW_TTL: + response.headers["X-StateHub-Cache"] = "hit" + response.headers["Cache-Control"] = "max-age=10, stale-while-revalidate=30" + return _OVERVIEW_CACHE + + response.headers["X-StateHub-Cache"] = "miss" + response.headers["Cache-Control"] = "max-age=10, stale-while-revalidate=30" + result = await _build_dashboard_overview(session) + _OVERVIEW_CACHE = result + _OVERVIEW_CACHE_AT = time.monotonic() + return result + + +async def _build_dashboard_overview(session: AsyncSession) -> DashboardOverview: + topics_rows = await session.execute( + select(Topic) + .options( + selectinload(Topic.domain), + noload(Topic.workstreams), + noload(Topic.decisions), + noload(Topic.progress_events), + ) + .where(Topic.status != TopicStatus.archived) + .order_by(Topic.created_at) + ) + topics = list(topics_rows.scalars().all()) + topic_map = {topic.id: topic for topic in topics} + + workstream_rows = await session.execute( + select(Workstream) + .options(noload("*")) + .order_by( + Workstream.planning_priority.asc().nullslast(), + Workstream.planning_order.asc().nullslast(), + Workstream.updated_at.desc(), + ) + ) + workstreams_all = list(workstream_rows.scalars().all()) + + topic_workstreams: dict = {t.id: [] for t in topics} + for w in sorted(workstreams_all, key=lambda item: item.created_at): + if w.topic_id not in topic_workstreams: + continue + topic_workstreams[w.topic_id].append({ + "id": w.id, + "slug": w.slug, + "title": w.title, + "status": w.status, + "owner": w.owner, + "due_date": w.due_date, + }) + + repo_rows = await session.execute( + select(ManagedRepo.id, ManagedRepo.slug, Domain.slug) + .join(Domain, Domain.id == ManagedRepo.domain_id) + .order_by(ManagedRepo.slug) + ) + repo_map = { + repo_id: {"slug": repo_slug, "domain_slug": domain_slug} + 
for repo_id, repo_slug, domain_slug in repo_rows + } + + task_counts_by_ws: dict = {} + task_statuses_per_ws: dict = {} + task_totals_by_status: dict[str, int] = {} + for ws_id, task_status, count in await session.execute( + select(Task.workstream_id, Task.status, func.count()).group_by(Task.workstream_id, Task.status) + ): + status = status_value(task_status) + task_counts_by_ws.setdefault(ws_id, {"done": 0, "progress": 0, "wait": 0, "todo": 0, "total": 0}) + task_counts_by_ws[ws_id]["total"] += count + if status in {"done", "progress", "wait", "todo"}: + task_counts_by_ws[ws_id][status] += count + task_statuses_per_ws.setdefault(ws_id, []).extend([status] * count) + task_totals_by_status[status] = task_totals_by_status.get(status, 0) + count + + open_ws = [ + w for w in workstreams_all + if normalize_workstream_status(w.status) in OPEN_WORKSTREAM_STATUSES + ] + open_ws_ids = [w.id for w in open_ws] + dep_rows = [] + if open_ws_ids: + dep_result = await session.execute( + select(WorkstreamDependency).where( + (WorkstreamDependency.from_workstream_id.in_(open_ws_ids)) + | (WorkstreamDependency.to_workstream_id.in_(open_ws_ids)) + ) + ) + dep_rows = list(dep_result.scalars().all()) + + ws_lookup = {w.id: w for w in workstreams_all} + workstream_flow = load_flow("workstream") + flow_engine = FlowEngine() + effective_status: dict = {} + for w in open_ws: + flow_obj = { + "status": w.status, + "workstation": w.status, + "tasks": [{"status": status} for status in task_statuses_per_ws.get(w.id, [])], + "dependencies": [ + {"workstation": normalize_workstream_status(ws_lookup[d.to_workstream_id].status)} + for d in dep_rows + if d.from_workstream_id == w.id and d.to_workstream_id and d.to_workstream_id in ws_lookup + ], + } + flow_result = flow_engine.evaluate(flow_obj, workstream_flow) + effective_status[w.id] = "blocked" if flow_result.exit_blocked else normalize_workstream_status(w.status) + + topic_counts = {r[0]: r[1] for r in await session.execute( + 
select(Topic.status, func.count()).group_by(Topic.status) + )} + ws_counts = {r[0]: r[1] for r in await session.execute( + select(Workstream.status, func.count()).group_by(Workstream.status) + )} + dec_counts = {r[0]: r[1] for r in await session.execute( + select(Decision.status, func.count()).group_by(Decision.status) + )} + + totals = Totals( + topics=TopicTotals( + active=topic_counts.get(TopicStatus.active, 0), + paused=topic_counts.get(TopicStatus.paused, 0), + archived=topic_counts.get(TopicStatus.archived, 0), + total=sum(topic_counts.values()), + ), + workstreams=WorkstreamTotals( + proposed=ws_counts.get("proposed", 0), + ready=ws_counts.get("ready", 0) + ws_counts.get("todo", 0), + active=sum(1 for status in effective_status.values() if status == "active"), + blocked=sum(1 for status in effective_status.values() if status == "blocked"), + backlog=ws_counts.get("backlog", 0), + finished=( + ws_counts.get("finished", 0) + + ws_counts.get("completed", 0) + + ws_counts.get("accepted", 0) + ), + archived=ws_counts.get("archived", 0), + total=sum(ws_counts.values()), + ), + tasks=TaskTotals( + wait=task_totals_by_status.get("wait", 0), + todo=task_totals_by_status.get("todo", 0), + progress=task_totals_by_status.get("progress", 0), + done=task_totals_by_status.get("done", 0), + cancel=task_totals_by_status.get("cancel", 0), + total=sum(task_totals_by_status.values()), + ), + decisions=DecisionTotals( + open=dec_counts.get(DecisionStatus.open, 0), + resolved=dec_counts.get(DecisionStatus.resolved, 0), + escalated=dec_counts.get(DecisionStatus.escalated, 0), + superseded=dec_counts.get(DecisionStatus.superseded, 0), + total=sum(dec_counts.values()), + ), + ) + + blocking_rows = await session.execute( + select(Decision) + .where(Decision.decision_type == DecisionType.pending) + .where(Decision.status.in_([DecisionStatus.open, DecisionStatus.escalated])) + .order_by(Decision.deadline.asc().nullslast(), Decision.created_at) + ) + blocking = 
list(blocking_rows.scalars().all()) + + waiting_rows = await session.execute( + select(Task).options(noload("*")).where(Task.status == TaskStatus.wait).order_by(Task.created_at) + ) + waiting = list(waiting_rows.scalars().all()) + + recent_rows = await session.execute( + select(ProgressEvent).options(noload("*")).order_by(ProgressEvent.created_at.desc()).limit(20) + ) + recent = list(recent_rows.scalars().all()) + + milestone_rows = await session.execute( + select(ProgressEvent) + .options(noload("*")) + .where(ProgressEvent.event_type == "milestone") + .where(ProgressEvent.summary.like("Project registered with State Hub:%")) + .order_by(ProgressEvent.created_at.desc()) + .limit(500) + ) + registration_milestones = list(milestone_rows.scalars().all()) + + contrib_type_counts = {r[0].value: r[1] for r in await session.execute( + select(Contribution.type, func.count()).group_by(Contribution.type) + )} + contrib_status_counts = {r[0].value: r[1] for r in await session.execute( + select(Contribution.status, func.count()).group_by(Contribution.status) + )} + contribution_counts = {**contrib_type_counts, **contrib_status_counts} + + _COPYLEFT_PATS = ("GPL", "AGPL", "LGPL", "EUPL", "CDDL", "MPL") + all_direct_prod_rows = await session.execute( + select(SBOMEntry.license_spdx) + .where(SBOMEntry.is_direct.is_(True)) + .where(SBOMEntry.is_dev.is_(False)) + ) + licence_risk_count = sum( + 1 for (lic,) in all_direct_prod_rows.all() + if lic and any(pat in lic.upper() for pat in _COPYLEFT_PATS) + ) + + snapshot_count, package_total = (await session.execute( + select( + func.count(SBOMSnapshot.id), + func.coalesce(func.sum(SBOMSnapshot.entry_count), 0), + ) + )).one() + + open_cap_req_count = (await session.execute( + select(func.count()).select_from(CapabilityRequest).where( + CapabilityRequest.status.in_(["requested", "accepted", "in_progress", "ready_for_review"]) + ) + )).scalar() or 0 + + sources: dict[str, DashboardSourceMeta] = {} + try: + workplan_index = await 
_workplan_index(refresh=False, session=session)
+        workplan_map = workplan_index.get("workstreams", {})
+        index_meta = workplan_index.get("_meta", {})
+        sources["workplan_index"] = DashboardSourceMeta(
+            ok=not bool(index_meta.get("last_error")),
+            stale=bool(index_meta.get("stale")),
+            cache_age_seconds=index_meta.get("cache_age_seconds"),
+            refresh_in_progress=bool(index_meta.get("refresh_in_progress")),
+            error=index_meta.get("last_error"),
+        )
+    except Exception as exc:  # best-effort: an index failure is reported via `sources`, rows are still built
+        workplan_map = {}
+        sources["workplan_index"] = DashboardSourceMeta(ok=False, error=str(exc))
+
+    workplan_rows: list[DashboardWorkplanRow] = []
+    for w in workstreams_all:
+        repo = repo_map.get(w.repo_id)
+        topic = topic_map.get(w.topic_id)
+        workplan = workplan_map.get(str(w.id), {})
+        counts = task_counts_by_ws.get(w.id, {"done": 0, "progress": 0, "wait": 0, "todo": 0, "total": 0})
+        workplan_rows.append(DashboardWorkplanRow(
+            id=w.id,
+            title=w.title,
+            status=normalize_workstream_status(w.status),
+            domain=(repo["domain_slug"] if repo else (topic.domain_slug if topic else None)) or "unknown",  # `or`: a None slug must not fail `domain: str`
+            repo_label=(repo["slug"] if repo else workplan.get("repo_slug")) or "unassigned",  # `or`: .get() default does not cover an explicit None
+            workplan_filename=workplan.get("filename"),
+            workplan_relative_path=workplan.get("relative_path"),
+            workplan_archived=bool(workplan.get("archived", False)),
+            health_labels=workplan.get("health_labels") or [],  # `or`: explicit None falls back to an empty list
+            href=f"./workstreams/{w.id}",
+            done=counts.get("done", 0),
+            progress=counts.get("progress", 0),
+            wait=counts.get("wait", 0),
+            todo=counts.get("todo", 0),
+            total=counts.get("total", 0),
+            created_at=w.created_at,
+            updated_at=w.updated_at,
+        ))
+
+    return DashboardOverview(
+        generated_at=datetime.now(tz=timezone.utc),
+        totals=totals,
+        topics=[
+            TopicWithWorkstreams(
+                **TopicRead.model_validate(t).model_dump(),
+                workstreams=topic_workstreams.get(t.id, []),
+            )
+            for t in topics
+        ],
+        blocking_decisions=[DecisionRead.model_validate(d) for d in blocking],
+        waiting_tasks=[TaskRead.model_validate(t) for t in
waiting], + blocked_tasks=[TaskRead.model_validate(t) for t in waiting], + recent_progress=[ProgressEventRead.model_validate(e) for e in recent], + next_steps=await _derive_next_steps(session), + contribution_counts=contribution_counts, + licence_risk_count=licence_risk_count, + open_capability_requests=open_cap_req_count, + sbom_snapshot_count=int(snapshot_count or 0), + sbom_package_total=int(package_total or 0), + registration_milestones=[ProgressEventRead.model_validate(e) for e in registration_milestones], + workplan_rows=workplan_rows, + sources=sources, + diagnostics={ + "workplan_row_count": len(workplan_rows), + "task_count_strategy": "grouped", + }, + ) + + async def _build_domain_summaries(session: AsyncSession) -> list[DomainSummary]: """Compute per-domain stats for the state summary.""" domains_rows = await session.execute( diff --git a/api/routers/tasks.py b/api/routers/tasks.py index 62cd134..45a5d67 100644 --- a/api/routers/tasks.py +++ b/api/routers/tasks.py @@ -9,7 +9,7 @@ from api.database import get_session from api.models.task import Task, TaskStatus from api.models.token_event import TokenEvent from api.models.workstream import Workstream -from api.schemas.task import TaskCreate, TaskRead, TaskUpdate +from api.schemas.task import TaskCountRead, TaskCreate, TaskRead, TaskUpdate from api.services.lifecycle import status_value, transition_task_status from api.task_status import normalize_task_status @@ -24,6 +24,8 @@ async def list_tasks( needs_human: bool | None = Query(None), priority: str | None = None, due_date_before: date | None = None, + limit: int | None = Query(None, ge=1, le=5000), + offset: int = Query(0, ge=0), session: AsyncSession = Depends(get_session), ) -> list[Task]: q = select(Task) @@ -40,10 +42,32 @@ async def list_tasks( if due_date_before is not None: q = q.where(Task.due_date <= due_date_before) q = q.order_by(Task.created_at) + if offset: + q = q.offset(offset) + if limit is not None: + q = q.limit(limit) result = await 
session.execute(q) return list(result.scalars().all()) +@router.get("/counts", response_model=list[TaskCountRead]) +async def count_tasks( + workstream_id: uuid.UUID | None = None, + status: str | None = None, + session: AsyncSession = Depends(get_session), +) -> list[TaskCountRead]: + q = select(Task.workstream_id, Task.status, func.count()).group_by(Task.workstream_id, Task.status) + if workstream_id: + q = q.where(Task.workstream_id == workstream_id) + if status: + q = q.where(Task.status == TaskStatus(normalize_task_status(status))) + rows = await session.execute(q) + return [ + TaskCountRead(workstream_id=ws_id, status=task_status, count=count) + for ws_id, task_status, count in rows + ] + + @router.post("/", response_model=TaskRead, status_code=status.HTTP_201_CREATED) async def create_task( body: TaskCreate, diff --git a/api/routers/workstreams.py b/api/routers/workstreams.py index 6a1e24f..6e80094 100644 --- a/api/routers/workstreams.py +++ b/api/routers/workstreams.py @@ -3,6 +3,7 @@ import logging import uuid import socket import time +from datetime import datetime, timezone from pathlib import Path from typing import Any @@ -40,6 +41,8 @@ workplan_router = APIRouter(prefix="/workplans", tags=["workplans"]) _INDEX_CACHE: dict[str, Any] | None = None _INDEX_CACHE_AT: float = 0.0 _INDEX_TTL = 30.0 +_INDEX_REFRESH_TASK: asyncio.Task | None = None +_INDEX_LAST_ERROR: str | None = None _LEGACY_OWNER = "state-hub.api" _COMPLETED_WORKSTREAM_EVENT = "org.statehub.workstream.completed" @@ -170,16 +173,7 @@ async def _list_workstreams( return list(result.scalars().all()) -async def _workplan_index( - *, - refresh: bool, - session: AsyncSession, -) -> dict[str, Any]: - """Map file-backed workplan ids to their local workplan filenames.""" - global _INDEX_CACHE, _INDEX_CACHE_AT - if not refresh and _INDEX_CACHE is not None and (time.monotonic() - _INDEX_CACHE_AT) < _INDEX_TTL: - return _INDEX_CACHE - +async def _build_workplan_index(session: AsyncSession) -> dict[str, 
Any]:
     result = await session.execute(
         select(ManagedRepo).where(ManagedRepo.status == "active").order_by(ManagedRepo.slug)
     )
@@ -218,8 +212,87 @@ async def _workplan_index(
             "needs_review": bool(review and review.needs_review),
             "health_labels": ["needs_review"] if review and review.needs_review else [],
         }
-    _INDEX_CACHE = {"workplans": index, "workstreams": index}
+    return {"workplans": index, "workstreams": index}
+
+
+def _index_with_meta(*, stale: bool, refresh_in_progress: bool) -> dict[str, Any]:
+    """Return the cached index (or an empty one) with a freshly computed ``_meta`` entry."""
+    age = time.monotonic() - _INDEX_CACHE_AT if _INDEX_CACHE_AT else None
+    return {
+        **(_INDEX_CACHE or {"workplans": {}, "workstreams": {}}),
+        "_meta": {
+            "generated_at": _INDEX_CACHE.get("_meta", {}).get("generated_at") if _INDEX_CACHE else None,
+            "stale": stale,
+            "cache_age_seconds": round(age, 3) if age is not None else None,
+            "refresh_in_progress": refresh_in_progress,
+            "last_error": _INDEX_LAST_ERROR,
+        },
+    }
+
+
+async def _refresh_workplan_index_background() -> None:
+    """Rebuild the workplan index on its own session and swap it into the cache.
+
+    Failures are logged and recorded in ``_INDEX_LAST_ERROR``; the previous
+    cache is left in place, so readers keep receiving the stale copy.
+    """
+    global _INDEX_CACHE, _INDEX_CACHE_AT, _INDEX_LAST_ERROR
+    from api.database import async_session_factory
+
+    try:
+        async with async_session_factory() as session:
+            index = await _build_workplan_index(session)
+        index["_meta"] = {
+            "generated_at": datetime.now(timezone.utc).isoformat(),
+            "stale": False,
+            "cache_age_seconds": 0.0,
+            "refresh_in_progress": False,
+            "last_error": None,
+        }
+        _INDEX_CACHE = index
+        _INDEX_CACHE_AT = time.monotonic()
+        _INDEX_LAST_ERROR = None
+    except Exception as exc:
+        # Nobody awaits this task, so without a log line the traceback would be lost.
+        logging.getLogger(__name__).exception("Background workplan index refresh failed")
+        _INDEX_LAST_ERROR = str(exc)
+
+
+def _ensure_index_refresh_started() -> None:
+    """Start the background refresh task unless one is already running."""
+    global _INDEX_REFRESH_TASK
+    if _INDEX_REFRESH_TASK is not None and not _INDEX_REFRESH_TASK.done():
+        return
+    _INDEX_REFRESH_TASK = asyncio.create_task(_refresh_workplan_index_background())
+
+
+async def _workplan_index(
+    *,
+    refresh: bool,
+    session: AsyncSession,
+) -> dict[str, Any]:
+    """Map file-backed workplan ids to their local workplan filenames."""
+    global _INDEX_CACHE, _INDEX_CACHE_AT,
_INDEX_LAST_ERROR + cache_age = time.monotonic() - _INDEX_CACHE_AT if _INDEX_CACHE_AT else None + if not refresh and _INDEX_CACHE is not None and cache_age is not None and cache_age < _INDEX_TTL: + refresh_running = _INDEX_REFRESH_TASK is not None and not _INDEX_REFRESH_TASK.done() + return _index_with_meta(stale=False, refresh_in_progress=refresh_running) + + if not refresh and _INDEX_CACHE is not None: + _ensure_index_refresh_started() + return _index_with_meta(stale=True, refresh_in_progress=True) + + index = await _build_workplan_index(session) + index["_meta"] = { + "generated_at": datetime.now(timezone.utc).isoformat(), + "stale": False, + "cache_age_seconds": 0.0, + "refresh_in_progress": False, + "last_error": None, + } + _INDEX_CACHE = index _INDEX_CACHE_AT = time.monotonic() + _INDEX_LAST_ERROR = None return _INDEX_CACHE diff --git a/api/schemas/state.py b/api/schemas/state.py index 75d1213..6132dca 100644 --- a/api/schemas/state.py +++ b/api/schemas/state.py @@ -1,5 +1,6 @@ import uuid from datetime import datetime +from typing import Any from pydantic import BaseModel @@ -84,3 +85,51 @@ class StateSummary(BaseModel): contribution_counts: dict[str, int] = {} licence_risk_count: int = 0 open_capability_requests: int = 0 + + +class DashboardWorkplanRow(BaseModel): + id: uuid.UUID + title: str + status: str + domain: str = "unknown" + repo_label: str = "unassigned" + workplan_filename: str | None = None + workplan_relative_path: str | None = None + workplan_archived: bool = False + health_labels: list[str] = [] + href: str + done: int = 0 + progress: int = 0 + wait: int = 0 + todo: int = 0 + total: int = 0 + created_at: datetime + updated_at: datetime + + +class DashboardSourceMeta(BaseModel): + ok: bool = True + stale: bool = False + cache_age_seconds: float | None = None + refresh_in_progress: bool = False + error: str | None = None + + +class DashboardOverview(BaseModel): + generated_at: datetime + totals: Totals + topics: list[TopicWithWorkstreams] + 
blocking_decisions: list[DecisionRead] + waiting_tasks: list[TaskRead] + blocked_tasks: list[TaskRead] = [] + recent_progress: list[ProgressEventRead] + next_steps: list[NextStep] = [] + contribution_counts: dict[str, int] = {} + licence_risk_count: int = 0 + open_capability_requests: int = 0 + sbom_snapshot_count: int = 0 + sbom_package_total: int = 0 + registration_milestones: list[ProgressEventRead] = [] + workplan_rows: list[DashboardWorkplanRow] = [] + sources: dict[str, DashboardSourceMeta] = {} + diagnostics: dict[str, Any] = {} diff --git a/api/schemas/task.py b/api/schemas/task.py index f76924b..1b2ff9f 100644 --- a/api/schemas/task.py +++ b/api/schemas/task.py @@ -93,3 +93,9 @@ class TaskRead(TaskStatusMixin): parent_task_id: uuid.UUID | None = None created_at: datetime updated_at: datetime + + +class TaskCountRead(TaskStatusMixin): + workstream_id: uuid.UUID + status: TaskStatus + count: int diff --git a/dashboard/src/components/config.js b/dashboard/src/components/config.js index 3982cd8..5949f55 100644 --- a/dashboard/src/components/config.js +++ b/dashboard/src/components/config.js @@ -89,11 +89,23 @@ export async function waitForVisible(ms) { export async function apiFetch(path, options = {}) { const url = path.startsWith("http") ? path : `${API}${path}`; const timeout = options.timeout ?? 
FETCH_TIMEOUT; - const {timeout: _timeout, ...fetchOptions} = options; + const {timeout: _timeout, cache = "no-store", ...fetchOptions} = options; const ctrl = new AbortController(); - const timer = setTimeout(() => ctrl.abort(), timeout); + let timedOut = false; + const timer = setTimeout(() => { + timedOut = true; + ctrl.abort(); + }, timeout); try { - return await fetch(url, {cache: "no-store", ...fetchOptions, signal: ctrl.signal}); + return await fetch(url, {cache, ...fetchOptions, signal: ctrl.signal}); + } catch (error) { + if (timedOut || error?.name === "AbortError") { + const message = `Request timed out after ${Math.round(timeout / 1000)}s: ${url}`; + const timeoutError = new Error(message); + timeoutError.name = "TimeoutError"; + throw timeoutError; + } + throw error; } finally { clearTimeout(timer); } diff --git a/dashboard/src/docs/live-data.md b/dashboard/src/docs/live-data.md index b890129..2ec98e0 100644 --- a/dashboard/src/docs/live-data.md +++ b/dashboard/src/docs/live-data.md @@ -10,7 +10,10 @@ All dashboard pages poll the State Hub API automatically. No manual refresh is e ## Poll interval -Every page fetches fresh data from `http://127.0.0.1:8000` every **15 seconds** using an async generator loop. The previous data stays visible while the next request is in flight, so the UI never goes blank. +Most live pages fetch fresh data from `http://127.0.0.1:8000` every **15 seconds** +using an async generator loop. The overview page uses a heavier bounded read +model and refreshes every **60 seconds**. The previous data stays visible while +the next request is in flight, so the UI never goes blank. 
 ---
 
@@ -21,6 +24,7 @@ The **●** dot in the top-right corner of each page shows the current connectio
 | Indicator | Meaning |
 |---|---|
 | **● Live · updated HH:MM:SS** | Last poll succeeded — data is current as of that time |
+| **● Stale · last successful update HH:MM:SS** | Last refresh failed, but cached page data is still visible |
 | **● Offline — run: `make api`** | API is unreachable — the dot turns red |
 
-The timestamp updates on every successful poll. If you see a time that is more than ~30 seconds in the past, the poll is stalled (browser tab backgrounded or network issue) — reloading the page resets the loop.
+The timestamp updates on every successful poll. If you see a time that is more than about two poll intervals in the past (~30 seconds on most pages, ~2 minutes on Overview), the poll is stalled (browser tab backgrounded or network issue) — reloading the page resets the loop.
@@ -48,7 +52,7 @@ make api # db + migrate + uvicorn (restarts if already running)
 
 | Page | Endpoints |
 |---|---|
-| Overview | `/state/summary` |
+| Overview | `/state/overview`, `/decisions/?decision_type=pending` |
 | Workplans | `/workplans/`, `/topics/`, `/state/summary` |
 | Decisions | `/decisions/?limit=500`, `/topics/` |
 | Progress | `/progress/?limit=500` |
@@ -57,4 +61,4 @@ All endpoints are read-only GET requests. The dashboard never writes to the API.
 
 ---
 
-*Poll interval: 15 s. Data is refreshed in the background — the page never reloads itself.*
+*Poll interval: 15 s for most pages, 60 s for Overview. Data is refreshed in the background — the page never reloads itself.*
diff --git a/dashboard/src/docs/overview.md b/dashboard/src/docs/overview.md
index c7329c3..36ba5b2 100644
--- a/dashboard/src/docs/overview.md
+++ b/dashboard/src/docs/overview.md
@@ -82,9 +82,13 @@ and summary.
 
 ## Data source
 
-Polls `GET /state/summary` every **15 seconds**. The workstream chart also polls
-`GET /workplans/`, `GET /tasks/?limit=2000`, `GET /topics/`, `GET /repos/`,
-and `GET /workplans/index` for repository grouping, task counts, and
-workplan filename tooltips.
Blocking decisions are fetched separately via -`GET /decisions/?decision_type=pending` and only re-fetched after a successful -resolve action — this prevents the inline form from being wiped on every poll. +Polls `GET /state/overview` every **60 seconds**. This endpoint is a bounded +dashboard read model: it returns summary totals, recent activity, registration +milestones, SBOM totals, and chart-ready workplan rows with task counts already +aggregated server-side. + +The page keeps the last successful overview response visible if a refresh times +out, and marks the view stale instead of clearing the dashboard. Blocking +decisions are fetched separately via `GET /decisions/?decision_type=pending` +and only re-fetched after a successful resolve action — this prevents the inline +form from being wiped on every poll. diff --git a/dashboard/src/index.md b/dashboard/src/index.md index aa4d069..182d614 100644 --- a/dashboard/src/index.md +++ b/dashboard/src/index.md @@ -14,11 +14,15 @@ import { ``` ```js -// Single polling loop — fetches all data in one Promise.all batch, backs off uniformly. +// Single polling loop — loads one bounded overview read model and keeps +// last-known-good data visible if a refresh times out. const pageState = (async function*() { let failures = 0; + let lastGood = null; while (true) { - let summary = {}, snapshots = [], totalPkgs = 0, milestones = [], wsAll = [], ok = false; + let nextState = lastGood + ? 
{...lastGood, ok: false, stale: true, error: null} + : {summary: {}, snapshots: [], snapshotCount: 0, totalPkgs: 0, milestones: [], wsAll: [], ok: false, stale: false, error: null, sources: {}, ts: new Date()}; try { const loadJson = async (name, path, options = {}) => { const response = await apiFetch(path, options); @@ -26,67 +30,71 @@ const pageState = (async function*() { return response.json(); }; - const [ - summaryData, - snapList, - allEvents, - wsList, - taskList, - topicList, - repoList, - workplanIndex, - ] = await Promise.all([ - loadJson("summary", "/state/summary", {timeout: 20_000}), - loadJson("sbom snapshots", "/sbom/snapshots/"), - loadJson("milestones", "/progress/?event_type=milestone&limit=500"), - loadJson("workplans", "/workplans/"), - loadJson("tasks", "/tasks/?limit=2000"), - loadJson("topics", "/topics/"), - loadJson("repos", "/repos/"), - loadJson("workplan index", "/workplans/index").catch(() => ({workplans: {}, workstreams: {}})), - ]); + const overview = await loadJson("overview", "/state/overview", {timeout: 20_000, cache: "reload"}); - ok = true; - summary = summaryData; - snapshots = snapList; - totalPkgs = snapshots.reduce((s, sn) => s + (sn.entry_count ?? 0), 0); - milestones = allEvents.filter(e => e.summary?.startsWith("Project registered with State Hub:")); - const workplanMap = workplanIndex.workstreams ?? 
{}; - const topicMap = Object.fromEntries(topicList.map(t => [t.id, t])); - const repoMap = Object.fromEntries(repoList.map(r => [r.id, r])); - const counts = {}; - for (const t of taskList) { - const wid = t.workstream_id; - if (!counts[wid]) counts[wid] = {done: 0, progress: 0, wait: 0, todo: 0, total: 0}; - counts[wid].total++; - if (t.status === "done") counts[wid].done++; - else if (t.status === "progress") counts[wid].progress++; - else if (t.status === "wait") counts[wid].wait++; - else if (t.status === "todo") counts[wid].todo++; - } - wsAll = wsList.map(w => { - const repo = repoMap[w.repo_id]; - const topic = topicMap[w.topic_id]; - const workplan = workplanMap[w.id] ?? {}; - return { + const summaryData = { + generated_at: overview.generated_at, + totals: overview.totals ?? {}, + topics: overview.topics ?? [], + blocking_decisions: overview.blocking_decisions ?? [], + waiting_tasks: overview.waiting_tasks ?? [], + blocked_tasks: overview.blocked_tasks ?? overview.waiting_tasks ?? [], + recent_progress: overview.recent_progress ?? [], + next_steps: overview.next_steps ?? [], + contribution_counts: overview.contribution_counts ?? {}, + licence_risk_count: overview.licence_risk_count ?? 0, + open_capability_requests: overview.open_capability_requests ?? 0, + }; + + nextState = { + summary: summaryData, + snapshots: [], + snapshotCount: overview.sbom_snapshot_count ?? 0, + totalPkgs: overview.sbom_package_total ?? 0, + milestones: overview.registration_milestones ?? [], + wsAll: (overview.workplan_rows ?? []).map(w => ({ ...w, status: normalizeWorkstreamStatus(w.status), - domain: repo?.domain_slug ?? topic?.domain_slug ?? "unknown", - repo_label: repo?.slug ?? workplan.repo_slug ?? "unassigned", - workplan_filename: workplan.filename ?? null, - workplan_relative_path: workplan.relative_path ?? null, - workplan_archived: workplan.archived ?? false, - health_labels: workplan.health_labels ?? [], - href: `./workstreams/${w.id}`, - ...(counts[w.id] ?? 
{done: 0, progress: 0, wait: 0, todo: 0, total: 0}), - }; - }); + })), + ok: true, + stale: false, + error: null, + sources: overview.sources ?? {}, + ts: new Date(), + }; + lastGood = nextState; } catch (e) { - summary = {error: `Dashboard data load failed: ${e?.message ?? String(e)}`}; + const message = `Dashboard refresh failed: ${e?.message ?? String(e)}`; + if (lastGood) { + nextState = { + ...lastGood, + ok: false, + stale: true, + error: `${message}; showing last successful data from ${lastGood.ts?.toLocaleTimeString?.() ?? "previous refresh"}`, + summary: { + ...(lastGood.summary ?? {}), + error: `${message}; showing last successful data from ${lastGood.ts?.toLocaleTimeString?.() ?? "previous refresh"}`, + }, + }; + } else { + nextState = { + summary: {error: message}, + snapshots: [], + snapshotCount: 0, + totalPkgs: 0, + milestones: [], + wsAll: [], + ok: false, + stale: false, + error: message, + sources: {}, + ts: new Date(), + }; + } } - failures = ok ? 0 : failures + 1; - yield {summary, snapshots, totalPkgs, milestones, wsAll, ok, ts: new Date()}; - await waitForVisible(pollDelay({ok, base: POLL_HEAVY, failures})); + failures = nextState.ok ? 0 : failures + 1; + yield nextState; + await waitForVisible(pollDelay({ok: nextState.ok, base: POLL_HEAVY, failures})); } })(); ``` @@ -94,6 +102,7 @@ const pageState = (async function*() { ```js const summary = pageState.summary ?? {}; const _ok = pageState.ok ?? false; +const _stale = pageState.stale ?? false; const _ts = pageState.ts; const totals = summary.totals ?? {}; const ws = totals.workstreams ?? {}; @@ -107,7 +116,7 @@ const wsAll = pageState.wsAll ?? []; // Kept separate from the main poll so in-progress form inputs aren't wiped every 60 s. 
const blockingDecisions = Mutable([]); const refreshDecisions = async () => { - const r = await fetch(`${API}/decisions/?decision_type=pending`).catch(() => null); + const r = await apiFetch("/decisions/?decision_type=pending", {timeout: 12_000}).catch(() => null); const all = r?.ok ? await r.json() : []; blockingDecisions.value = all.filter(d => ["open", "escalated"].includes(d.status)); }; @@ -121,9 +130,11 @@ import {injectTocTop} from "./components/toc-sidebar.js"; import {withDocHelp} from "./components/doc-overlay.js"; const _liveEl = html`
- + ${_ok ? `Live · updated ${_ts?.toLocaleTimeString()}` + : _stale + ? `Stale · last successful update ${_ts?.toLocaleTimeString()}` : html`Offline — run: cd ~/state-hub && make api`}
`; withDocHelp(_liveEl, "/docs/live-data"); @@ -346,6 +357,7 @@ const licenceRisk = summary.licence_risk_count ?? 0; const totalContribs = ["br","fr","ep","upr"].reduce((s, t) => s + (contribCounts[t] ?? 0), 0); const needsFollowUp = (contribCounts["submitted"] ?? 0) + (contribCounts["acknowledged"] ?? 0); const sbomSnaps = pageState.snapshots ?? []; +const sbomSnapCount = pageState.snapshotCount ?? sbomSnaps.length; const totalPkgs = pageState.totalPkgs ?? 0; display(html`
@@ -362,7 +374,7 @@ display(html`

SBOM

${totalPkgs.toLocaleString()}

- ${sbomSnaps.length} repo${sbomSnaps.length !== 1 ? "s" : ""} tracked · ${licenceRisk > 0 ? html`${licenceRisk} copyleft risks` : html`✓ no copyleft`} + ${sbomSnapCount} snapshot${sbomSnapCount !== 1 ? "s" : ""} tracked · ${licenceRisk > 0 ? html`${licenceRisk} copyleft risks` : html`✓ no copyleft`}
`); ``` diff --git a/scripts/smoke_dashboard_load.sh b/scripts/smoke_dashboard_load.sh new file mode 100644 index 0000000..bd434bc --- /dev/null +++ b/scripts/smoke_dashboard_load.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash +set -euo pipefail + +API_PORT="${API_PORT:-8012}" +DASHBOARD_PORT="${DASHBOARD_PORT:-3012}" +API_BASE="http://127.0.0.1:${API_PORT}" +DASHBOARD_URL="http://127.0.0.1:${DASHBOARD_PORT}/?api_base=${API_BASE}" + +API_LOG="${API_LOG:-/tmp/statehub-api-${API_PORT}.log}" +DASHBOARD_LOG="${DASHBOARD_LOG:-/tmp/statehub-dashboard-${DASHBOARD_PORT}.log}" +OVERVIEW_JSON="${OVERVIEW_JSON:-/tmp/statehub-overview-${API_PORT}.json}" +OVERVIEW_HEADERS="${OVERVIEW_HEADERS:-/tmp/statehub-overview-${API_PORT}.headers}" +DASHBOARD_HTML="${DASHBOARD_HTML:-/tmp/statehub-dashboard-${DASHBOARD_PORT}.html}" + +rm -f "$API_LOG" "$DASHBOARD_LOG" "$OVERVIEW_JSON" "$OVERVIEW_HEADERS" "$DASHBOARD_HTML" + +.venv/bin/python -m uvicorn api.main:app --host 127.0.0.1 --port "$API_PORT" \ + > "$API_LOG" 2>&1 & +api_pid=$! + +(cd dashboard && npm run dev -- --host 127.0.0.1 --port "$DASHBOARD_PORT" \ + > "$DASHBOARD_LOG" 2>&1) & +dashboard_pid=$! + +cleanup() { + kill "$api_pid" "$dashboard_pid" 2>/dev/null || true +} +trap cleanup EXIT + +wait_for_url() { + local label="$1" + local url="$2" + local output="$3" + local attempts="${4:-40}" + local i + for i in $(seq 1 "$attempts"); do + if curl -fsS "$url" -o "$output" >/dev/null 2>&1; then + return 0 + fi + sleep 1 + done + echo "$label did not become ready: $url" >&2 + return 1 +} + +if ! wait_for_url "API overview" "${API_BASE}/state/overview" "$OVERVIEW_JSON"; then + echo "API log:" >&2 + tail -80 "$API_LOG" >&2 || true + exit 1 +fi + +if ! 
wait_for_url "Dashboard" "$DASHBOARD_URL" "$DASHBOARD_HTML"; then + echo "Dashboard log:" >&2 + tail -80 "$DASHBOARD_LOG" >&2 || true + exit 1 +fi + +curl -sS -D "$OVERVIEW_HEADERS" -o "$OVERVIEW_JSON" \ + -w "overview %{http_code} %{time_total} %{size_download}\n" \ + "${API_BASE}/state/overview" +printf "dashboard 200 %s\n" "$DASHBOARD_URL" +wc -c "$OVERVIEW_JSON" +grep -i "x-statehub" "$OVERVIEW_HEADERS" || true diff --git a/tests/conftest.py b/tests/conftest.py index 8a06a04..98fd422 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -56,8 +56,12 @@ def _truncate(_schema): # Reset in-process TTL caches so stale data from a previous test can't bleed through. _state_router._SUMMARY_CACHE = None _state_router._SUMMARY_CACHE_AT = 0.0 + _state_router._OVERVIEW_CACHE = None + _state_router._OVERVIEW_CACHE_AT = 0.0 _ws_router._INDEX_CACHE = None _ws_router._INDEX_CACHE_AT = 0.0 + _ws_router._INDEX_REFRESH_TASK = None + _ws_router._INDEX_LAST_ERROR = None yield engine = sqlalchemy.create_engine(_SYNC_URL) diff --git a/tests/test_routers_core.py b/tests/test_routers_core.py index e2df0f8..2feee97 100644 --- a/tests/test_routers_core.py +++ b/tests/test_routers_core.py @@ -226,6 +226,34 @@ class TestTasks: assert "High prio" in titles assert "Low prio" not in titles + async def test_list_pagination_and_counts(self, client): + await _create_domain(client) + topic = await _create_topic(client) + ws = await _create_workstream(client, topic["id"]) + first = await _create_task(client, ws["id"], title="First") + second = await _create_task(client, ws["id"], title="Second") + third = await _create_task(client, ws["id"], title="Third") + await client.patch(f"/tasks/{second['id']}", json={"status": "progress"}) + await client.patch(f"/tasks/{third['id']}", json={"status": "wait", "blocking_reason": "blocked"}) + + r = await client.get("/tasks/?limit=2") + assert r.status_code == 200 + body = r.json() + assert len(body) == 2 + assert body[0]["id"] == first["id"] + assert 
body[1]["id"] == second["id"] + + r = await client.get("/tasks/?limit=1&offset=2") + assert r.status_code == 200 + assert [task["id"] for task in r.json()] == [third["id"]] + + r = await client.get(f"/tasks/counts?workstream_id={ws['id']}") + assert r.status_code == 200 + counts = {(row["workstream_id"], row["status"]): row["count"] for row in r.json()} + assert counts[(ws["id"], "todo")] == 1 + assert counts[(ws["id"], "progress")] == 1 + assert counts[(ws["id"], "wait")] == 1 + @pytest.mark.parametrize("initial_status", ["proposed", "ready", "backlog"]) async def test_task_start_activates_planning_workstream(self, client, initial_status): await _create_domain(client) @@ -358,6 +386,34 @@ class TestStateSummary: assert summaries[blocked_ws["id"]]["blocked_reasons"][0]["id"] == "dependencies.all_complete" assert body["totals"]["workstreams"]["blocked"] == 1 + async def test_overview_returns_chart_ready_rows(self, client): + await _create_domain(client) + topic = await _create_topic(client) + repo = await _create_repo(client) + ws = await _create_workstream(client, topic["id"], repo_id=repo["id"]) + first = await _create_task(client, ws["id"], title="Todo") + second = await _create_task(client, ws["id"], title="Done") + await client.patch(f"/tasks/{second['id']}", json={"status": "done", "suppress_token_event": True}) + + r = await client.get("/state/overview") + assert r.status_code == 200 + assert r.headers["x-statehub-cache"] == "miss" + body = r.json() + + rows = {row["id"]: row for row in body["workplan_rows"]} + assert ws["id"] in rows + assert rows[ws["id"]]["repo_label"] == "test-repo" + assert rows[ws["id"]]["domain"] == "testdomain" + assert rows[ws["id"]]["todo"] == 1 + assert rows[ws["id"]]["done"] == 1 + assert rows[ws["id"]]["total"] == 2 + assert body["totals"]["tasks"]["total"] == 2 + assert body["diagnostics"]["task_count_strategy"] == "grouped" + + r = await client.get("/state/overview") + assert r.status_code == 200 + assert 
r.headers["x-statehub-cache"] == "hit" + class TestFlowEndpoints: async def test_list_flow_definitions(self, client): diff --git a/workplans/STATE-WP-0056-dashboard-loading-robustness.md b/workplans/STATE-WP-0056-dashboard-loading-robustness.md new file mode 100644 index 0000000..28c9e83 --- /dev/null +++ b/workplans/STATE-WP-0056-dashboard-loading-robustness.md @@ -0,0 +1,276 @@ +--- +id: STATE-WP-0056 +type: workplan +title: "Dashboard Loading Robustness and Efficiency" +domain: custodian +repo: state-hub +status: finished +owner: codex +topic_slug: custodian +created: "2026-06-05" +updated: "2026-06-05" +state_hub_workstream_id: "28f9569c-937b-4b79-b46c-f6b1f83c09c3" +--- + +# Dashboard Loading Robustness and Efficiency + +## Summary + +Make the State Hub dashboard overview page faster and more resilient under +normal polling. The current overview performs a broad concurrent fan-out of +full-list API calls and treats most request failures as whole-page failures. +This can surface frequent `Dashboard data load failed: The operation was +aborted.` warnings when one call crosses the frontend timeout, even if the API +eventually returns successfully. + +This work should reduce request count, payload size, and backend contention; +preserve useful last-known data during partial failures; and give operators +clearer diagnostics when a section is stale or unavailable. + +## Current Findings + +Inspection on 2026-06-05 found: + +- `dashboard/src/index.md` loads overview data with one eight-request + `Promise.all` batch. +- `dashboard/src/components/config.js` aborts most `apiFetch` calls after + `12_000` ms. +- A dashboard-style concurrent timing run produced several calls at or above the + default timeout: `/sbom/snapshots/`, `/repos/`, and `/workplans/index`. +- The same endpoints can be much faster when called alone, which points to + contention and over-fetching rather than one permanently slow endpoint. 
+- The overview calls `/tasks/?limit=2000`, but the tasks API currently ignores + `limit` and returns every task. In the observed run, that response was roughly + 2.1 MB just to compute per-workplan task counts. +- `/state/summary` has a short in-process cache, but a cache miss still runs a + large amount of sequential database and Python-side aggregation work. +- `/workplans/index` scans active repository workplan files and parses + frontmatter. It is cached, but concurrent dashboard loads can still queue + behind the same expensive rebuild whenever the cache has expired. +- Several API routes set cache headers, but the shared dashboard fetch helper + forces `cache: "no-store"` for every request. + +## Out of Scope + +- Replacing Observable Framework. +- Redesigning the dashboard information architecture. +- Adding authentication, authorization, or multi-user session handling. +- Changing workplan file conventions. +- Moving State Hub to a different database or deployment substrate. + +## T01 — Add Focused Dashboard Load Instrumentation + +```task +id: STATE-WP-0056-T01 +status: done +priority: high +state_hub_task_id: "e5208053-0db1-4842-a221-c5289422677a" +``` + +Add enough timing and error visibility to confirm which overview calls are slow, +aborted, or oversized during normal use. + +Implementation notes: + +- Add lightweight server-side timing logs or response headers for overview-hot + endpoints: `/state/summary`, `/workplans/`, `/tasks/`, `/topics/`, `/repos/`, + `/sbom/snapshots/`, `/progress/`, and `/workplans/index`. +- Include request path, status, elapsed time, response size when practical, and + whether a cached result was used. +- Keep instrumentation local and low-noise; avoid logging full payloads or + secrets. +- Add a small dashboard diagnostic surface or console logging that distinguishes + timeout aborts from HTTP errors and network failures. +- Capture before/after timing notes in this workplan or a progress event. 
+ +Done when a normal dashboard refresh can be diagnosed without manually timing +each endpoint from a shell. + +## T02 — Make Overview Polling Partially Resilient + +```task +id: STATE-WP-0056-T02 +status: done +priority: high +state_hub_task_id: "2cdd960d-ba86-48d1-a7c6-e83671cd0e69" +``` + +Change the overview data loader so one slow or failed secondary request does +not mark the whole dashboard as failed. + +Implementation notes: + +- Replace fail-fast `Promise.all` behavior in `dashboard/src/index.md` with a + per-resource result model, for example `Promise.allSettled`. +- Keep last-known-good data for each section while a refresh is degraded. +- Treat optional resources such as SBOM snapshots, registration milestones, and + workplan file metadata independently from core summary/workplan status data. +- Display section-level stale/error indicators instead of one global warning + whenever possible. +- Keep exponential backoff for repeated failures, but do not discard usable + data just because one request timed out. +- Make abort errors user-readable, for example "timed out after 12s" instead of + only "The operation was aborted." + +Done when an SBOM, repo-list, or workplan-index timeout leaves the rest of the +overview usable and visibly stale rather than failed. + +## T03 — Respect Pagination and Add Task Count Aggregates + +```task +id: STATE-WP-0056-T03 +status: done +priority: high +state_hub_task_id: "78484226-9ccc-460c-a2b3-750b3204caa3" +``` + +Stop returning all tasks for overview count calculations. + +Implementation notes: + +- Add `limit` and `offset` support to `GET /tasks/`, preserving existing filter + behavior and sensible limits. +- Add a lightweight aggregate endpoint for task counts by workplan and status, + for example `GET /tasks/counts?group_by=workstream,status`, or add an + overview-specific aggregate route. +- Prefer SQL `GROUP BY` over transferring every task to the browser. 
+- Update `dashboard/src/index.md`, `dashboard/src/tasks.md`, + `dashboard/src/interventions.md`, and workplan detail pages as needed so list + views still receive the rows they need. +- Add tests for pagination compatibility and aggregate counts. + +Done when the overview no longer fetches the full task table to draw the +workplan chart. + +## T04 — Build a Lightweight Overview Read Endpoint + +```task +id: STATE-WP-0056-T04 +status: done +priority: high +state_hub_task_id: "2cf47a12-e8aa-49ca-963c-1f0d2933c344" +``` + +Create a dashboard-specific read model that returns exactly the data needed by +the overview page in one bounded response. + +Implementation notes: + +- Add an endpoint such as `GET /state/overview` or + `GET /state/dashboard-overview`. +- Include summary totals, recent progress needed by the page, blocking decision + counts, waiting-task counts, SBOM snapshot totals, registration milestones, + and workplan chart rows with repo/domain labels and task counts. +- Keep response fields stable and documented in dashboard reference docs. +- Reuse existing summary helpers where they are efficient, but avoid serializing + large full-list payloads that the overview does not display directly. +- Add cache headers and a short in-process cache with explicit invalidation + rules where appropriate. +- Update `dashboard/src/index.md` to prefer this endpoint and remove redundant + overview-only fetches. + +Done when the overview's steady-state refresh is one bounded API call plus only +truly interactive secondary calls. + +## T05 — Add Stale-While-Refresh for File-Backed Workplan Index + +```task +id: STATE-WP-0056-T05 +status: done +priority: medium +state_hub_task_id: "0c88c1a2-588b-41f8-bc1c-f94c8b4b0d1a" +``` + +Make `/workplans/index` resilient when repository filesystem scans are slow. + +Implementation notes: + +- Add singleflight behavior so concurrent requests share one in-progress + rebuild instead of starting or waiting on redundant scans. 
+- Return stale cached data quickly while a background refresh runs when the + cache is expired but still available. +- Include metadata such as `generated_at`, `stale`, `cache_age_seconds`, and + optionally `refresh_in_progress`. +- Consider reading only frontmatter rather than whole markdown files if this + can be done cleanly. +- Keep `refresh=true` as an explicit operator escape hatch. +- Add tests for cache hit, stale return, and forced refresh behavior. + +Done when a slow filesystem scan cannot block normal dashboard refreshes for +longer than the frontend timeout if cached data exists. + +## T06 — Use Browser and HTTP Caching Selectively + +```task +id: STATE-WP-0056-T06 +status: done +priority: medium +state_hub_task_id: "811f02ff-2e92-4c82-8b8a-e3d39a450b02" +``` + +Let stable lookup requests benefit from cache headers instead of forcing every +dashboard request to bypass caches. + +Implementation notes: + +- Extend `apiFetch` so callers can choose cache mode. +- Keep `no-store` for volatile mutation-sensitive resources. +- Use default browser caching or `reload` only where route cache headers are + already intentional, such as repo/topic lookup data. +- Review current route cache headers and align them with dashboard polling + needs. +- Avoid stale cached data for controls that immediately follow a mutation. + +Done when stable overview lookup data no longer bypasses useful cache headers +by default. + +## T07 — Optimize `/state/summary` Cache Misses + +```task +id: STATE-WP-0056-T07 +status: done +priority: medium +state_hub_task_id: "633f4cc6-ffeb-4086-9858-d239f50a9686" +``` + +Reduce the cost of a cold or expired `/state/summary` request. + +Implementation notes: + +- Profile the current sequential query groups in `api/routers/state.py`. +- Move Python-side counts and scans into SQL where straightforward. +- Remove unused work from the summary path, such as dead intermediate query + results. 
+- Cache derived sections independently when their freshness requirements differ. +- Add indexes only after profiling shows a query plan needs them. +- Keep summary response compatibility for existing consumers and MCP smoke + tests. + +Done when a summary cache miss stays comfortably below the frontend timeout +under the current local data volume. + +## T08 — Verify Under Dashboard-Style Load + +```task +id: STATE-WP-0056-T08 +status: done +priority: high +state_hub_task_id: "353fb25a-5306-416b-8d6d-9b201e6fac87" +``` + +Prove the dashboard no longer produces frequent abort warnings under realistic +refresh behavior. + +Implementation notes: + +- Add or document a repeatable script that performs dashboard-style concurrent + endpoint timing before and after the changes. +- Run API tests and dashboard component tests. +- Open the dashboard locally and verify that initial load, refresh, hidden-tab + pause/resume, and partial API failure states behave correctly. +- Confirm payload sizes are lower than the baseline for the overview page. +- Update `dashboard/src/docs/overview.md` and `dashboard/src/docs/live-data.md` + with the new data-loading model. + +Done when repeated dashboard refreshes do not show the global aborted-operation +warning during normal local operation, and degraded sections recover cleanly.