"""Missed-fire detection for cron schedules (ACTIVITY-WP-0014, T03). Even with a catchup window configured, an operator wants to *know* when a fire was missed — especially under ``misfire_policy: skip`` where missed fires are dropped by design and leave no run and no failure event. This module turns the schedule's own bookkeeping into an explicit verdict and an optional State Hub alert so a miss is never invisible again. Temporal already counts fires that were dropped because they fell outside the catchup window in ``ScheduleInfo.num_actions_missed_catchup_window``. We surface that, plus a staleness check on the most recent fire, as a ``ScheduleHealth`` verdict. The verdict logic is a pure function so it is testable without a live Temporal server; ``check_schedule_health`` is the thin async reader. """ from __future__ import annotations import os from dataclasses import dataclass, field from datetime import datetime, timedelta, timezone from typing import Any from uuid import UUID import httpx from activity_core.schedule_manager import schedule_id from activity_core.state_hub_write import idempotency_headers _DEFAULT_STATE_HUB_URL = "http://127.0.0.1:8000" @dataclass(frozen=True) class ScheduleHealth: """Verdict for a single schedule's recent firing behaviour.""" activity_id: str healthy: bool missed_catchup_window: int last_fired_at: datetime | None staleness: timedelta | None reasons: list[str] = field(default_factory=list) @property def missed(self) -> bool: return not self.healthy def evaluate_schedule_health( *, activity_id: str, missed_catchup_window: int, last_fired_at: datetime | None, now: datetime, expected_interval: timedelta | None = None, tolerance: timedelta = timedelta(minutes=10), ) -> ScheduleHealth: """Pure verdict: was a fire missed? A schedule is unhealthy if Temporal dropped any fire past the catchup window, or — when ``expected_interval`` is known — if the most recent fire is older than one interval plus ``tolerance`` (i.e. a fire should have happened and did not). """ reasons: list[str] = [] if missed_catchup_window > 0: reasons.append( f"{missed_catchup_window} fire(s) dropped outside the catchup window" ) staleness: timedelta | None = None if last_fired_at is not None: staleness = now - last_fired_at if expected_interval is not None and staleness > expected_interval + tolerance: reasons.append( f"last fire was {staleness} ago, exceeding the expected " f"{expected_interval} interval" ) elif expected_interval is not None: reasons.append("no recorded fire for a schedule that should have fired") return ScheduleHealth( activity_id=activity_id, healthy=not reasons, missed_catchup_window=missed_catchup_window, last_fired_at=last_fired_at, staleness=staleness, reasons=reasons, ) def _extract_info(desc: Any) -> tuple[int, datetime | None]: """Pull (missed_catchup_window, last_fired_at) from a ScheduleDescription. Accesses are defensive so a Temporal SDK field rename degrades to "unknown" rather than raising inside an operational health check. """ info = getattr(desc, "info", None) missed = int(getattr(info, "num_actions_missed_catchup_window", 0) or 0) last_fired: datetime | None = None recent = getattr(info, "recent_actions", None) or [] times = [ getattr(a, "scheduled_at", None) or getattr(a, "started_at", None) for a in recent ] times = [t for t in times if t is not None] if times: last_fired = max(times) return missed, last_fired async def check_schedule_health( client: Any, activity_id: str | UUID, *, now: datetime | None = None, expected_interval: timedelta | None = None, tolerance: timedelta = timedelta(minutes=10), ) -> ScheduleHealth: """Describe the schedule for ``activity_id`` and evaluate its health.""" now = now or datetime.now(tz=timezone.utc) handle = client.get_schedule_handle(schedule_id(activity_id)) desc = await handle.describe() missed, last_fired = _extract_info(desc) return evaluate_schedule_health( activity_id=str(activity_id), missed_catchup_window=missed, last_fired_at=last_fired, now=now, expected_interval=expected_interval, tolerance=tolerance, ) def post_missed_fire_alert( health: ScheduleHealth, *, state_hub_url: str | None = None, author: str = "activity-core", topic_id: str | None = None, workstream_id: str | None = None, timeout_seconds: float = 10.0, ) -> dict[str, Any]: """Post a ``schedule_miss`` progress event to State Hub for an unhealthy schedule. No-op (returns ``status: ok``) when the schedule is healthy, so callers can invoke unconditionally. """ if health.healthy: return {"type": "schedule-miss-alert", "status": "ok"} base_url = state_hub_url or os.environ.get("STATE_HUB_URL", _DEFAULT_STATE_HUB_URL) base_url = str(base_url).rstrip("/") body: dict[str, Any] = { "event_type": "schedule_miss", "author": author, "summary": ( f"Schedule {health.activity_id} missed a fire: " + "; ".join(health.reasons) ), "detail": { "activity_id": health.activity_id, "missed_catchup_window": health.missed_catchup_window, "last_fired_at": ( health.last_fired_at.isoformat() if health.last_fired_at else None ), "staleness_seconds": ( health.staleness.total_seconds() if health.staleness else None ), "reasons": health.reasons, }, } if topic_id: body["topic_id"] = topic_id if workstream_id: body["workstream_id"] = workstream_id # Dedup repeated alerts for the same missed window (same schedule + last fire). last_fired = health.last_fired_at.isoformat() if health.last_fired_at else "none" resp = httpx.post( f"{base_url}/progress/", json=body, headers=idempotency_headers("schedule_miss", health.activity_id, last_fired), timeout=timeout_seconds, ) resp.raise_for_status() data = resp.json() return { "type": "schedule-miss-alert", "status": "posted", "progress_id": data.get("id"), }