feat(ACTIVITY-WP-0014): explicit run-miss recovery policies (T02, T04)

Set Temporal catchup_window on cron schedules so a fire missed during a
worker/Temporal outage is no longer silently dropped. Redefine misfire_policy
into three explicit modes — skip, catchup_all, catchup_latest — mapping to
(catchup_window, overlap) pairs; legacy catchup/compress aliased. Add
catchup_window_seconds override. Remove the ad-hoc upsert-time 1h backfill in
favour of native catchup. Apply catchup_latest to daily-statehub-wsjf-triage in
the Railiance runtime manifest and document run-miss policies in the runbook.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-23 14:15:45 +02:00
parent ffc0ee2cb7
commit a83b117f60
6 changed files with 181 additions and 29 deletions

View File

@@ -49,7 +49,18 @@ class CronTriggerConfig(BaseModel):
)
timezone: str = Field(default="UTC", description="IANA timezone name.")
jitter_seconds: int = Field(default=0, ge=0)
misfire_policy: Literal["skip", "catchup", "compress"] = Field(default="skip")
# Run-miss recovery behaviour (ACTIVITY-WP-0014). What happens when a fire is
# missed because the worker / Temporal was unavailable at trigger time:
# skip - run on trigger or skip; a fire missed by more than a short grace
# window (60 s by default) is never recovered
# catchup_all - recover every fire missed during the outage window
# catchup_latest - recover only the most recent missed fire; do not accumulate
# Legacy aliases are accepted: catchup → catchup_all, compress → catchup_latest.
misfire_policy: Literal[
"skip", "catchup_all", "catchup_latest", "catchup", "compress"
] = Field(default="skip")
# Override the per-policy default catchup window (how far back Temporal will
# recover missed fires after an outage). None uses the policy default.
catchup_window_seconds: int | None = Field(default=None, ge=0)
class EventTriggerConfig(BaseModel):

View File

@@ -17,7 +17,6 @@ from temporalio.client import (
Schedule,
ScheduleActionStartWorkflow,
ScheduleAlreadyRunningError,
ScheduleBackfill,
ScheduleCalendarSpec,
ScheduleHandle,
ScheduleOverlapPolicy,
@@ -38,13 +37,49 @@ _ORCHESTRATOR_TASK_QUEUE = "orchestrator-tq"
# RunActivityWorkflow detects this value and derives run dedup key from workflow_id.
SCHEDULED_TRIGGER_KEY = "scheduled"
# T24: misfire_policy → ScheduleOverlapPolicy
_MISFIRE_TO_OVERLAP: dict[str, ScheduleOverlapPolicy] = {
"skip": ScheduleOverlapPolicy.SKIP,
"catchup": ScheduleOverlapPolicy.BUFFER_ALL,
"compress": ScheduleOverlapPolicy.BUFFER_ONE,
# ACTIVITY-WP-0014: misfire_policy → run-miss recovery behaviour.
#
# A "missed fire" happens when the worker / Temporal is unavailable at trigger
# time. Two Temporal levers together define the behaviour:
# - catchup_window: how far back the server will recover missed fires once it
# is healthy again. The previous code never set this, so a brief outage at
# trigger time silently dropped the fire with no recovery and no signal.
# - overlap: what to do when a (recovered) fire would start while a prior run
# is still executing.
#
# Legacy values (catchup, compress) are aliased onto the explicit names.
# Legacy misfire_policy spellings mapped to the explicit names that replace
# them. _normalize_misfire_policy consults this table before any other lookup.
_MISFIRE_ALIASES: dict[str, str] = {
"catchup": "catchup_all",
"compress": "catchup_latest",
}
# Default catchup window, in seconds, for each normalised policy. A
# CronTriggerConfig.catchup_window_seconds override takes precedence over these
# (see _catchup_window).
_SKIP_WINDOW_SECONDS = 60  # one-minute grace window
_CATCHUP_ALL_WINDOW_SECONDS = 365 * 24 * 3600  # 365 days
_CATCHUP_LATEST_WINDOW_SECONDS = 24 * 3600  # 24 hours
# Overlap policy per normalised policy name. This table is also the source of
# truth for which policy names are valid: _normalize_misfire_policy falls back
# to "skip" for any name that is not a key here.
_MISFIRE_TO_OVERLAP: dict[str, ScheduleOverlapPolicy] = {
# Run on trigger or skip — recover nothing past a tiny grace window.
"skip": ScheduleOverlapPolicy.SKIP,
# Run on trigger or recover every missed fire during the outage window.
"catchup_all": ScheduleOverlapPolicy.BUFFER_ALL,
# Run on trigger or recover the most recent missed fire only; BUFFER_ONE
# buffers at most one start and drops the rest, so a backlog never accumulates.
# NOTE(review): Temporal describes BUFFER_ONE as keeping the *first* buffered
# start while a run is in progress, not the newest — confirm that "latest" is
# really the fire that survives.
"catchup_latest": ScheduleOverlapPolicy.BUFFER_ONE,
}
# Default catchup window per normalised policy name. Must carry exactly the same
# keys as _MISFIRE_TO_OVERLAP: _catchup_window indexes this table with a name
# that has only been validated against _MISFIRE_TO_OVERLAP.
_MISFIRE_DEFAULT_WINDOW: dict[str, int] = {
"skip": _SKIP_WINDOW_SECONDS,
"catchup_all": _CATCHUP_ALL_WINDOW_SECONDS,
"catchup_latest": _CATCHUP_LATEST_WINDOW_SECONDS,
}
def _normalize_misfire_policy(misfire_policy: str) -> str:
    """Resolve *misfire_policy* to a canonical run-miss policy name.

    Legacy spellings listed in ``_MISFIRE_ALIASES`` are translated first. Any
    name that still has no entry in ``_MISFIRE_TO_OVERLAP`` afterwards falls
    back to ``"skip"``.
    """
    resolved = _MISFIRE_ALIASES.get(misfire_policy, misfire_policy)
    if resolved not in _MISFIRE_TO_OVERLAP:
        # Unknown policy names degrade to the most conservative behaviour.
        return "skip"
    return resolved
def schedule_id(activity_id: str | UUID) -> str:
"""Return the canonical Temporal Schedule ID for an ActivityDefinition."""
@@ -57,7 +92,15 @@ def smoke_schedule_id(activity_id: str | UUID) -> str:
def _overlap_policy(misfire_policy: str) -> ScheduleOverlapPolicy:
    """Return the Temporal overlap policy for *misfire_policy*.

    The name is normalised before the lookup, so the legacy aliases
    ``catchup`` and ``compress`` resolve to the overlap policy of their
    explicit replacements instead of falling through to ``SKIP``. Names that
    normalisation does not recognise come back as ``"skip"`` and therefore map
    to ``ScheduleOverlapPolicy.SKIP``.
    """
    # A direct ``_MISFIRE_TO_OVERLAP.get(misfire_policy, SKIP)`` lookup would
    # bypass alias handling; always go through the normaliser.
    return _MISFIRE_TO_OVERLAP[_normalize_misfire_policy(misfire_policy)]
def _catchup_window(cfg: CronTriggerConfig) -> timedelta:
    """Return how far back missed fires are recovered for *cfg*.

    An explicit ``catchup_window_seconds`` (including ``0``) wins. When it is
    ``None``, the default for the normalised ``misfire_policy`` is taken from
    ``_MISFIRE_DEFAULT_WINDOW``.
    """
    override = cfg.catchup_window_seconds
    if override is None:
        canonical = _normalize_misfire_policy(cfg.misfire_policy)
        seconds = _MISFIRE_DEFAULT_WINDOW[canonical]
    else:
        seconds = override
    return timedelta(seconds=seconds)
def _build_schedule(defn: ActivityDefinition) -> Schedule:
@@ -80,7 +123,10 @@ def _build_schedule(defn: ActivityDefinition) -> Schedule:
jitter=timedelta(seconds=cfg.jitter_seconds) if cfg.jitter_seconds else None,
)
policy = SchedulePolicy(overlap=_overlap_policy(cfg.misfire_policy))
policy = SchedulePolicy(
overlap=_overlap_policy(cfg.misfire_policy),
catchup_window=_catchup_window(cfg),
)
state = ScheduleState(paused=not defn.enabled)
return Schedule(action=action, spec=spec, policy=policy, state=state)
@@ -282,18 +328,10 @@ async def upsert_schedule(client: Client, defn: ActivityDefinition) -> ScheduleH
else:
await handle.pause(note="disabled via upsert_schedule")
# T24 catchup: backfill any fires missed in the last hour.
if isinstance(defn.trigger_config, CronTriggerConfig):
if defn.trigger_config.misfire_policy == "catchup":
now = datetime.now(tz=timezone.utc)
backfill_start = now - timedelta(hours=1)
await handle.backfill(
ScheduleBackfill(
start_at=backfill_start,
end_at=now,
overlap=ScheduleOverlapPolicy.BUFFER_ALL,
)
)
# ACTIVITY-WP-0014: missed-fire recovery is now handled natively by the
# schedule's catchup_window (see _build_schedule), which the server applies
# continuously after any outage — not only at upsert time. The previous
# ad-hoc 1-hour backfill is therefore no longer needed.
return handle