generated from coulomb/repo-seed
feat(ACTIVITY-WP-0014): explicit run-miss recovery policies (T02, T04)
Set Temporal catchup_window on cron schedules so a fire missed during a worker/Temporal outage is no longer silently dropped. Redefine misfire_policy into three explicit modes — skip, catchup_all, catchup_latest — mapping to (catchup_window, overlap) pairs; legacy catchup/compress aliased. Add catchup_window_seconds override. Remove the ad-hoc upsert-time 1h backfill in favour of native catchup. Apply catchup_latest to daily-statehub-wsjf-triage in the Railiance runtime manifest and document run-miss policies in the runbook. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -49,7 +49,18 @@ class CronTriggerConfig(BaseModel):
|
||||
)
|
||||
timezone: str = Field(default="UTC", description="IANA timezone name.")
|
||||
jitter_seconds: int = Field(default=0, ge=0)
|
||||
misfire_policy: Literal["skip", "catchup", "compress"] = Field(default="skip")
|
||||
# Run-miss recovery behaviour (ACTIVITY-WP-0014). What happens when a fire is
|
||||
# missed because the worker / Temporal was unavailable at trigger time:
|
||||
# skip - run on trigger or skip; a missed fire is never recovered
|
||||
# catchup_all - recover every fire missed during the outage window
|
||||
# catchup_latest - recover only the most recent missed fire; do not accumulate
|
||||
# Legacy aliases are accepted: catchup → catchup_all, compress → catchup_latest.
|
||||
misfire_policy: Literal[
|
||||
"skip", "catchup_all", "catchup_latest", "catchup", "compress"
|
||||
] = Field(default="skip")
|
||||
# Override the per-policy default catchup window (how far back Temporal will
|
||||
# recover missed fires after an outage). None uses the policy default.
|
||||
catchup_window_seconds: int | None = Field(default=None, ge=0)
|
||||
|
||||
|
||||
class EventTriggerConfig(BaseModel):
|
||||
|
||||
@@ -17,7 +17,6 @@ from temporalio.client import (
|
||||
Schedule,
|
||||
ScheduleActionStartWorkflow,
|
||||
ScheduleAlreadyRunningError,
|
||||
ScheduleBackfill,
|
||||
ScheduleCalendarSpec,
|
||||
ScheduleHandle,
|
||||
ScheduleOverlapPolicy,
|
||||
@@ -38,13 +37,49 @@ _ORCHESTRATOR_TASK_QUEUE = "orchestrator-tq"
|
||||
# RunActivityWorkflow detects this value and derives run dedup key from workflow_id.
|
||||
SCHEDULED_TRIGGER_KEY = "scheduled"
|
||||
|
||||
# T24: misfire_policy → ScheduleOverlapPolicy
|
||||
_MISFIRE_TO_OVERLAP: dict[str, ScheduleOverlapPolicy] = {
|
||||
"skip": ScheduleOverlapPolicy.SKIP,
|
||||
"catchup": ScheduleOverlapPolicy.BUFFER_ALL,
|
||||
"compress": ScheduleOverlapPolicy.BUFFER_ONE,
|
||||
# ACTIVITY-WP-0014: misfire_policy → run-miss recovery behaviour.
|
||||
#
|
||||
# A "missed fire" happens when the worker / Temporal is unavailable at trigger
|
||||
# time. Two Temporal levers together define the behaviour:
|
||||
# - catchup_window: how far back the server will recover missed fires once it
|
||||
# is healthy again. The previous code never set this, so a brief outage at
|
||||
# trigger time silently dropped the fire with no recovery and no signal.
|
||||
# - overlap: what to do when a (recovered) fire would start while a prior run
|
||||
# is still executing.
|
||||
#
|
||||
# Legacy values (catchup, compress) are aliased onto the explicit names.
|
||||
_MISFIRE_ALIASES: dict[str, str] = {
|
||||
"catchup": "catchup_all",
|
||||
"compress": "catchup_latest",
|
||||
}
|
||||
|
||||
# overlap policy + default catchup window (seconds) per normalised policy.
|
||||
_SKIP_WINDOW_SECONDS = 60
|
||||
_CATCHUP_ALL_WINDOW_SECONDS = 365 * 24 * 3600
|
||||
_CATCHUP_LATEST_WINDOW_SECONDS = 24 * 3600
|
||||
|
||||
_MISFIRE_TO_OVERLAP: dict[str, ScheduleOverlapPolicy] = {
|
||||
# Run on trigger or skip — recover nothing past a tiny grace window.
|
||||
"skip": ScheduleOverlapPolicy.SKIP,
|
||||
# Run on trigger or recover every missed fire during the outage window.
|
||||
"catchup_all": ScheduleOverlapPolicy.BUFFER_ALL,
|
||||
# Run on trigger or recover the most recent missed fire only; BUFFER_ONE
|
||||
# buffers at most one start and drops the rest, so a backlog never accumulates.
|
||||
"catchup_latest": ScheduleOverlapPolicy.BUFFER_ONE,
|
||||
}
|
||||
|
||||
_MISFIRE_DEFAULT_WINDOW: dict[str, int] = {
|
||||
"skip": _SKIP_WINDOW_SECONDS,
|
||||
"catchup_all": _CATCHUP_ALL_WINDOW_SECONDS,
|
||||
"catchup_latest": _CATCHUP_LATEST_WINDOW_SECONDS,
|
||||
}
|
||||
|
||||
|
||||
def _normalize_misfire_policy(misfire_policy: str) -> str:
|
||||
"""Map legacy aliases onto the explicit run-miss policy names."""
|
||||
canonical = _MISFIRE_ALIASES.get(misfire_policy, misfire_policy)
|
||||
return canonical if canonical in _MISFIRE_TO_OVERLAP else "skip"
|
||||
|
||||
|
||||
def schedule_id(activity_id: str | UUID) -> str:
|
||||
"""Return the canonical Temporal Schedule ID for an ActivityDefinition."""
|
||||
@@ -57,7 +92,15 @@ def smoke_schedule_id(activity_id: str | UUID) -> str:
|
||||
|
||||
|
||||
def _overlap_policy(misfire_policy: str) -> ScheduleOverlapPolicy:
|
||||
return _MISFIRE_TO_OVERLAP.get(misfire_policy, ScheduleOverlapPolicy.SKIP)
|
||||
return _MISFIRE_TO_OVERLAP[_normalize_misfire_policy(misfire_policy)]
|
||||
|
||||
|
||||
def _catchup_window(cfg: CronTriggerConfig) -> timedelta:
|
||||
"""Resolve the catchup window: explicit override, else the policy default."""
|
||||
if cfg.catchup_window_seconds is not None:
|
||||
return timedelta(seconds=cfg.catchup_window_seconds)
|
||||
policy = _normalize_misfire_policy(cfg.misfire_policy)
|
||||
return timedelta(seconds=_MISFIRE_DEFAULT_WINDOW[policy])
|
||||
|
||||
|
||||
def _build_schedule(defn: ActivityDefinition) -> Schedule:
|
||||
@@ -80,7 +123,10 @@ def _build_schedule(defn: ActivityDefinition) -> Schedule:
|
||||
jitter=timedelta(seconds=cfg.jitter_seconds) if cfg.jitter_seconds else None,
|
||||
)
|
||||
|
||||
policy = SchedulePolicy(overlap=_overlap_policy(cfg.misfire_policy))
|
||||
policy = SchedulePolicy(
|
||||
overlap=_overlap_policy(cfg.misfire_policy),
|
||||
catchup_window=_catchup_window(cfg),
|
||||
)
|
||||
state = ScheduleState(paused=not defn.enabled)
|
||||
|
||||
return Schedule(action=action, spec=spec, policy=policy, state=state)
|
||||
@@ -282,18 +328,10 @@ async def upsert_schedule(client: Client, defn: ActivityDefinition) -> ScheduleH
|
||||
else:
|
||||
await handle.pause(note="disabled via upsert_schedule")
|
||||
|
||||
# T24 catchup: backfill any fires missed in the last hour.
|
||||
if isinstance(defn.trigger_config, CronTriggerConfig):
|
||||
if defn.trigger_config.misfire_policy == "catchup":
|
||||
now = datetime.now(tz=timezone.utc)
|
||||
backfill_start = now - timedelta(hours=1)
|
||||
await handle.backfill(
|
||||
ScheduleBackfill(
|
||||
start_at=backfill_start,
|
||||
end_at=now,
|
||||
overlap=ScheduleOverlapPolicy.BUFFER_ALL,
|
||||
)
|
||||
)
|
||||
# ACTIVITY-WP-0014: missed-fire recovery is now handled natively by the
|
||||
# schedule's catchup_window (see _build_schedule), which the server applies
|
||||
# continuously after any outage — not only at upsert time. The previous
|
||||
# ad-hoc 1-hour backfill is therefore no longer needed.
|
||||
|
||||
return handle
|
||||
|
||||
|
||||
Reference in New Issue
Block a user