generated from coulomb/repo-seed
feat(WP-0002): complete Triggers & Ops workstream
Delivers all 12 tasks (T22–T33): Temporal Schedule manager + startup sync, NATS JetStream event router, FastAPI CRUD + manual trigger, Prometheus metrics wiring, custom search-attribute tagging, and operational runbook. Marks workplan status as done. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
266
src/activity_core/api.py
Normal file
266
src/activity_core/api.py
Normal file
@@ -0,0 +1,266 @@
|
||||
"""FastAPI REST API for activity-core.
|
||||
|
||||
T30: CRUD for ActivityDefinition + manual one-shot trigger.
|
||||
|
||||
Endpoints:
|
||||
GET /activity-definitions/ — list all
|
||||
GET /activity-definitions/{id} — get one
|
||||
POST /activity-definitions/ — create
|
||||
PUT /activity-definitions/{id} — update
|
||||
DELETE /activity-definitions/{id} — delete
|
||||
POST /activity-definitions/{id}/trigger — manual one-shot run
|
||||
|
||||
Schedule lifecycle:
|
||||
- POST/PUT with trigger_type='cron' upserts a Temporal Schedule.
|
||||
- DELETE removes the Temporal Schedule if present.
|
||||
- /trigger starts RunActivityWorkflow directly (works for any trigger_type).
|
||||
|
||||
Run with:
|
||||
TEMPORAL_HOST=localhost:7233 \
|
||||
ACTCORE_DB_URL=postgresql+asyncpg://actcore:actcore@localhost:5433/actcore \
|
||||
uv run uvicorn activity_core.api:app --port 8010
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import uuid
|
||||
from contextlib import asynccontextmanager
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
from fastapi import FastAPI, HTTPException
|
||||
from pydantic import BaseModel
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
|
||||
from temporalio.client import Client
|
||||
|
||||
from activity_core.models import ActivityDefinition, CronTriggerConfig
|
||||
from activity_core.orm import ActivityDefinition as ActivityDefinitionRow
|
||||
from activity_core.schedule_manager import delete_schedule, upsert_schedule
|
||||
|
||||
TEMPORAL_HOST = os.environ.get("TEMPORAL_HOST", "localhost:7233")
|
||||
TEMPORAL_NAMESPACE = os.environ.get("TEMPORAL_NAMESPACE", "default")
|
||||
_ORCHESTRATOR_TASK_QUEUE = "orchestrator-tq"
|
||||
|
||||
# --- App state ---------------------------------------------------------------
|
||||
|
||||
_session_factory: async_sessionmaker[AsyncSession] | None = None
|
||||
_temporal_client: Client | None = None
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):  # type: ignore[type-arg]
    """Set up and tear down the process-wide DB session factory and Temporal client.

    Reads ACTCORE_DB_URL from the environment, creates the async engine and
    session factory, connects to Temporal, and yields control to FastAPI.

    Raises:
        RuntimeError: if ACTCORE_DB_URL is unset or empty.
    """
    global _session_factory, _temporal_client

    db_url = os.environ.get("ACTCORE_DB_URL")
    if not db_url:
        raise RuntimeError("ACTCORE_DB_URL is required")

    engine = create_async_engine(db_url)
    try:
        _session_factory = async_sessionmaker(engine, expire_on_commit=False)
        _temporal_client = await Client.connect(TEMPORAL_HOST, namespace=TEMPORAL_NAMESPACE)
        yield
    finally:
        # Dispose the pool even when the Temporal connection fails or the
        # application raises during shutdown, and clear the globals so stale
        # handles cannot be used after teardown.
        _session_factory = None
        _temporal_client = None
        await engine.dispose()
|
||||
|
||||
|
||||
app = FastAPI(title="activity-core API", lifespan=lifespan)
|
||||
|
||||
|
||||
def _get_db() -> async_sessionmaker[AsyncSession]:
    """Return the session factory created by ``lifespan``.

    Raises:
        RuntimeError: if called before application startup has run. An explicit
            raise is used instead of ``assert`` so the check survives ``python -O``.
    """
    if _session_factory is None:
        raise RuntimeError("database session factory is not initialised")
    return _session_factory
|
||||
|
||||
|
||||
def _get_temporal() -> Client:
    """Return the Temporal client created by ``lifespan``.

    Raises:
        RuntimeError: if called before application startup has run. An explicit
            raise is used instead of ``assert`` so the check survives ``python -O``.
    """
    if _temporal_client is None:
        raise RuntimeError("Temporal client is not initialised")
    return _temporal_client
|
||||
|
||||
|
||||
# --- Schemas -----------------------------------------------------------------
|
||||
|
||||
# Request body accepted by POST /activity-definitions/ (see create_definition).
class ActivityDefinitionCreate(BaseModel):
    name: str
    enabled: bool = True
    # create_definition reads the "trigger_type" key of this dict (defaulting to "")
    # to populate the row's trigger_type column.
    trigger_config: dict[str, Any]
    context_sources: list[dict[str, Any]] = []
    task_templates: list[dict[str, Any]] = []
    dedupe_key_strategy: str = "skip"
    version: int = 1
|
||||
|
||||
|
||||
# Request body accepted by PUT /activity-definitions/{id} (see update_definition).
# Every field is optional; update_definition leaves a column untouched when the
# corresponding field is None.
class ActivityDefinitionUpdate(BaseModel):
    name: str | None = None
    enabled: bool | None = None
    trigger_config: dict[str, Any] | None = None
    context_sources: list[dict[str, Any]] | None = None
    task_templates: list[dict[str, Any]] | None = None
    dedupe_key_strategy: str | None = None
    version: int | None = None
|
||||
|
||||
|
||||
# Response payload returned by the CRUD endpoints; built from an ORM row by
# _row_to_response, which copies each attribute of the same name.
class ActivityDefinitionResponse(BaseModel):
    id: uuid.UUID
    name: str
    enabled: bool
    trigger_type: str
    trigger_config: dict[str, Any]
    context_sources: list[dict[str, Any]]
    task_templates: list[dict[str, Any]]
    dedupe_key_strategy: str
    version: int
    created_at: datetime
    updated_at: datetime
|
||||
|
||||
|
||||
def _row_to_response(row: ActivityDefinitionRow) -> ActivityDefinitionResponse:
    """Build the API response model by copying same-named attributes off the ORM row."""
    copied_attrs = (
        "id",
        "name",
        "enabled",
        "trigger_type",
        "trigger_config",
        "context_sources",
        "task_templates",
        "dedupe_key_strategy",
        "version",
        "created_at",
        "updated_at",
    )
    payload = {attr: getattr(row, attr) for attr in copied_attrs}
    return ActivityDefinitionResponse(**payload)
|
||||
|
||||
|
||||
async def _upsert_schedule_if_cron(row: ActivityDefinitionRow) -> None:
    """Upsert a Temporal Schedule for the row if it uses a cron trigger.

    Schedule management is best-effort: any failure (invalid definition,
    Temporal error) is logged with its traceback and suppressed so the API
    call that persisted the row still succeeds.
    """
    import logging  # local import: this module has no module-level logger

    try:
        defn = ActivityDefinition.model_validate(
            {
                "id": row.id,
                "name": row.name,
                "enabled": row.enabled,
                "trigger_config": row.trigger_config,
                "context_sources": row.context_sources,
                "task_templates": row.task_templates,
                "dedupe_key_strategy": row.dedupe_key_strategy,
                "version": row.version,
            }
        )
        if isinstance(defn.trigger_config, CronTriggerConfig):
            await upsert_schedule(_get_temporal(), defn)
    except Exception:
        # Do not fail the request, but leave a trace: a silently missing
        # schedule is otherwise invisible to the operator.
        logging.getLogger(__name__).exception(
            "failed to upsert Temporal Schedule for activity %s", row.id
        )
|
||||
|
||||
|
||||
# --- Routes ------------------------------------------------------------------
|
||||
|
||||
@app.get("/activity-definitions/", response_model=list[ActivityDefinitionResponse])
async def list_definitions() -> list[ActivityDefinitionResponse]:
    """List all ActivityDefinitions."""
    # Unfiltered SELECT over the definitions table; each row is converted to
    # the response schema while the session is still open.
    session_factory = _get_db()
    async with session_factory() as db:
        query = select(ActivityDefinitionRow)
        fetched = await db.scalars(query)
        responses = []
        for record in fetched.all():
            responses.append(_row_to_response(record))
        return responses
|
||||
|
||||
|
||||
@app.get("/activity-definitions/{definition_id}", response_model=ActivityDefinitionResponse)
async def get_definition(definition_id: uuid.UUID) -> ActivityDefinitionResponse:
    """Get one ActivityDefinition by ID."""
    # Primary-key lookup; responds 404 when no row has this id.
    session_factory = _get_db()
    async with session_factory() as db:
        record = await db.get(ActivityDefinitionRow, definition_id)
        if record is not None:
            return _row_to_response(record)
        raise HTTPException(status_code=404, detail="ActivityDefinition not found")
|
||||
|
||||
|
||||
@app.post("/activity-definitions/", response_model=ActivityDefinitionResponse, status_code=201)
async def create_definition(body: ActivityDefinitionCreate) -> ActivityDefinitionResponse:
    """Create a new ActivityDefinition. Upserts a Temporal Schedule if trigger_type='cron'."""
    # The trigger_type column mirrors trigger_config["trigger_type"]; an absent
    # key is stored as the empty string.
    new_row = ActivityDefinitionRow(
        id=uuid.uuid4(),
        name=body.name,
        enabled=body.enabled,
        trigger_type=body.trigger_config.get("trigger_type", ""),
        trigger_config=body.trigger_config,
        context_sources=body.context_sources,
        task_templates=body.task_templates,
        dedupe_key_strategy=body.dedupe_key_strategy,
        version=body.version,
    )

    # Insert inside a single transaction that commits when the block exits.
    session_factory = _get_db()
    async with session_factory() as db, db.begin():
        db.add(new_row)

    # Schedule registration happens after the commit and never fails the request.
    await _upsert_schedule_if_cron(new_row)
    return _row_to_response(new_row)
|
||||
|
||||
|
||||
@app.put("/activity-definitions/{definition_id}", response_model=ActivityDefinitionResponse)
async def update_definition(
    definition_id: uuid.UUID, body: ActivityDefinitionUpdate
) -> ActivityDefinitionResponse:
    """Update an ActivityDefinition. Re-upserts the Temporal Schedule if trigger_type='cron'.

    Fields left as ``None`` in the body are not modified.

    Raises:
        HTTPException: 404 when no row has ``definition_id``.
    """
    Session = _get_db()
    async with Session() as session:
        row = await session.get(ActivityDefinitionRow, definition_id)
        if row is None:
            raise HTTPException(status_code=404, detail="ActivityDefinition not found")

        if body.trigger_config is not None:
            row.trigger_config = body.trigger_config
            # Keep the denormalised trigger_type column in step with the config;
            # fall back to the current value when the key is absent.
            row.trigger_type = body.trigger_config.get("trigger_type", row.trigger_type)

        for field in (
            "name",
            "enabled",
            "context_sources",
            "task_templates",
            "dedupe_key_strategy",
            "version",
        ):
            value = getattr(body, field)
            if value is not None:
                setattr(row, field, value)

        # session.get() above has already auto-begun a transaction, so calling
        # session.begin() here would raise InvalidRequestError. The row is
        # already attached to the session; committing flushes the changes.
        await session.commit()

    # NOTE(review): when trigger_type changes away from 'cron' the existing
    # Temporal Schedule is not removed here; only the startup sync cleans it up.
    await _upsert_schedule_if_cron(row)
    return _row_to_response(row)
|
||||
|
||||
|
||||
@app.delete("/activity-definitions/{definition_id}", status_code=204)
async def delete_definition(definition_id: uuid.UUID) -> None:
    """Delete an ActivityDefinition and its Temporal Schedule if present.

    Raises:
        HTTPException: 404 when no row has ``definition_id``.
    """
    Session = _get_db()
    async with Session() as session:
        row = await session.get(ActivityDefinitionRow, definition_id)
        if row is None:
            raise HTTPException(status_code=404, detail="ActivityDefinition not found")

        # session.get() has already auto-begun a transaction, so an explicit
        # session.begin() would raise InvalidRequestError; delete and commit
        # within the existing transaction instead.
        await session.delete(row)
        await session.commit()

    # Remove the schedule only after the row deletion has been committed.
    await delete_schedule(_get_temporal(), definition_id)
|
||||
|
||||
|
||||
@app.post("/activity-definitions/{definition_id}/trigger", status_code=202)
async def trigger_definition(definition_id: uuid.UUID) -> dict[str, str]:
    """Manually trigger a one-shot RunActivityWorkflow for any ActivityDefinition."""
    # Existence check only: the row's contents are not passed to the workflow.
    session_factory = _get_db()
    async with session_factory() as db:
        if await db.get(ActivityDefinitionRow, definition_id) is None:
            raise HTTPException(status_code=404, detail="ActivityDefinition not found")

    # A random suffix makes every manual run a distinct workflow.
    trigger_key = f"manual-{uuid.uuid4()}"
    fired_at = datetime.now(tz=timezone.utc).isoformat()
    workflow_args = [str(definition_id), trigger_key, fired_at]

    temporal = _get_temporal()
    handle = await temporal.start_workflow(
        "RunActivityWorkflow",
        args=workflow_args,
        id=f"activity-{definition_id}:{trigger_key}",
        task_queue=_ORCHESTRATOR_TASK_QUEUE,
    )
    return {"workflow_id": handle.id, "trigger_key": trigger_key}
|
||||
226
src/activity_core/event_router.py
Normal file
226
src/activity_core/event_router.py
Normal file
@@ -0,0 +1,226 @@
|
||||
"""Event Router — NATS JetStream consumer that routes events to RunActivityWorkflow.
|
||||
|
||||
T26: EventRouter class — connects to NATS JetStream, subscribes to activity.>
|
||||
T27: Routing rules — match event.type + payload filters to enabled ActivityDefinitions
|
||||
T28: Start/signal workflow from Event Router with idempotent workflow ID
|
||||
|
||||
Stream: ACTIVITY_EVENTS
|
||||
Subject: activity.>
|
||||
Consumer: activity-core-event-router (durable, push-based)
|
||||
|
||||
Message ack happens only after the workflow has been successfully started,
|
||||
giving at-least-once delivery semantics.
|
||||
|
||||
Usage:
|
||||
NATS_URL=nats://localhost:4222 \
|
||||
ACTCORE_DB_URL=postgresql+asyncpg://... \
|
||||
TEMPORAL_HOST=localhost:7233 \
|
||||
python -m activity_core.event_router
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
import nats
|
||||
import nats.js.api
|
||||
from nats.aio.client import Client as NATSClient
|
||||
from nats.js.client import JetStreamContext
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
|
||||
from temporalio.client import Client as TemporalClient
|
||||
from temporalio.common import WorkflowIDConflictPolicy
|
||||
from temporalio.exceptions import WorkflowAlreadyStartedError
|
||||
|
||||
from activity_core.models import EventEnvelope, EventTriggerConfig
|
||||
from activity_core.orm import ActivityDefinition as ActivityDefinitionRow
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
NATS_URL = os.environ.get("NATS_URL", "nats://localhost:4222")
|
||||
TEMPORAL_HOST = os.environ.get("TEMPORAL_HOST", "localhost:7233")
|
||||
TEMPORAL_NAMESPACE = os.environ.get("TEMPORAL_NAMESPACE", "default")
|
||||
|
||||
_STREAM_NAME = "ACTIVITY_EVENTS"
|
||||
_SUBJECT = "activity.>"
|
||||
_CONSUMER_NAME = "activity-core-event-router"
|
||||
_ORCHESTRATOR_TASK_QUEUE = "orchestrator-tq"
|
||||
|
||||
|
||||
class EventRouter:
    """Subscribes to NATS JetStream and routes incoming events to Temporal workflows."""

    def __init__(
        self,
        nats_url: str,
        temporal_client: TemporalClient,
        db_url: str,
    ) -> None:
        """Store connection settings; no I/O happens until ``start`` is called."""
        self._nats_url = nats_url
        self._temporal = temporal_client
        self._db_url = db_url
        self._nc: NATSClient | None = None
        self._js: JetStreamContext | None = None
        self._session_factory: async_sessionmaker[AsyncSession] | None = None

    async def _ensure_stream(self, js: JetStreamContext) -> None:
        """Create the ACTIVITY_EVENTS stream if no stream captures the subject yet."""
        from nats.js.errors import NotFoundError

        try:
            await js.find_stream_name_by_subject(_SUBJECT)
        except NotFoundError:
            # Only a genuine "no such stream" answer triggers creation;
            # connection or permission errors propagate to the caller.
            await js.add_stream(
                nats.js.api.StreamConfig(
                    name=_STREAM_NAME,
                    subjects=[_SUBJECT],
                )
            )
            logger.info("created JetStream stream %r", _STREAM_NAME)

    # T27: Load all enabled event-trigger ActivityDefinitions from DB.
    async def _load_event_definitions(
        self,
    ) -> list[tuple[str, EventTriggerConfig]]:
        """Return list of (activity_id, EventTriggerConfig) for enabled event defs.

        Rows whose trigger_config fails validation are logged and skipped.

        Raises:
            RuntimeError: if called before ``start`` created the session factory.
        """
        if self._session_factory is None:
            raise RuntimeError("EventRouter.start() must be called first")

        async with self._session_factory() as session:
            rows = (
                await session.scalars(
                    select(ActivityDefinitionRow).where(
                        ActivityDefinitionRow.trigger_type == "event",
                        ActivityDefinitionRow.enabled.is_(True),
                    )
                )
            ).all()

        result: list[tuple[str, EventTriggerConfig]] = []
        for row in rows:
            try:
                cfg = EventTriggerConfig.model_validate(row.trigger_config)
            except Exception:
                logger.warning(
                    "skipping malformed trigger_config for activity %s",
                    row.id,
                    exc_info=True,
                )
                continue
            result.append((str(row.id), cfg))
        return result

    # T27: Match an envelope against the routing rules.
    def _matches(self, envelope: EventEnvelope, cfg: EventTriggerConfig) -> bool:
        """Return True if the envelope matches the EventTriggerConfig.

        The event type must be equal, and every filter key/value pair must be
        present with an equal value in ``envelope.payload``.
        """
        if envelope.type != cfg.event_type:
            return False
        return all(
            envelope.payload.get(key) == value for key, value in cfg.filters.items()
        )

    # T28: Start RunActivityWorkflow for a matched activity.
    async def _dispatch(self, activity_id: str, envelope: EventEnvelope) -> None:
        """Start RunActivityWorkflow for one matched activity.

        Workflow ID is deterministic: activity-{activity_id}:{event_id}.
        Two policies together make redelivery idempotent:
        - id_conflict_policy=FAIL rejects the start while a run with this ID
          is still open;
        - id_reuse_policy=REJECT_DUPLICATE rejects it after that run has closed.
        Either rejection surfaces as WorkflowAlreadyStartedError and is skipped.
        """
        from temporalio.common import WorkflowIDReusePolicy

        workflow_id = f"activity-{activity_id}:{envelope.event_id}"
        try:
            await self._temporal.start_workflow(
                "RunActivityWorkflow",
                args=[activity_id, envelope.event_id, envelope.occurred_at.isoformat()],
                id=workflow_id,
                task_queue=_ORCHESTRATOR_TASK_QUEUE,
                id_conflict_policy=WorkflowIDConflictPolicy.FAIL,
                id_reuse_policy=WorkflowIDReusePolicy.REJECT_DUPLICATE,
            )
        except WorkflowAlreadyStartedError:
            # Duplicate delivery — workflow already running or completed; safe to skip.
            logger.debug("duplicate event %r for activity %s — skipped", envelope.event_id, activity_id)
        else:
            logger.info(
                "started workflow %r for event %r (activity %s)",
                workflow_id,
                envelope.event_id,
                activity_id,
            )

    async def _handle_message(self, msg: Any) -> None:
        """Decode a NATS message, match it against routing rules, and dispatch.

        Ack semantics:
        - unparsable message: terminated, because redelivery cannot fix it;
        - no matching definition: acked;
        - matched: acked only after every dispatch returned. If a dispatch
          raises, the message stays un-acked and JetStream redelivers it.
        """
        try:
            raw = json.loads(msg.data.decode())
            envelope = EventEnvelope.model_validate(raw)
        except Exception:
            # A nak would put the same poison message straight back in the queue.
            logger.warning(
                "failed to parse event envelope from NATS message — terminating",
                exc_info=True,
            )
            await msg.term()
            return

        # T27: Reload routing table per message so hot changes take effect.
        event_defs = await self._load_event_definitions()
        matched = [aid for aid, cfg in event_defs if self._matches(envelope, cfg)]

        if not matched:
            logger.debug("event %r type=%r matched no definitions", envelope.event_id, envelope.type)
            await msg.ack()
            return

        # T28: Start a workflow for each matched activity.
        for activity_id in matched:
            await self._dispatch(activity_id, envelope)

        # Ack only after all dispatches succeed (at-least-once guarantee).
        await msg.ack()

    async def start(self) -> None:
        """Connect to NATS, set up the stream/consumer, and begin processing.

        Blocks until cancelled. Each resource is released in reverse order of
        acquisition, including when setup fails part-way through.
        """
        engine = create_async_engine(self._db_url)
        self._session_factory = async_sessionmaker(engine, expire_on_commit=False)
        try:
            self._nc = await nats.connect(self._nats_url)
            try:
                self._js = self._nc.jetstream()
                await self._ensure_stream(self._js)

                # Durable push consumer — survives restarts, replays unacked messages.
                sub = await self._js.subscribe(
                    _SUBJECT,
                    durable=_CONSUMER_NAME,
                    cb=self._handle_message,
                    manual_ack=True,
                )
                logger.info(
                    "EventRouter listening on subject %r (stream=%r, consumer=%r)",
                    _SUBJECT,
                    _STREAM_NAME,
                    _CONSUMER_NAME,
                )
                try:
                    await asyncio.Future()  # run until cancelled
                finally:
                    await sub.unsubscribe()
            finally:
                await self._nc.drain()
        finally:
            await engine.dispose()
|
||||
|
||||
|
||||
async def main() -> None:
    """Entry point: configure logging, connect to Temporal, and run the router.

    Raises:
        RuntimeError: if ACTCORE_DB_URL is unset or empty.
    """
    logging.basicConfig(level=logging.INFO)

    database_url = os.environ.get("ACTCORE_DB_URL")
    if not database_url:
        raise RuntimeError("ACTCORE_DB_URL is required")

    temporal = await TemporalClient.connect(TEMPORAL_HOST, namespace=TEMPORAL_NAMESPACE)
    # start() blocks until the task is cancelled.
    await EventRouter(
        nats_url=NATS_URL,
        temporal_client=temporal,
        db_url=database_url,
    ).start()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
163
src/activity_core/schedule_manager.py
Normal file
163
src/activity_core/schedule_manager.py
Normal file
@@ -0,0 +1,163 @@
|
||||
"""Temporal Schedule management for activity-core.
|
||||
|
||||
T22: upsert_schedule, delete_schedule, list_schedules
|
||||
T24: misfire_policy → ScheduleOverlapPolicy mapping (all three policies)
|
||||
|
||||
Schedule ID convention: activity-schedule-{activity_definition.id}
|
||||
Workflow triggered: RunActivityWorkflow on orchestrator-tq
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from uuid import UUID
|
||||
|
||||
from temporalio.client import (
|
||||
Client,
|
||||
Schedule,
|
||||
ScheduleActionStartWorkflow,
|
||||
ScheduleBackfill,
|
||||
ScheduleHandle,
|
||||
ScheduleOverlapPolicy,
|
||||
SchedulePolicy,
|
||||
ScheduleSpec,
|
||||
ScheduleState,
|
||||
ScheduleUpdate,
|
||||
ScheduleUpdateInput,
|
||||
)
|
||||
from temporalio.service import RPCError
|
||||
|
||||
from activity_core.models import ActivityDefinition, CronTriggerConfig
|
||||
|
||||
_ORCHESTRATOR_TASK_QUEUE = "orchestrator-tq"
|
||||
|
||||
# Trigger_key sentinel used when a workflow is started by a Temporal Schedule.
|
||||
# RunActivityWorkflow detects this value and derives run dedup key from workflow_id.
|
||||
SCHEDULED_TRIGGER_KEY = "scheduled"
|
||||
|
||||
# T24: misfire_policy → ScheduleOverlapPolicy
|
||||
_MISFIRE_TO_OVERLAP: dict[str, ScheduleOverlapPolicy] = {
|
||||
"skip": ScheduleOverlapPolicy.SKIP,
|
||||
"catchup": ScheduleOverlapPolicy.BUFFER_ALL,
|
||||
"compress": ScheduleOverlapPolicy.BUFFER_ONE,
|
||||
}
|
||||
|
||||
|
||||
def schedule_id(activity_id: str | UUID) -> str:
    """Return the canonical Temporal Schedule ID for an ActivityDefinition.

    The ID is the fixed prefix ``activity-schedule-`` followed by the string
    form of ``activity_id``.
    """
    return "activity-schedule-" + str(activity_id)
|
||||
|
||||
|
||||
def _overlap_policy(misfire_policy: str) -> ScheduleOverlapPolicy:
    """Map a misfire_policy name to its overlap policy; unknown names fall back to SKIP."""
    try:
        return _MISFIRE_TO_OVERLAP[misfire_policy]
    except KeyError:
        return ScheduleOverlapPolicy.SKIP
|
||||
|
||||
|
||||
def _build_schedule(defn: ActivityDefinition) -> Schedule:
    """Construct a Temporal Schedule object from a cron ActivityDefinition.

    The schedule starts RunActivityWorkflow on the orchestrator task queue with
    args ``[activity id, SCHEDULED_TRIGGER_KEY, None]``, and is created paused
    when the definition is disabled.
    """
    assert isinstance(defn.trigger_config, CronTriggerConfig)
    cron_cfg: CronTriggerConfig = defn.trigger_config

    # A zero or missing jitter is passed to Temporal as "no jitter".
    if cron_cfg.jitter_seconds:
        jitter = timedelta(seconds=cron_cfg.jitter_seconds)
    else:
        jitter = None

    # NOTE(review): the workflow ID embeds the literal text ${firstScheduledTime}.
    # Confirm whether Temporal substitutes it or simply appends the scheduled
    # time to the ID as given.
    return Schedule(
        action=ScheduleActionStartWorkflow(
            "RunActivityWorkflow",
            args=[str(defn.id), SCHEDULED_TRIGGER_KEY, None],
            id=f"activity-{defn.id}:${{firstScheduledTime}}",
            task_queue=_ORCHESTRATOR_TASK_QUEUE,
        ),
        spec=ScheduleSpec(
            cron_expressions=[cron_cfg.cron_expression],
            timezone_name=cron_cfg.timezone,
            jitter=jitter,
        ),
        policy=SchedulePolicy(overlap=_overlap_policy(cron_cfg.misfire_policy)),
        state=ScheduleState(paused=not defn.enabled),
    )
|
||||
|
||||
|
||||
async def upsert_schedule(client: Client, defn: ActivityDefinition) -> ScheduleHandle:
    """Create or update a Temporal Schedule for a cron ActivityDefinition.

    - Only operates on definitions with trigger_type='cron'.
    - If enabled=False the schedule is created paused.
    - For misfire_policy='catchup', triggers a backfill covering the last hour
      after each upsert to replay any recently missed fires.

    Returns the ScheduleHandle for the created/updated schedule.

    Raises:
        ValueError: if the definition does not use a cron trigger.
        RPCError: for any Temporal failure other than "schedule already exists".
    """
    from temporalio.client import ScheduleAlreadyRunningError
    from temporalio.service import RPCStatusCode

    if not isinstance(defn.trigger_config, CronTriggerConfig):
        raise ValueError(
            f"upsert_schedule requires trigger_type='cron', "
            f"got {defn.trigger_config.trigger_type!r}"
        )

    sid = schedule_id(defn.id)
    sched = _build_schedule(defn)

    handle: ScheduleHandle | None
    try:
        handle = await client.create_schedule(sid, sched)
    except ScheduleAlreadyRunningError:
        # The SDK's dedicated "schedule ID already exists" signal.
        handle = None
    except RPCError as err:
        # Only ALREADY_EXISTS means "update instead"; connectivity or
        # permission failures must not be mistaken for an existing schedule.
        if err.status != RPCStatusCode.ALREADY_EXISTS:
            raise
        handle = None

    if handle is None:
        # Schedule already exists — update it in place.
        handle = client.get_schedule_handle(sid)

        async def _updater(input: ScheduleUpdateInput) -> ScheduleUpdate:  # noqa: ARG001
            return ScheduleUpdate(schedule=sched)

        await handle.update(_updater)

        # Sync pause state explicitly (the replacement schedule also carries
        # it in ScheduleState).
        if defn.enabled:
            await handle.unpause()
        else:
            await handle.pause(note="disabled via upsert_schedule")

    # T24 catchup: backfill any fires missed in the last hour.
    if defn.trigger_config.misfire_policy == "catchup":
        now = datetime.now(tz=timezone.utc)
        await handle.backfill(
            [
                ScheduleBackfill(
                    start_at=now - timedelta(hours=1),
                    end_at=now,
                    overlap=ScheduleOverlapPolicy.BUFFER_ALL,
                )
            ]
        )

    return handle
|
||||
|
||||
|
||||
async def delete_schedule(client: Client, activity_id: str | UUID) -> None:
    """Delete the Temporal Schedule for the given activity_id.

    No-op if the schedule does not exist.

    Raises:
        RPCError: for any Temporal failure other than NOT_FOUND, so an
            unreachable server is not reported as a successful delete.
    """
    from temporalio.service import RPCStatusCode

    handle = client.get_schedule_handle(schedule_id(activity_id))
    try:
        await handle.delete()
    except RPCError as err:
        if err.status != RPCStatusCode.NOT_FOUND:
            raise
        # Not found — treat as success.
|
||||
|
||||
|
||||
async def list_schedules(client: Client) -> list[dict]:
    """Enumerate all activity-core Temporal Schedules.

    Only schedules whose ID starts with ``activity-schedule-`` are reported;
    the remainder of the ID is returned as the activity ID.

    Returns a list of dicts: [{"schedule_id": str, "activity_id": str}, ...]
    """
    marker = "activity-schedule-"
    cut = len(marker)
    listing = await client.list_schedules()
    return [
        {"schedule_id": item.id, "activity_id": item.id[cut:]}
        async for item in listing
        if item.id.startswith(marker)
    ]
|
||||
123
src/activity_core/sync_schedules.py
Normal file
123
src/activity_core/sync_schedules.py
Normal file
@@ -0,0 +1,123 @@
|
||||
"""Bootstrap script: sync Temporal Schedules with the ActivityDefinition DB.
|
||||
|
||||
T23: On startup, ensures every cron ActivityDefinition (disabled ones are kept paused) has a
|
||||
Temporal Schedule, and removes orphaned schedules that have no matching DB row.
|
||||
|
||||
Run directly:
|
||||
ACTCORE_DB_URL=... uv run python -m activity_core.sync_schedules
|
||||
|
||||
Also called from worker.py before the worker enters its run loop.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import uuid
|
||||
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
|
||||
from temporalio.client import Client
|
||||
|
||||
from activity_core.models import ActivityDefinition, CronTriggerConfig
|
||||
from activity_core.orm import ActivityDefinition as ActivityDefinitionRow
|
||||
from activity_core.schedule_manager import delete_schedule, list_schedules, upsert_schedule
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
TEMPORAL_HOST = os.environ.get("TEMPORAL_HOST", "localhost:7233")
|
||||
TEMPORAL_NAMESPACE = os.environ.get("TEMPORAL_NAMESPACE", "default")
|
||||
|
||||
|
||||
def _row_to_domain(row: ActivityDefinitionRow) -> ActivityDefinition:
    """Convert an ORM row to a domain ActivityDefinition for schedule_manager.

    Only the listed columns are forwarded; validation is delegated to
    ``ActivityDefinition.model_validate``.
    """
    forwarded = (
        "id",
        "name",
        "enabled",
        "trigger_config",
        "context_sources",
        "task_templates",
        "dedupe_key_strategy",
        "version",
    )
    raw = {column: getattr(row, column) for column in forwarded}
    return ActivityDefinition.model_validate(raw)
|
||||
|
||||
|
||||
async def sync(client: Client, db_url: str) -> None:
    """Reconcile Temporal Schedules against the ActivityDefinition table.

    Steps:
      1. Load all cron ActivityDefinitions (enabled and disabled) from Postgres.
      2. Upsert a Temporal Schedule for each one; disabled definitions get a
         paused schedule.
      3. Delete Temporal Schedules whose activity_id has no matching DB row
         (tombstone cleanup for deleted or trigger-type-changed definitions).

    A row that fails validation or whose upsert fails is logged and counted;
    it does not abort the reconciliation of the remaining rows.
    """
    engine = create_async_engine(db_url)
    session_factory = async_sessionmaker(engine, expire_on_commit=False)

    try:
        async with session_factory() as session:
            rows = (
                await session.scalars(
                    select(ActivityDefinitionRow).where(
                        ActivityDefinitionRow.trigger_type == "cron"
                    )
                )
            ).all()
    finally:
        await engine.dispose()

    db_activity_ids: set[str] = set()
    upserted = 0
    skipped = 0
    failed = 0

    for row in rows:
        try:
            defn = _row_to_domain(row)
        except Exception:
            # The row exists but cannot be interpreted: keep its schedule out
            # of the orphan sweep below rather than deleting it on a guess.
            db_activity_ids.add(str(row.id))
            failed += 1
            logger.exception("skipping invalid cron definition %s", row.id)
            continue

        if not isinstance(defn.trigger_config, CronTriggerConfig):
            continue  # should not happen given the WHERE clause, but guard anyway

        db_activity_ids.add(str(defn.id))

        try:
            # upsert_schedule handles the paused state for disabled definitions.
            await upsert_schedule(client, defn)
        except Exception:
            failed += 1
            logger.exception("failed to upsert schedule for activity %s", defn.id)
            continue

        if defn.enabled:
            upserted += 1
            logger.info("upserted schedule for activity %s (%s)", defn.id, defn.name)
        else:
            skipped += 1
            logger.info("upserted paused schedule for disabled activity %s", defn.id)

    # Tombstone cleanup: remove Temporal Schedules with no matching DB row.
    deleted = 0
    for entry in await list_schedules(client):
        if entry["activity_id"] in db_activity_ids:
            continue
        await delete_schedule(client, entry["activity_id"])
        deleted += 1
        logger.info("deleted orphaned schedule %s", entry["schedule_id"])

    logger.info(
        "sync_schedules complete — upserted=%d skipped_disabled=%d deleted_orphans=%d failed=%d",
        upserted,
        skipped,
        deleted,
        failed,
    )
|
||||
|
||||
|
||||
async def main() -> None:
    """Entry point: configure logging, connect to Temporal, and run one sync pass.

    Raises:
        RuntimeError: if ACTCORE_DB_URL is unset or empty.
    """
    logging.basicConfig(level=logging.INFO)

    database_url = os.environ.get("ACTCORE_DB_URL")
    if not database_url:
        raise RuntimeError("ACTCORE_DB_URL is required")

    temporal = await Client.connect(TEMPORAL_HOST, namespace=TEMPORAL_NAMESPACE)
    await sync(temporal, database_url)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -4,23 +4,31 @@ Starts two workers (wired up in T20):
|
||||
- orchestrator-tq: RunActivityWorkflow + its activities
|
||||
- task-execution-tq: TaskExecutorWorkflow
|
||||
|
||||
T23: Calls sync_schedules before entering the worker run loop to ensure
|
||||
all cron ActivityDefinitions have live Temporal Schedules.
|
||||
|
||||
T31: Exposes Prometheus metrics via the Temporal SDK runtime on :9090/metrics.
|
||||
|
||||
Run with:
|
||||
TEMPORAL_HOST=localhost:7233 \
|
||||
ACTCORE_DB_URL=postgresql+asyncpg://actcore:actcore@localhost:5433/actcore \
|
||||
python -m activity_core.worker
|
||||
|
||||
Environment variables:
|
||||
TEMPORAL_HOST Temporal frontend address (default: localhost:7233)
|
||||
TEMPORAL_NAMESPACE Temporal namespace (default: default)
|
||||
ACTCORE_DB_URL App DB connection string (required)
|
||||
TEMPORAL_HOST Temporal frontend address (default: localhost:7233)
|
||||
TEMPORAL_NAMESPACE Temporal namespace (default: default)
|
||||
ACTCORE_DB_URL App DB connection string (required)
|
||||
PROMETHEUS_BIND_ADDR Prometheus metrics bind (default: 0.0.0.0:9090)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
|
||||
from temporalio.client import Client
|
||||
from temporalio.runtime import PrometheusConfig, Runtime, TelemetryConfig
|
||||
from temporalio.worker import Worker
|
||||
|
||||
from activity_core.activities import (
|
||||
@@ -30,10 +38,14 @@ from activity_core.activities import (
|
||||
persist_task_instance,
|
||||
resolve_context,
|
||||
)
|
||||
from activity_core.sync_schedules import sync as sync_schedules
|
||||
from activity_core.workflows import RunActivityWorkflow, TaskExecutorWorkflow
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
TEMPORAL_HOST = os.environ.get("TEMPORAL_HOST", "localhost:7233")
|
||||
TEMPORAL_NAMESPACE = os.environ.get("TEMPORAL_NAMESPACE", "default")
|
||||
PROMETHEUS_BIND_ADDR = os.environ.get("PROMETHEUS_BIND_ADDR", "0.0.0.0:9090")
|
||||
|
||||
ORCHESTRATOR_TASK_QUEUE = "orchestrator-tq"
|
||||
TASK_EXECUTION_TASK_QUEUE = "task-execution-tq"
|
||||
@@ -45,7 +57,23 @@ async def run() -> None:
|
||||
raise RuntimeError("ACTCORE_DB_URL is required")
|
||||
init_session_factory(db_url)
|
||||
|
||||
client = await Client.connect(TEMPORAL_HOST, namespace=TEMPORAL_NAMESPACE)
|
||||
# T31: Configure the Temporal SDK runtime to emit metrics in Prometheus format.
|
||||
runtime = Runtime(
|
||||
telemetry=TelemetryConfig(
|
||||
metrics=PrometheusConfig(bind_address=PROMETHEUS_BIND_ADDR)
|
||||
)
|
||||
)
|
||||
|
||||
client = await Client.connect(
|
||||
TEMPORAL_HOST, namespace=TEMPORAL_NAMESPACE, runtime=runtime
|
||||
)
|
||||
|
||||
# T23: Sync Temporal Schedules with the DB before workers start accepting tasks.
|
||||
logger.info("Syncing Temporal Schedules with ActivityDefinition DB...")
|
||||
try:
|
||||
await sync_schedules(client, db_url)
|
||||
except Exception:
|
||||
logger.exception("schedule sync failed — continuing worker startup")
|
||||
|
||||
orchestrator_worker = Worker(
|
||||
client,
|
||||
@@ -70,4 +98,5 @@ async def run() -> None:
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
asyncio.run(run())
|
||||
|
||||
@@ -15,7 +15,7 @@ import uuid
|
||||
from datetime import timedelta
|
||||
|
||||
from temporalio import workflow
|
||||
from temporalio.common import RetryPolicy
|
||||
from temporalio.common import RetryPolicy, SearchAttributeKey, TypedSearchAttributes, SearchAttributePair
|
||||
|
||||
with workflow.unsafe.imports_passed_through():
|
||||
from activity_core.activities import (
|
||||
@@ -25,6 +25,12 @@ with workflow.unsafe.imports_passed_through():
|
||||
resolve_context,
|
||||
)
|
||||
from activity_core.template_engine import evaluate_templates
|
||||
from activity_core.schedule_manager import SCHEDULED_TRIGGER_KEY
|
||||
|
||||
# T32: Custom search attributes for Temporal visibility (must be registered in Temporal first).
|
||||
# Registration (run once each for ActivityId and ActivityName): temporal operator search-attribute create --name <AttributeName> --type Keyword
|
||||
_ACTIVITY_ID_KEY = SearchAttributeKey.for_keyword("ActivityId")
|
||||
_ACTIVITY_NAME_KEY = SearchAttributeKey.for_keyword("ActivityName")
|
||||
|
||||
_RETRY_POLICY = RetryPolicy(
|
||||
initial_interval=timedelta(seconds=1),
|
||||
@@ -74,6 +80,16 @@ class RunActivityWorkflow:
|
||||
retry_policy=_RETRY_POLICY,
|
||||
)
|
||||
|
||||
# T32: Tag this workflow execution with activity metadata so runs are
|
||||
# filterable in the Temporal UI (requires ActivityId + ActivityName to be
|
||||
# registered as custom search attributes — see docs/runbook.md).
|
||||
workflow.upsert_search_attributes(
|
||||
TypedSearchAttributes([
|
||||
SearchAttributePair(_ACTIVITY_ID_KEY, activity_id),
|
||||
SearchAttributePair(_ACTIVITY_NAME_KEY, defn.get("name", "")),
|
||||
])
|
||||
)
|
||||
|
||||
# ── 2. Resolve context ────────────────────────────────────────────────
|
||||
context_snapshot: dict = await workflow.execute_activity(
|
||||
resolve_context,
|
||||
@@ -89,9 +105,14 @@ class RunActivityWorkflow:
|
||||
|
||||
# ── 4. Log the run ────────────────────────────────────────────────────
|
||||
# run_id is derived deterministically so log_run retries are idempotent.
|
||||
run_id = str(
|
||||
uuid.uuid5(uuid.NAMESPACE_URL, f"{activity_id}:{trigger_key}")
|
||||
)
|
||||
# For schedule-fired runs the trigger_key is the sentinel "scheduled";
|
||||
# each fire has a unique workflow_id (embeds ${firstScheduledTime}), so
|
||||
# we use the workflow_id as the dedup key instead.
|
||||
if trigger_key == SCHEDULED_TRIGGER_KEY:
|
||||
dedup_source = workflow.info().workflow_id
|
||||
else:
|
||||
dedup_source = f"{activity_id}:{trigger_key}"
|
||||
run_id = str(uuid.uuid5(uuid.NAMESPACE_URL, dedup_source))
|
||||
await workflow.execute_activity(
|
||||
log_run,
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user