feat(WP-0004): railiance deployment & service ops

- Dockerfile (multi-stage, uv-based, slim runtime)
- .dockerignore
- docker-compose.railiance.yml (Temporal + NATS + PG, no Elasticsearch)
- GET /health endpoint (db + temporal probes, 200/503)
- .env.example (complete env var reference)
- Makefile: added migrate, sync-all, dev-up/down, railiance-up/down,
  start-worker, start-api, start-event-router, and help targets;
  moved the inline sync-event-types Python into scripts/sync_event_types.py
- SIGTERM graceful shutdown in worker.py and event_router.py
- docs/runbook.md: Railiance deployment section

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-15 00:04:39 +02:00
parent 987cf5a75c
commit 2a8e6cfe7f
11 changed files with 830 additions and 25 deletions

View File

@@ -30,8 +30,9 @@ from datetime import datetime, timezone
from typing import Any
from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from sqlalchemy import select
from sqlalchemy import select, text
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
from temporalio.client import Client
@@ -275,6 +276,32 @@ async def trigger_definition(definition_id: uuid.UUID) -> dict[str, str]:
# T42: Curator gate — event type approval endpoint
@app.get("/health")
async def health() -> JSONResponse:
    """Report whether the database and Temporal dependencies are reachable.

    Two independent probes are run: a ``SELECT 1`` executed through a database
    session, and a ``describe_namespace`` call for ``TEMPORAL_NAMESPACE``.
    A failing probe never propagates out of the endpoint; it is logged with
    its traceback and reported as ``False`` in the response body.

    Returns:
        A JSON response ``{"status": ..., "db": bool, "temporal": bool}``.
        Status is ``"ok"`` with HTTP 200 when both probes succeed, otherwise
        ``"degraded"`` with HTTP 503.
    """
    # Local import keeps the handler self-contained; a module-level logger
    # may or may not exist in this file.
    import logging

    log = logging.getLogger(__name__)

    db_ok = False
    temporal_ok = False

    try:
        async with _get_db()() as session:
            await session.execute(text("SELECT 1"))
        db_ok = True
    except Exception:
        # Best-effort probe: keep serving the health response, but record why
        # the database was considered unavailable.
        log.warning("Health check: database probe failed", exc_info=True)

    try:
        # NOTE(review): confirm that the object returned by _get_temporal()
        # actually exposes describe_namespace(); if it is a temporalio Client
        # the call may need to go through its workflow service — verify.
        await _get_temporal().describe_namespace(TEMPORAL_NAMESPACE)
        temporal_ok = True
    except Exception:
        # Same best-effort policy as the database probe above.
        log.warning("Health check: Temporal probe failed", exc_info=True)

    status = "ok" if db_ok and temporal_ok else "degraded"
    code = 200 if status == "ok" else 503
    return JSONResponse(
        {"status": status, "db": db_ok, "temporal": temporal_ok},
        status_code=code,
    )
@app.post("/event-types/{type_id}/approve", status_code=200)
async def approve_event_type(type_id: str) -> dict[str, str]:
"""Approve a pending event type, setting its status to 'active'.

View File

@@ -23,6 +23,7 @@ from __future__ import annotations
import asyncio
import logging
import os
import signal
import uuid
from datetime import datetime, timezone
from typing import Any
@@ -202,12 +203,19 @@ class EventRouter:
_CONSUMER_NAME,
)
loop = asyncio.get_running_loop()
stop = asyncio.Event()
loop.add_signal_handler(signal.SIGTERM, stop.set)
loop.add_signal_handler(signal.SIGINT, stop.set)
try:
await asyncio.Future() # run until cancelled
await stop.wait()
logger.info("Shutdown signal received — draining event router")
finally:
await sub.unsubscribe()
await self._nc.drain()
await engine.dispose()
logger.info("Event router stopped cleanly")
async def main() -> None:

View File

@@ -26,6 +26,7 @@ from __future__ import annotations
import asyncio
import logging
import os
import signal
from temporalio.client import Client
from temporalio.runtime import PrometheusConfig, Runtime, TelemetryConfig
@@ -102,12 +103,21 @@ async def run() -> None:
activities=[persist_task_instance],
)
loop = asyncio.get_running_loop()
stop = asyncio.Event()
loop.add_signal_handler(signal.SIGTERM, stop.set)
loop.add_signal_handler(signal.SIGINT, stop.set)
async with orchestrator_worker, task_worker:
print(
f"Workers running — queues: {ORCHESTRATOR_TASK_QUEUE!r}, "
f"{TASK_EXECUTION_TASK_QUEUE!r} (namespace={TEMPORAL_NAMESPACE!r})"
logger.info(
"Workers running — queues: %r, %r (namespace=%r)",
ORCHESTRATOR_TASK_QUEUE,
TASK_EXECUTION_TASK_QUEUE,
TEMPORAL_NAMESPACE,
)
await asyncio.Future() # run until cancelled
await stop.wait()
logger.info("Shutdown signal received — draining workers")
logger.info("Workers stopped cleanly")
if __name__ == "__main__":