From 2a8e6cfe7fe5d84490c5befff37eb9e0e032784c Mon Sep 17 00:00:00 2001 From: tegwick Date: Fri, 15 May 2026 00:04:39 +0200 Subject: [PATCH] feat(WP-0004): railiance deployment & service ops - Dockerfile (multi-stage, uv-based, slim runtime) - .dockerignore - docker-compose.railiance.yml (Temporal + NATS + PG, no Elasticsearch) - GET /health endpoint (db + temporal probes, 200/503) - .env.example (complete env var reference) - Makefile: migrate, sync-all, dev-up/down, railiance-up/down, start-worker, start-api, start-event-router, help targets; extracted sync-event-types Python to scripts/sync_event_types.py - SIGTERM graceful shutdown in worker.py and event_router.py - docs/runbook.md: Railiance deployment section Co-Authored-By: Claude Sonnet 4.6 --- .dockerignore | 7 + .env.example | 43 ++ Dockerfile | 19 + Makefile | 68 +++- docker-compose.railiance.yml | 175 +++++++++ docs/runbook.md | 72 ++++ scripts/sync_event_types.py | 21 + src/activity_core/api.py | 29 +- src/activity_core/event_router.py | 10 +- src/activity_core/worker.py | 18 +- workplans/custodian-WP-0004-railiance-ops.md | 393 +++++++++++++++++++ 11 files changed, 830 insertions(+), 25 deletions(-) create mode 100644 .dockerignore create mode 100644 .env.example create mode 100644 Dockerfile create mode 100644 docker-compose.railiance.yml create mode 100644 scripts/sync_event_types.py create mode 100644 workplans/custodian-WP-0004-railiance-ops.md diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..7e3c392 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,7 @@ +.venv/ +__pycache__/ +*.pyc +.git/ +tests/ +*.egg-info/ +.env diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..100dc2c --- /dev/null +++ b/.env.example @@ -0,0 +1,43 @@ +# ── Required ────────────────────────────────────────────────────────────────── +# PostgreSQL connection string for activity-core application data. 
+ACTCORE_DB_URL=postgresql+asyncpg://actcore:actcore@app-db:5432/actcore + +# ── Temporal ────────────────────────────────────────────────────────────────── +# Temporal frontend gRPC address. +TEMPORAL_HOST=temporal:7233 +# Temporal namespace (must exist before workers start). +TEMPORAL_NAMESPACE=default + +# ── NATS ────────────────────────────────────────────────────────────────────── +# NATS server URL. JetStream must be enabled (-js flag). +NATS_URL=nats://nats:4222 + +# ── Service integrations (gracefully degraded if unavailable) ───────────────── +# State Hub — used by the state-hub context adapter. Binds {} on failure. +STATE_HUB_URL=http://127.0.0.1:8000 +# Repo scoping — used by the repo-scoping context adapter. Binds {} on failure. +REPO_SCOPING_URL=http://127.0.0.1:8020 +# Issue Core — task emission backend. +ISSUE_CORE_URL=http://127.0.0.1:8010 +# Sink type: 'rest' (POST to issue-core) or 'null' (discard, for dry-run). +ISSUE_SINK_TYPE=rest + +# ── Activity definitions ─────────────────────────────────────────────────────── +# Colon-separated paths to additional activity-definitions/ directories. +# The local activity-definitions/ directory is always scanned. +ACTIVITY_DEFINITION_DIRS= + +# ── Observability ───────────────────────────────────────────────────────────── +# Prometheus metrics bind address (Temporal SDK metrics). +PROMETHEUS_BIND_ADDR=0.0.0.0:9090 + +# ── Security (webhook receiver) ─────────────────────────────────────────────── +# HMAC-SHA256 secret for Gitea webhook signature validation. +WEBHOOK_SECRET_GITEA= +# HMAC-SHA256 secret for GitHub webhook signature validation. +WEBHOOK_SECRET_GITHUB= + +# ── Curator gate ────────────────────────────────────────────────────────────── +# 'disabled': accepts active + pending event types (pending logged as warning). +# 'required': only active event types accepted; pending events are discarded. 
+ACTIVITY_CURATOR_GATE=disabled diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..c4b2c04 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,19 @@ +# Stage 1 — install Python deps +FROM python:3.12-slim AS builder +RUN pip install uv --no-cache-dir +WORKDIR /app +COPY pyproject.toml uv.lock ./ +COPY src/ ./src/ +RUN uv sync --no-dev --frozen + +# Stage 2 — runtime image +FROM python:3.12-slim AS runtime +WORKDIR /app +COPY --from=builder /app/.venv /app/.venv +COPY --from=builder /app/src /app/src +COPY activity-definitions/ ./activity-definitions/ +COPY event-types/ ./event-types/ +COPY tasks/ ./tasks/ +ENV PATH="/app/.venv/bin:$PATH" +ENV PYTHONPATH="/app/src" +CMD ["python", "-m", "activity_core.worker"] diff --git a/Makefile b/Makefile index f32880a..7425731 100644 --- a/Makefile +++ b/Makefile @@ -1,24 +1,54 @@ -.PHONY: sync-event-types sync-activity-definitions test +-include .env +export -sync-activity-definitions: +.PHONY: sync-event-types sync-activity-definitions test migrate sync-all \ + dev-up dev-down railiance-up railiance-down \ + start-worker start-api start-event-router help + +sync-activity-definitions: ## Sync ActivityDefinition files into DB uv run python -m activity_core.sync_activity_definitions -sync-event-types: - uv run python -c " -import asyncio, os -from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine -from activity_core.event_type_registry import sync_event_types +sync-event-types: ## Sync event type YAML files into DB + uv run python scripts/sync_event_types.py -async def main(): - db_url = os.environ.get('ACTCORE_DB_URL', 'postgresql+asyncpg://actcore:actcore@localhost:5433/actcore') - engine = create_async_engine(db_url) - factory = async_sessionmaker(engine, expire_on_commit=False) - n = await sync_event_types(factory) - print(f'Synced {n} event types') - await engine.dispose() - -asyncio.run(main()) -" - -test: +test: ## Run test suite uv run pytest tests/ -v + +# ── Database 
────────────────────────────────────────────────────────────────── + +migrate: ## Apply all pending Alembic migrations + uv run alembic upgrade head + +sync-all: sync-event-types sync-activity-definitions ## Sync event types and activity definitions + +# ── Infrastructure ───────────────────────────────────────────────────────────── + +dev-up: ## Start full dev stack (Temporal + PG + ES + NATS) + docker compose -f docker-compose.dev.yml up -d + +dev-down: ## Stop and remove dev stack containers + docker compose -f docker-compose.dev.yml down + +railiance-up: ## Build image and start full railiance stack (no Elasticsearch) + docker compose -f docker-compose.railiance.yml up -d --build + +railiance-down: ## Stop and remove railiance stack containers + docker compose -f docker-compose.railiance.yml down + +# ── Local dev processes ─────────────────────────────────────────────────────── + +start-worker: ## Start Temporal worker (reads env from .env if present) + uv run python -m activity_core.worker + +start-api: ## Start FastAPI server on :8010 with hot reload + uv run uvicorn activity_core.api:app --host 0.0.0.0 --port 8010 --reload + +start-event-router: ## Start NATS event router + uv run python -m activity_core.event_router + +# ── Help ────────────────────────────────────────────────────────────────────── + +help: ## Show this help message + @grep -E '^[a-zA-Z_-]+:.*?##' $(MAKEFILE_LIST) | \ + awk 'BEGIN {FS = ":.*?## "}; {printf " \033[36m%-24s\033[0m %s\n", $$1, $$2}' | \ + sort diff --git a/docker-compose.railiance.yml b/docker-compose.railiance.yml new file mode 100644 index 0000000..c16e9b8 --- /dev/null +++ b/docker-compose.railiance.yml @@ -0,0 +1,175 @@ +services: + + # ── Temporal persistence DB ────────────────────────────────────────────────── + temporal-db: + image: postgres:16 + environment: + POSTGRES_USER: temporal + POSTGRES_PASSWORD: temporal + POSTGRES_DB: temporal + volumes: + - temporal-db-data:/var/lib/postgresql/data + networks: + - 
actcore-net + healthcheck: + test: ["CMD-SHELL", "pg_isready -U temporal"] + interval: 5s + timeout: 5s + retries: 10 + + # ── Temporal server (PostgreSQL visibility, no Elasticsearch) ───────────────── + temporal: + image: temporalio/auto-setup:1.29.1 + depends_on: + temporal-db: + condition: service_healthy + environment: + DB: postgres12 + DB_PORT: 5432 + POSTGRES_USER: temporal + POSTGRES_PWD: temporal + POSTGRES_SEEDS: temporal-db + DYNAMIC_CONFIG_FILE_PATH: /etc/temporal/dynamicconfig.yaml + ENABLE_ES: "false" + VISIBILITY_DBNAME: temporal_visibility + TEMPORAL_ADDRESS: temporal:7233 + networks: + - actcore-net + healthcheck: + test: ["CMD-SHELL", "tctl --address temporal:7233 cluster health 2>&1 | grep -q SERVING"] + interval: 10s + timeout: 10s + retries: 20 + start_period: 30s + + # ── Temporal Web UI ─────────────────────────────────────────────────────────── + temporal-ui: + image: temporalio/ui:latest + depends_on: + temporal: + condition: service_healthy + environment: + TEMPORAL_ADDRESS: temporal:7233 + TEMPORAL_CORS_ORIGINS: http://localhost:8080 + ports: + - "8080:8080" + networks: + - actcore-net + + # ── NATS with JetStream ─────────────────────────────────────────────────────── + nats: + image: nats:2.10-alpine + command: ["-js", "-sd", "/data"] + volumes: + - nats-data:/data + ports: + - "4222:4222" + networks: + - actcore-net + healthcheck: + test: ["CMD-SHELL", "nats-server --help > /dev/null 2>&1 || wget -q -O- http://localhost:8222/healthz | grep -q ok"] + interval: 5s + timeout: 5s + retries: 10 + + # ── Application DB ──────────────────────────────────────────────────────────── + app-db: + image: postgres:16 + environment: + POSTGRES_USER: actcore + POSTGRES_PASSWORD: actcore + POSTGRES_DB: actcore + volumes: + - app-db-data:/var/lib/postgresql/data + networks: + - actcore-net + healthcheck: + test: ["CMD-SHELL", "pg_isready -U actcore"] + interval: 5s + timeout: 5s + retries: 10 + + # ── One-shot migration runner 
───────────────────────────────────────────────── + actcore-migrate: + build: . + command: ["python", "-m", "alembic", "upgrade", "head"] + env_file: .env + depends_on: + app-db: + condition: service_healthy + networks: + - actcore-net + restart: "no" + + # ── Temporal worker ─────────────────────────────────────────────────────────── + actcore-worker: + build: . + command: ["python", "-m", "activity_core.worker"] + env_file: .env + depends_on: + temporal: + condition: service_healthy + app-db: + condition: service_healthy + nats: + condition: service_healthy + actcore-migrate: + condition: service_completed_successfully + ports: + - "9090:9090" + networks: + - actcore-net + restart: unless-stopped + + # ── REST API ────────────────────────────────────────────────────────────────── + actcore-api: + build: . + command: ["uvicorn", "activity_core.api:app", "--host", "0.0.0.0", "--port", "8010"] + env_file: .env + depends_on: + temporal: + condition: service_healthy + app-db: + condition: service_healthy + nats: + condition: service_healthy + actcore-migrate: + condition: service_completed_successfully + ports: + - "8010:8010" + networks: + - actcore-net + restart: unless-stopped + healthcheck: + test: ["CMD-SHELL", "curl -sf http://localhost:8010/health"] + interval: 10s + timeout: 5s + retries: 5 + start_period: 15s + + # ── Event Router ────────────────────────────────────────────────────────────── + actcore-event-router: + build: . 
+ command: ["python", "-m", "activity_core.event_router"] + env_file: .env + depends_on: + temporal: + condition: service_healthy + app-db: + condition: service_healthy + nats: + condition: service_healthy + actcore-migrate: + condition: service_completed_successfully + networks: + - actcore-net + restart: unless-stopped + +volumes: + temporal-db-data: + app-db-data: + nats-data: + +networks: + actcore-net: + driver: bridge diff --git a/docs/runbook.md b/docs/runbook.md index 82f2c4b..a547196 100644 --- a/docs/runbook.md +++ b/docs/runbook.md @@ -213,6 +213,78 @@ uv run alembic history # show full migration history --- +## Railiance Deployment + +### Pre-requisites +- Docker ≥ 24 with Compose v2 (`docker compose` not `docker-compose`) +- ≥ 4 GB RAM available (Temporal server takes ~1 GB) +- Ports available: 4222 (NATS), 7233 (Temporal gRPC), 8010 (API), 8080 (Temporal UI), + 9090 (Prometheus metrics) + +### First-time setup + +```bash +# 1. Copy and edit the env file — fill in all secrets and URLs +cp .env.example .env + +# 2. Build the image and start all services +make railiance-up + +# 3. Wait for health (retry until 200) +curl -sf http://localhost:8010/health # → {"status":"ok","db":true,"temporal":true} + +# 4. Register Temporal search attributes (one-time per namespace) +docker compose -f docker-compose.railiance.yml exec temporal temporal operator search-attribute create \ + --name ActivityId --type Keyword \ + --name ActivityName --type Keyword \ + --address temporal:7233 + +# 5. 
Load event types and activity definitions +make sync-all +``` + +### Upgrade procedure + +```bash +git pull +make railiance-up # rebuilds image, restarts changed services +make migrate # apply any new migrations (safe to run when none pending) +curl -sf http://localhost:8010/health +``` + +### Health verification + +```bash +# API health (db + temporal probes) +curl -s http://localhost:8010/health | python3 -m json.tool + +# Temporal UI +open http://localhost:8080 + +# Prometheus metrics +curl -s http://localhost:9090/metrics | head -20 +``` + +### Common ops + +```bash +# Follow logs for one service +docker compose -f docker-compose.railiance.yml logs -f actcore-worker + +# Restart one service without bringing down others +docker compose -f docker-compose.railiance.yml restart actcore-api + +# Re-run migrations manually +docker compose -f docker-compose.railiance.yml run --rm actcore-migrate + +# Wipe and reset (DESTRUCTIVE — deletes all volumes including DB data) +make railiance-down +docker volume rm activity-core_temporal-db-data activity-core_app-db-data activity-core_nats-data +make railiance-up +``` + +--- + ## Wipe and restart dev stack ```bash diff --git a/scripts/sync_event_types.py b/scripts/sync_event_types.py new file mode 100644 index 0000000..7047971 --- /dev/null +++ b/scripts/sync_event_types.py @@ -0,0 +1,21 @@ +import asyncio +import os + +from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine + +from activity_core.event_type_registry import sync_event_types + + +async def main() -> None: + db_url = os.environ.get( + "ACTCORE_DB_URL", + "postgresql+asyncpg://actcore:actcore@localhost:5433/actcore", + ) + engine = create_async_engine(db_url) + factory = async_sessionmaker(engine, expire_on_commit=False) + n = await sync_event_types(factory) + print(f"Synced {n} event types") + await engine.dispose() + + +asyncio.run(main()) diff --git a/src/activity_core/api.py b/src/activity_core/api.py index e88cbb8..cd3be69 100644 --- 
a/src/activity_core/api.py +++ b/src/activity_core/api.py @@ -30,8 +30,9 @@ from datetime import datetime, timezone from typing import Any from fastapi import FastAPI, HTTPException +from fastapi.responses import JSONResponse from pydantic import BaseModel -from sqlalchemy import select +from sqlalchemy import select, text from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine from temporalio.client import Client @@ -275,6 +276,32 @@ async def trigger_definition(definition_id: uuid.UUID) -> dict[str, str]: # T42: Curator gate — event type approval endpoint +@app.get("/health") +async def health() -> JSONResponse: + db_ok = False + temporal_ok = False + + try: + async with _get_db()() as session: + await session.execute(text("SELECT 1")) + db_ok = True + except Exception: + pass + + try: + await _get_temporal().describe_namespace(TEMPORAL_NAMESPACE) + temporal_ok = True + except Exception: + pass + + status = "ok" if db_ok and temporal_ok else "degraded" + code = 200 if status == "ok" else 503 + return JSONResponse( + {"status": status, "db": db_ok, "temporal": temporal_ok}, + status_code=code, + ) + + @app.post("/event-types/{type_id}/approve", status_code=200) async def approve_event_type(type_id: str) -> dict[str, str]: """Approve a pending event type, setting its status to 'active'. 
diff --git a/src/activity_core/event_router.py b/src/activity_core/event_router.py index b54aac7..357916b 100644 --- a/src/activity_core/event_router.py +++ b/src/activity_core/event_router.py @@ -23,6 +23,7 @@ from __future__ import annotations import asyncio import logging import os +import signal import uuid from datetime import datetime, timezone from typing import Any @@ -202,12 +203,19 @@ class EventRouter: _CONSUMER_NAME, ) + loop = asyncio.get_running_loop() + stop = asyncio.Event() + loop.add_signal_handler(signal.SIGTERM, stop.set) + loop.add_signal_handler(signal.SIGINT, stop.set) + try: - await asyncio.Future() # run until cancelled + await stop.wait() + logger.info("Shutdown signal received — draining event router") finally: await sub.unsubscribe() await self._nc.drain() await engine.dispose() + logger.info("Event router stopped cleanly") async def main() -> None: diff --git a/src/activity_core/worker.py b/src/activity_core/worker.py index 58a62fd..37976f7 100644 --- a/src/activity_core/worker.py +++ b/src/activity_core/worker.py @@ -26,6 +26,7 @@ from __future__ import annotations import asyncio import logging import os +import signal from temporalio.client import Client from temporalio.runtime import PrometheusConfig, Runtime, TelemetryConfig @@ -102,12 +103,21 @@ async def run() -> None: activities=[persist_task_instance], ) + loop = asyncio.get_running_loop() + stop = asyncio.Event() + loop.add_signal_handler(signal.SIGTERM, stop.set) + loop.add_signal_handler(signal.SIGINT, stop.set) + async with orchestrator_worker, task_worker: - print( - f"Workers running — queues: {ORCHESTRATOR_TASK_QUEUE!r}, " - f"{TASK_EXECUTION_TASK_QUEUE!r} (namespace={TEMPORAL_NAMESPACE!r})" + logger.info( + "Workers running — queues: %r, %r (namespace=%r)", + ORCHESTRATOR_TASK_QUEUE, + TASK_EXECUTION_TASK_QUEUE, + TEMPORAL_NAMESPACE, ) - await asyncio.Future() # run until cancelled + await stop.wait() + logger.info("Shutdown signal received — draining workers") + 
logger.info("Workers stopped cleanly") if __name__ == "__main__": diff --git a/workplans/custodian-WP-0004-railiance-ops.md b/workplans/custodian-WP-0004-railiance-ops.md new file mode 100644 index 0000000..da11e24 --- /dev/null +++ b/workplans/custodian-WP-0004-railiance-ops.md @@ -0,0 +1,393 @@ +--- +id: custodian-WP-0004 +type: workplan +domain: custodian +repo: activity-core +status: active +state_hub_workstream_id: 759b1255-aa78-42b5-8fab-c5fcf168f5f4 +tasks: + - id: T58 + title: Dockerfile (multi-stage, uv-based) + status: todo + priority: high + state_hub_task_id: 2a81f9ba-47cb-480f-a5d3-af74fe238485 + - id: T59 + title: docker-compose.railiance.yml (full stack, no Elasticsearch) + status: todo + priority: high + state_hub_task_id: 67981de4-b766-4c47-9985-9573297ac464 + - id: T60 + title: "GET /health endpoint" + status: todo + priority: high + state_hub_task_id: 7c7a1617-29e9-4ea5-a675-fd75e325c451 + - id: T61 + title: .env.example — complete env var reference + status: todo + priority: medium + state_hub_task_id: 2a2a8d02-19dc-4b09-8792-f855ed48388a + - id: T62 + title: Makefile ops targets + status: todo + priority: medium + state_hub_task_id: 55ac37b5-5606-42e5-bf11-e8dae3a188b7 + - id: T63 + title: SIGTERM graceful shutdown (worker + event_router) + status: todo + priority: medium + state_hub_task_id: 65b3229b-5f03-450d-a263-f2c205be3d28 + - id: T64 + title: docs/runbook.md — railiance deployment section + status: todo + priority: medium + state_hub_task_id: 83dcd765-5715-499f-be23-f728d0261cfb +created: "2026-05-14" +--- + +# activity-core WP-0004 — Railiance Deployment & Service Ops + +**Hub workstream:** `759b1255-aa78-42b5-8fab-c5fcf168f5f4` +**Goal:** Package activity-core as a fully standalone deployable service, runnable +on railiance with no imports from or requirements against other custodian repos. + +## Context + +WP-0001 through WP-0003 built the complete event-bridge implementation. The +service is functionally complete. 
This workplan makes it operationally deployable: +containerised, healthchecked, gracefully shutting down, and documented for railiance. + +Runtime dependencies are infrastructure only (Temporal, PostgreSQL, NATS). The +optional soft dependencies (state-hub, repo-scoping, issue-core) are already +gracefully degraded in the code — they bind `{}` on failure and never abort a run. + +## Existing assets (no rework needed) + +- `docker-compose.dev.yml` — full dev stack (Temporal + ES + PG + NATS); keep as-is +- `worker.py`, `api.py`, `event_router.py` — entry points exist and are functional +- `migrations/`, `alembic.ini` — schema management is in place +- `docs/runbook.md` — dev quick-start section exists; extend for railiance + +## Build Order + +``` +T58 (Dockerfile) → T59 (railiance compose — depends on image) +T60, T61, T62 — parallel, no deps +T63 — independent, improves T59 (clean shutdown in compose) +T64 — last, documents T58-T63 +``` + +--- + +## T58: Dockerfile (multi-stage, uv-based) + +**File:** `Dockerfile` + +Two-stage build to keep the runtime image lean. 
+ +```dockerfile +# Stage 1 — install Python deps +FROM python:3.12-slim AS builder +RUN pip install uv --no-cache-dir +WORKDIR /app +COPY pyproject.toml uv.lock ./ +COPY src/ ./src/ +RUN uv sync --no-dev --frozen + +# Stage 2 — runtime image +FROM python:3.12-slim AS runtime +WORKDIR /app +COPY --from=builder /app/.venv /app/.venv +COPY --from=builder /app/src /app/src +# Include definition files that ship with the repo +COPY activity-definitions/ ./activity-definitions/ +COPY event-types/ ./event-types/ +COPY tasks/ ./tasks/ +ENV PATH="/app/.venv/bin:$PATH" +ENV PYTHONPATH="/app/src" +CMD ["python", "-m", "activity_core.worker"] +``` + +Also add `.dockerignore`: +``` +.venv/ +__pycache__/ +*.pyc +.git/ +tests/ +*.egg-info/ +.env +``` + +The three processes share the same image; docker-compose overrides `command:`: +- worker: default (`python -m activity_core.worker`) +- api: `uvicorn activity_core.api:app --host 0.0.0.0 --port 8010` +- event-router: `python -m activity_core.event_router` + +--- + +## T59: docker-compose.railiance.yml (full stack, no Elasticsearch) + +**File:** `docker-compose.railiance.yml` + +Self-contained production stack. Uses PostgreSQL-based Temporal visibility (no +Elasticsearch required). All activity-core services read env from `env_file: .env`. 
+ +Services: +- `temporal-db` — postgres:16 (Temporal schema + visibility) +- `temporal` — temporalio/auto-setup:1.29.1 with `ENABLE_ES: "false"`, + `DB: postgres12`, `VISIBILITY_DBNAME: temporal_visibility` +- `temporal-ui` — temporalio/ui on port 8080 +- `nats` — nats:2.10-alpine with `-js` and persistent volume for JetStream state +- `app-db` — postgres:16 (activity-core application data) +- `actcore-migrate` — one-shot service: `build: .`, `command: alembic upgrade head`, + `restart: no`, runs migrations then exits; other actcore services depend on it +- `actcore-worker` — worker process, metrics port 9090 +- `actcore-api` — API server, port 8010, with healthcheck against `/health` +- `actcore-event-router` — event router process + +All three actcore processes share one image (`build: .`) and depend on: +- `temporal` (condition: service_healthy) +- `app-db` (condition: service_healthy) +- `nats` (condition: service_healthy) +- `actcore-migrate` (condition: service_completed_successfully) + +Persistent volumes: `temporal-db-data`, `app-db-data`, `nats-data`. + +Network: `actcore-net` (bridge). + +--- + +## T60: GET /health endpoint + +**File:** `src/activity_core/api.py` (add route) + +```python +from sqlalchemy import text + +@app.get("/health") +async def health() -> JSONResponse: + db_ok = False + temporal_ok = False + + try: + async with _get_db()() as session: + await session.execute(text("SELECT 1")) + db_ok = True + except Exception: + pass + + try: + await _get_temporal().describe_namespace(TEMPORAL_NAMESPACE) + temporal_ok = True + except Exception: + pass + + status = "ok" if db_ok and temporal_ok else "degraded" + code = 200 if status == "ok" else 503 + return JSONResponse( + {"status": status, "db": db_ok, "temporal": temporal_ok}, + status_code=code, + ) +``` + +No authentication. 
Used by: +- `docker-compose healthcheck: test: ["CMD-SHELL", "curl -sf http://localhost:8010/health"]` +- Railiance monitoring (external HTTP probe) + +--- + +## T61: .env.example — complete env var reference + +**File:** `.env.example` + +```bash +# ── Required ────────────────────────────────────────────────────────────────── +# PostgreSQL connection string for activity-core application data. +ACTCORE_DB_URL=postgresql+asyncpg://actcore:actcore@app-db:5432/actcore + +# ── Temporal ────────────────────────────────────────────────────────────────── +# Temporal frontend gRPC address. +TEMPORAL_HOST=temporal:7233 +# Temporal namespace (must exist before workers start). +TEMPORAL_NAMESPACE=default + +# ── NATS ────────────────────────────────────────────────────────────────────── +# NATS server URL. JetStream must be enabled (-js flag). +NATS_URL=nats://nats:4222 + +# ── Service integrations (gracefully degraded if unavailable) ───────────────── +# State Hub — used by the state-hub context adapter. Binds {} on failure. +STATE_HUB_URL=http://127.0.0.1:8000 +# Repo scoping — used by the repo-scoping context adapter. Binds {} on failure. +REPO_SCOPING_URL=http://127.0.0.1:8020 +# Issue Core — task emission backend. +ISSUE_CORE_URL=http://127.0.0.1:8010 +# Sink type: 'rest' (POST to issue-core) or 'null' (discard, for dry-run). +ISSUE_SINK_TYPE=rest + +# ── Activity definitions ─────────────────────────────────────────────────────── +# Colon-separated paths to additional activity-definitions/ directories. +# The local activity-definitions/ directory is always scanned. +ACTIVITY_DEFINITION_DIRS= + +# ── Observability ───────────────────────────────────────────────────────────── +# Prometheus metrics bind address (Temporal SDK metrics). +PROMETHEUS_BIND_ADDR=0.0.0.0:9090 + +# ── Security (webhook receiver) ─────────────────────────────────────────────── +# HMAC-SHA256 secret for Gitea webhook signature validation. 
+WEBHOOK_SECRET_GITEA= +# HMAC-SHA256 secret for GitHub webhook signature validation. +WEBHOOK_SECRET_GITHUB= + +# ── Curator gate ────────────────────────────────────────────────────────────── +# 'disabled': accepts active + pending event types (pending logged as warning). +# 'required': only active event types accepted; pending events are discarded. +ACTIVITY_CURATOR_GATE=disabled +``` + +--- + +## T62: Makefile ops targets + +**File:** `Makefile` (extend) + +New targets to add: + +```makefile +# ── Infrastructure ──────────────────────────────────────────────────────────── +dev-up: ## Start full dev stack (Temporal + PG + NATS) +dev-down: ## Stop and remove dev stack containers +railiance-up: ## Start full railiance stack (builds image first) +railiance-down: ## Stop and remove railiance stack containers + +# ── Database ────────────────────────────────────────────────────────────────── +migrate: ## Apply all pending Alembic migrations +sync-all: ## Sync event types and activity definitions (runs both) + +# ── Local dev processes ─────────────────────────────────────────────────────── +start-worker: ## Start Temporal worker (uses ACTCORE_DB_URL from env) +start-api: ## Start FastAPI server on :8010 (hot reload) +start-event-router: ## Start NATS event router + +# ── Help ────────────────────────────────────────────────────────────────────── +help: ## Show this help message +``` + +The `help` target uses `grep -E '^[a-zA-Z_-]+:.*?##' Makefile` to extract the +`## description` comments and format them as a table. Makes `make help` the entry +point for operators. + +`start-*` targets load `.env` if it exists (`-include .env; export`) so developers +can run locally without setting env vars manually. 
+ +--- + +## T63: SIGTERM graceful shutdown (worker + event_router) + +**Files:** `src/activity_core/worker.py`, `src/activity_core/event_router.py` + +Replace the `await asyncio.Future()` pattern (which blocks forever, ignoring +SIGTERM) with a signal-aware stop event: + +```python +import signal + +async def run() -> None: + ... + loop = asyncio.get_running_loop() + stop = asyncio.Event() + loop.add_signal_handler(signal.SIGTERM, stop.set) + loop.add_signal_handler(signal.SIGINT, stop.set) + + async with orchestrator_worker, task_worker: # (worker.py) + logger.info("Workers running — waiting for shutdown signal") + await stop.wait() + logger.info("Shutdown signal received — draining workers") + logger.info("Workers stopped cleanly") +``` + +For `event_router.py`, the same pattern applies: `stop.wait()` replaces the +infinite future; on signal, the NATS consumer is unsubscribed before exiting. + +This ensures `docker stop` (SIGTERM → 10-second grace period → SIGKILL) completes +within the grace window by draining in-flight Temporal tasks and NATS messages +before the process exits. + +--- + +## T64: docs/runbook.md — Railiance Deployment section + +**File:** `docs/runbook.md` (extend with new section) + +Add after the existing Dev environment section: + +```markdown +## Railiance Deployment + +### Pre-requisites +- Docker ≥ 24 with Compose v2 (`docker compose` not `docker-compose`) +- ≥ 4 GB RAM available (Temporal server takes ~1 GB) +- Ports available: 4222 (NATS), 7233 (Temporal gRPC), 8010 (API), 8080 (Temporal UI), + 9090 (Prometheus metrics) + +### First-time setup +1. `cp .env.example .env` — edit all values, especially secrets +2. `make railiance-up` — builds image and starts all services +3. Wait for health: `curl -sf http://localhost:8010/health` → `{"status":"ok",...}` +4. 
Register Temporal search attributes (one-time per namespace): + `docker compose -f docker-compose.railiance.yml exec temporal temporal operator search-attribute create \` + ` --name ActivityId --type Keyword --name ActivityName --type Keyword \` + ` --address temporal:7233` +5. `make sync-all` — load event types and activity definitions + +### Upgrade procedure +1. `git pull` +2. `make railiance-up` — rebuilds image and restarts changed services +3. `make migrate` — apply any new migrations (safe to run even if none pending) +4. `curl -sf http://localhost:8010/health` — verify health + +### Health verification +- API health: `curl -s http://localhost:8010/health | python3 -m json.tool` +- Temporal UI: http://localhost:8080 +- Prometheus metrics: http://localhost:9090/metrics + +### Common ops +- View logs: `docker compose -f docker-compose.railiance.yml logs -f actcore-worker` +- Restart one service: `docker compose -f docker-compose.railiance.yml restart actcore-api` +- Wipe and reset (destructive): `make railiance-down && docker volume rm ...` +``` + +--- + +## Completion Criteria + +1. `docker build -t activity-core .` succeeds and image is < 400 MB +2. `make railiance-up` starts all 9 services; the one-shot `actcore-migrate` exits successfully and all others reach healthy state +3. `curl http://localhost:8010/health` returns `{"status":"ok",...}` with HTTP 200 +4. `docker stop actcore-worker` causes graceful drain (no SIGKILL within 10s) +5. `make help` prints a clean table of all targets with descriptions +6. 
`.env.example` covers every env var used anywhere in the codebase + +## New Files Produced + +| Path | Task | +|---|---| +| `Dockerfile` | T58 | +| `.dockerignore` | T58 | +| `docker-compose.railiance.yml` | T59 | +| `.env.example` | T61 | + +## Modified Files + +| Path | Task | Change | +|---|---|---| +| `src/activity_core/api.py` | T60 | Add `/health` route | +| `Makefile` | T62 | Add ops targets; move the inline `sync-event-types` Python into new file `scripts/sync_event_types.py` | +| `src/activity_core/worker.py` | T63 | SIGTERM handler | +| `src/activity_core/event_router.py` | T63 | SIGTERM handler | +| `docs/runbook.md` | T64 | Railiance deployment section | + +## Change History + +- v1.0 (2026-05-14): Initial workplan.