From 821b5d6c892186489b3c9c1bf74add5b251c1f3a Mon Sep 17 00:00:00 2001 From: tegwick Date: Sun, 21 Jun 2026 20:56:35 +0200 Subject: [PATCH] fix(STATE-WP-0064): parse consistency sweep stdout with skip prefixes Extract the JSON payload from mixed script output and document Railiance01 kubectl sync steps. Mark T02 done after cluster bridge and resolver canaries. --- api/services/consistency_sweep.py | 18 ++++++++++- docs/consistency-sweep-runbook.md | 32 ++++++++++++------- tests/test_consistency_sweep.py | 12 +++++++ ...4-statehub-consistency-sync-railiance01.md | 14 +++++++- 4 files changed, 62 insertions(+), 14 deletions(-) diff --git a/api/services/consistency_sweep.py b/api/services/consistency_sweep.py index b5f9e69..7bafaf5 100644 --- a/api/services/consistency_sweep.py +++ b/api/services/consistency_sweep.py @@ -50,11 +50,27 @@ def _parse_stderr(stderr: str) -> dict[str, list[str]]: } +def _extract_json_payload(text: str) -> Any: + stripped = text.strip() + if not stripped: + return [] + decoder = json.JSONDecoder() + for index, char in enumerate(stripped): + if char not in "{[": + continue + try: + payload, _end = decoder.raw_decode(stripped, index) + return payload + except json.JSONDecodeError: + continue + raise json.JSONDecodeError("No JSON payload found", stripped, 0) + + def _parse_stdout(stdout: str) -> list[ConsistencySweepRepoResult]: text = stdout.strip() if not text: return [] - payload = json.loads(text) + payload = _extract_json_payload(text) items = payload if isinstance(payload, list) else [payload] results: list[ConsistencySweepRepoResult] = [] for item in items: diff --git a/docs/consistency-sweep-runbook.md b/docs/consistency-sweep-runbook.md index 6b20686..ca226d3 100644 --- a/docs/consistency-sweep-runbook.md +++ b/docs/consistency-sweep-runbook.md @@ -33,23 +33,31 @@ service target). From the activity-core host, confirm the definition is synced and the Temporal schedule exists: -```bash -cd ~/activity-core -ACTIVITY_DEFINITION_DIRS=/home/worsch/the-custodian make sync-activity-definitions -``` - -Reconcile Temporal schedules (pick one): +Run on **Railiance01** (the laptop `.env` points at docker-compose hostnames +like `app-db` and will time out from WSL): ```bash -# Preferred when activity-core API is up (no worker restart) -curl -s -X POST 'http://localhost:8010/admin/sync?definitions=true&schedules=true' +export KUBECONFIG=~/.kube/config-hosteurope -# CLI fallback -ACTCORE_DB_URL=... TEMPORAL_HOST=... uv run python -m activity_core.sync_schedules +# 1. Apply runtime manifest when definitions change +kubectl apply -f ~/activity-core/k8s/railiance/20-runtime.yaml + +# 2. Sync definitions into Postgres +kubectl -n activity-core delete job actcore-sync --ignore-not-found +kubectl apply -f ~/activity-core/k8s/railiance/20-runtime.yaml +kubectl -n activity-core wait --for=condition=complete job/actcore-sync --timeout=180s + +# 3. Reconcile Temporal schedules +kubectl -n activity-core exec deploy/actcore-worker -- python -m activity_core.sync_schedules ``` -On Railiance01, use the in-cluster activity-core API URL and env from the -deployment instead of `localhost:8010`. +After changing application code, rebuild and import `activity-core:railiance01-prod` +per `activity-core/k8s/railiance/README.md`, then restart +`actcore-worker`, `actcore-api`, and `actcore-event-router`. + +Ensure `state-hub-railiance01` ops-bridge tunnel is `connected` before +cluster-triggered sweeps; the in-cluster bridge proxy allows up to 360s for +POST requests. Expected definition: diff --git a/tests/test_consistency_sweep.py b/tests/test_consistency_sweep.py index c6958d4..490c000 100644 --- a/tests/test_consistency_sweep.py +++ b/tests/test_consistency_sweep.py @@ -97,6 +97,18 @@ def test_parse_stderr_extracts_skip_lists(): } +def test_extract_json_payload_skips_human_readable_prefix_lines(): + stdout = ( + " CLEAN (skipped): quiet-repo\n" + " BUDGET EXHAUSTED after 30s (skipped): other-repo\n" + '{\n "repo_slug": "state-hub",\n "repo_path": "/home/worsch/state-hub",\n' + ' "result": "pass",\n "summary": {"fail": 0, "warn": 0, "info": 0},\n' + ' "fixes_applied": []\n}\n' + ) + payload = sweep_service._extract_json_payload(stdout) + assert payload["repo_slug"] == "state-hub" + + def test_parse_stdout_handles_single_and_batch_payloads(): single = json.dumps( { diff --git a/workplans/STATE-WP-0064-statehub-consistency-sync-railiance01.md b/workplans/STATE-WP-0064-statehub-consistency-sync-railiance01.md index 11314a5..f86f4d4 100644 --- a/workplans/STATE-WP-0064-statehub-consistency-sync-railiance01.md +++ b/workplans/STATE-WP-0064-statehub-consistency-sync-railiance01.md @@ -92,7 +92,7 @@ Done 2026-06-21: ```task id: STATE-WP-0064-T02 -status: todo +status: done priority: high state_hub_task_id: "2e9b5b66-a7b1-46a5-8e1f-22e6b5caeff6" ``` @@ -104,6 +104,18 @@ Trigger one manual ActivityRun. Confirm: - progress or activity-core run history shows success - no duplicate side-effects when local timer also fires (idempotent) +Done 2026-06-21: + +- Applied `20-runtime.yaml` on Railiance01; `actcore-sync` upserted definition + `7c4e9a12-8f3b-4d5e-9c6a-1b2d3e4f5a6b` (paused schedule). +- Rebuilt/imported `activity-core:railiance01-prod` with + `consistency_sweep_remote_all` resolver. +- Bridge proxy POST timeout raised to 360s (30s was aborting sweeps). +- Manual canaries: cluster POST via bridge (`exit_code 0`, progress event + `65d0bc12-…`) and worker resolver (`exit_code 0`, 1 repo @ 60s budget). +- Laptop `make sync-activity-definitions` is not valid against Railiance01 DB; + use kubectl `actcore-sync` job instead. + ## T3 — Parallel run and observability ```task