From 39ed5459b902054c97b641a3c93cacd5f8c6c954 Mon Sep 17 00:00:00 2001 From: tegwick Date: Mon, 22 Jun 2026 01:20:59 +0200 Subject: [PATCH] finish(STATE-WP-0064): cut over scheduler and split sweep errors from failures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit STATE-WP-0064 cutover (state-hub only): - Retire local custodian-sync.timer; archive units under infra/systemd/archived/ - Mark workplan finished; update infra/README, cron-migration, runbook, AGENTS.md - Point activity-core-delegation at the consistency-sweep runbook Consistency engine — automation error vs assessment failure: - C-00 is an automation error; C-01..C-23 assessment failures are recorded for follow-up but no longer fail --remote --all scheduled sweeps (exit 0) - Skip workplans/README.md in the workplan glob (human index, not a workplan) - Progress events and compare script expose automation_error and assessment_failures separately from exit_code --- AGENTS.md | 5 + api/schemas/consistency_sweep.py | 2 + api/services/consistency_sweep.py | 19 +++- docs/activity-core-delegation.md | 4 +- docs/consistency-sweep-runbook.md | 49 ++++----- docs/cron-migration.md | 28 +++-- infra/README.md | 101 +++++------------- infra/systemd/archived/README.md | 16 +++ .../{ => archived}/custodian-sync.service | 0 .../{ => archived}/custodian-sync.timer | 0 scripts/compare_consistency_sweep_parallel.py | 8 +- scripts/consistency_check.py | 95 +++++++++++++--- tests/test_consistency_check.py | 8 ++ ...4-statehub-consistency-sync-railiance01.md | 66 +++++------- 14 files changed, 221 insertions(+), 180 deletions(-) create mode 100644 infra/systemd/archived/README.md rename infra/systemd/{ => archived}/custodian-sync.service (100%) rename infra/systemd/{ => archived}/custodian-sync.timer (100%) diff --git a/AGENTS.md b/AGENTS.md index 8ebc54d..50d4262 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -4,6 +4,11 @@ **Purpose:** Standalone State Hub service repository extracted 
from the-custodian/state-hub. Owns the FastAPI API, MCP server, dashboard, migrations, consistency tooling, and operational docs. +**Periodic consistency sync:** The 15-minute workplan↔DB sweep is scheduled on +activity-core (Railiance01), not a local timer. Execution still runs on this +workstation via the bridge tunnel. Runbook: +[`docs/consistency-sweep-runbook.md`](docs/consistency-sweep-runbook.md). + **Domain:** custodian **Repo slug:** state-hub **Topic ID:** `cee7bedf-2b48-46ef-8601-006474f2ad7a` diff --git a/api/schemas/consistency_sweep.py b/api/schemas/consistency_sweep.py index 083995c..cdda931 100644 --- a/api/schemas/consistency_sweep.py +++ b/api/schemas/consistency_sweep.py @@ -8,6 +8,7 @@ from pydantic import BaseModel, Field class ConsistencySweepIssueSummary(BaseModel): fail: int = 0 + automation_error: int = 0 warn: int = 0 info: int = 0 @@ -39,6 +40,7 @@ class ConsistencySweepRemoteAllRun(BaseModel): max_seconds: int source: str exit_code: int + automation_error: bool = False lock_skipped: bool repos_processed: list[ConsistencySweepRepoResult] = Field(default_factory=list) skipped_clean: list[str] = Field(default_factory=list) diff --git a/api/services/consistency_sweep.py b/api/services/consistency_sweep.py index 1943e36..3dbcb4e 100644 --- a/api/services/consistency_sweep.py +++ b/api/services/consistency_sweep.py @@ -83,6 +83,7 @@ def _parse_stdout(stdout: str) -> list[ConsistencySweepRepoResult]: result=str(item.get("result") or "pass"), summary=ConsistencySweepIssueSummary( fail=int(summary.get("fail", 0)), + automation_error=int(summary.get("automation_error", 0)), warn=int(summary.get("warn", 0)), info=int(summary.get("info", 0)), ), @@ -121,6 +122,7 @@ async def run_remote_all_sweep( stderr_meta = _parse_stderr(result.stderr) repos_processed = [] if lock_skipped else _parse_stdout(result.stdout) + automation_error = result.returncode != 0 and not lock_skipped progress_event_id = await _log_sweep_progress( session, started_at=started_at, 
@@ -128,6 +130,7 @@ async def run_remote_all_sweep( max_seconds=max_seconds, source=source, exit_code=result.returncode, + automation_error=automation_error, lock_skipped=lock_skipped, repos_processed=repos_processed, **stderr_meta, @@ -138,6 +141,7 @@ async def run_remote_all_sweep( max_seconds=max_seconds, source=source, exit_code=result.returncode, + automation_error=automation_error, lock_skipped=lock_skipped, repos_processed=repos_processed, skipped_clean=stderr_meta["skipped_clean"], @@ -155,6 +159,7 @@ async def _log_sweep_progress( max_seconds: int, source: str, exit_code: int, + automation_error: bool, lock_skipped: bool, repos_processed: list[ConsistencySweepRepoResult], skipped_clean: list[str], @@ -162,16 +167,23 @@ async def _log_sweep_progress( skipped_budget: list[str], ) -> uuid.UUID: processed_count = len(repos_processed) - fail_count = sum(1 for repo in repos_processed if repo.result == "fail") + error_count = sum(1 for repo in repos_processed if repo.result == "error") + assessment_fail_count = sum(1 for repo in repos_processed if repo.result == "fail") warn_count = sum(1 for repo in repos_processed if repo.result == "warn") if lock_skipped: summary = "State Hub consistency sweep skipped: prior remote-all run still active" + elif automation_error: + summary = ( + "State Hub consistency sweep automation error: " + f"exit_code={exit_code}, {processed_count} repos partially processed" + ) else: summary = ( "State Hub consistency sweep completed: " f"{processed_count} processed, {len(skipped_clean)} clean, " f"{len(skipped_missing)} missing, {len(skipped_budget)} budget-skipped, " - f"{fail_count} failed, {warn_count} warned" + f"{assessment_fail_count} assessment-fail, {error_count} automation-error, " + f"{warn_count} warned" ) event = ProgressEvent( event_type="consistency_sweep_remote_all", @@ -182,6 +194,9 @@ async def _log_sweep_progress( "max_seconds": max_seconds, "source": source, "exit_code": exit_code, + "automation_error": 
automation_error, + "assessment_failures": assessment_fail_count, + "automation_errors": error_count, "lock_skipped": lock_skipped, "repos_processed": [item.model_dump(mode="json") for item in repos_processed], "skipped_clean": skipped_clean, diff --git a/docs/activity-core-delegation.md b/docs/activity-core-delegation.md index 89b5196..960b2db 100644 --- a/docs/activity-core-delegation.md +++ b/docs/activity-core-delegation.md @@ -84,7 +84,9 @@ unset. the rule lives in activity-core. See [`docs/cron-migration.md`](cron-migration.md) for the -ActivityDefinition drafts and cutover plan. +ActivityDefinition drafts and cutover plan. The consistency sweep schedule +is live on Railiance01 — operator runbook: +[`docs/consistency-sweep-runbook.md`](consistency-sweep-runbook.md). ## What must never happen diff --git a/docs/consistency-sweep-runbook.md b/docs/consistency-sweep-runbook.md index ca93790..73f85c9 100644 --- a/docs/consistency-sweep-runbook.md +++ b/docs/consistency-sweep-runbook.md @@ -3,16 +3,16 @@ ## Purpose This runbook answers whether the 15-minute State Hub consistency sync ran -without relying on the local `custodian-sync.timer`. +without relying on the local `custodian-sync.timer` (retired 2026-06-21). -The intended steady state after `STATE-WP-0064` cutover is: +**Steady state** (`STATE-WP-0064` cutover complete): - activity-core on Railiance01 owns the `*/15 * * * *` UTC schedule and ActivityRun audit trail. - State Hub on the workstation owns `scripts/consistency_check.py`, lock semantics, reconciliation, and the `consistency_sweep_remote_all` progress event. -- The local systemd timer is disabled after the parallel week passes. +- The local systemd timer is **disabled**; cluster is the sole scheduler. 
## API Surface @@ -65,7 +65,7 @@ Expected definition: - trigger: `*/15 * * * *` - timezone: `UTC` - misfire policy: `skip` -- enabled: `true` during parallel week (T03); local timer retired after T04 +- enabled: `true` ## Progress Event Check @@ -78,14 +78,17 @@ curl -s "http://127.0.0.1:8000/progress/?event_type=consistency_sweep_remote_all Healthy evidence includes: +- `detail.source: activity-core` on scheduled runs - `lock_skipped: false` on normal runs - `repos_processed` entries only for repos that needed action - `skipped_clean`, `skipped_missing`, and `skipped_budget` metadata when applicable -- `exit_code: 0` for warn-only remote-all sweeps +- `exit_code: 0` when automation completed (assessment failures are OK) +- `automation_error: true` only for infrastructure faults (API down, C-00, etc.) +- `assessment_failures` counts repos with hygiene gaps (C-01..C-23) for follow-up -A `lock_skipped: true` response is normal when the local timer and the -cluster schedule overlap during the parallel week. +A `lock_skipped: true` response is normal when a sweep is already in flight. +Assessment failures do not fail the scheduler; automation errors do. ## ActivityRun Check @@ -106,40 +109,26 @@ limit 5; ## Manual Canary -Before enabling the cluster schedule: +Before enabling or after changing the cluster schedule: 1. Confirm `state-hub-railiance01` tunnel health from ops-bridge. 2. Trigger one manual ActivityRun or POST the API through the bridge URL. 3. Verify the progress event and ActivityRun context snapshot. -4. Confirm idempotence when the local timer also fires (lock skip is OK). 
-## Parallel week observability (T03) +## Observability -Both runners call the same API and tag progress events with `detail.source`: - -| Source | Runner | -|--------|--------| -| `local-timer` | `custodian-sync.timer` on the workstation | -| `activity-core` | Railiance01 Temporal schedule | - -Summarise evidence: +Summarise recent sweep events by source: ```bash cd ~/state-hub uv run python scripts/compare_consistency_sweep_parallel.py --since-hours 24 ``` -Expect some `lock_skipped: true` events when both schedules overlap — that is -healthy idempotence, not duplicate work. +After cutover, expect only `activity-core` (and manual) sources — no new +`local-timer` events. -Parallel window: **2026-06-21 → 2026-06-28** (review before T04 cutover). +## Local fallback (emergency only) -## Cutover - -After one parallel week (`STATE-WP-0064-T03`): - -```bash -systemctl --user disable --now custodian-sync.timer -``` - -The cluster definition stays enabled; disable only the local timer. \ No newline at end of file +If cluster scheduling is broken, temporarily re-enable the archived systemd +units per [`infra/systemd/archived/README.md`](../infra/systemd/archived/README.md). +Disable again once cluster scheduling is restored. \ No newline at end of file diff --git a/docs/cron-migration.md b/docs/cron-migration.md index d45ffd3..11801ce 100644 --- a/docs/cron-migration.md +++ b/docs/cron-migration.md @@ -1,9 +1,8 @@ # State Hub Cron → activity-core ActivityDefinition Migration -> CUST-WP-0040 T04. **Partially implemented** as of `STATE-WP-0064`. -> The consistency sweep API surface and ActivityDefinition are landed; -> cluster cutover still requires manual canary, parallel week, and local -> timer retirement. +> CUST-WP-0040 T04. **Consistency sweep cut over** as of `STATE-WP-0064` +> (2026-06-21). Scheduling is on activity-core (Railiance01); the local +> `custodian-sync.timer` is retired. Stale-task cleanup (B) is still pending. 
The state hub currently runs two recurring maintenance jobs and one per-repo event hook. Once activity-core is ready, each becomes an @@ -16,7 +15,7 @@ keeps the underlying scripts; only the *scheduling* moves. | # | Source | Trigger today | Script invoked | What it does | | - | ------------------- | -------------------------------------------------------- | -------------------------------------------------------- | -------------------------------------------------------------------------------------------------- | -| 1 | systemd user timer | every 15 min | `scripts/consistency_check.py --remote --all` | Pull every registered repo, reconcile workplan files ↔ DB, run C-15 writeback + C-16 pull gate | +| 1 | activity-core cron | every 15 min (Railiance01) | `POST /consistency/sweep/remote-all` → `consistency_check.py --remote --all` | Pull every registered repo, reconcile workplan files ↔ DB, run C-15 writeback + C-16 pull gate | | 2 | manual / daily cron | `make cleanup-stale` (suggested `0 3 * * *`) | `scripts/cleanup_stale_tasks.py` | Cancel tasks still open in finished/archived workstreams; emits `org.statehub.task.stale` | | 3 | git post-commit | every commit in a registered repo | `make fix-consistency REPO=` | Per-repo workplan ↔ DB sync immediately after a commit | @@ -40,7 +39,7 @@ run them on a schedule. ### A. `state-hub-consistency-sweep` (implemented) Landed in `the-custodian/activity-definitions/state-hub-consistency-sweep.md` -with `enabled: false` until canary and cutover. +with `enabled: true` on Railiance01 since 2026-06-21 cutover. Invocation path (matches the hourly RecentlyOnScope pattern): @@ -56,11 +55,10 @@ checkout from the cluster. Operator runbook: [`docs/consistency-sweep-runbook.md`](consistency-sweep-runbook.md). Notes: -- Replaces the `custodian-sync.service` + `custodian-sync.timer` pair - after parallel week and cutover. 
+- Replaced the `custodian-sync.service` + `custodian-sync.timer` pair + (local timer disabled 2026-06-21; units archived under `infra/systemd/archived/`). - Lock semantics (`/tmp/custodian-consistency-remote-all.lock`) stay in the script — activity-core just sets the cadence. -- Local timer retirement is tracked in `STATE-WP-0064-T04`. ### B. `state-hub-stale-task-cleanup` @@ -130,8 +128,8 @@ Still optional for B and future splits: | activity-core shell instruction kind with on_failure semantics | activity-core | activity-core/`src/...` | | state-hub adapter exposing `state-hub.health` as a context source | activity-core | activity-core/adapters/ | -Until B lands and A is cut over, the state hub continues to schedule the -consistency sweep via the local systemd timer. +A is cut over. Until B lands, stale-task cleanup remains on-demand via +`make cleanup-stale` (or a manual daily cron). --- @@ -142,11 +140,9 @@ consistency sweep via the local systemd timer. same DB / NATS effects as the current cron entries. 3. Run both in parallel for one week (cron + ActivityDefinition). The scripts are idempotent — duplicate runs are no-ops on a clean state. -4. Disable the systemd timer: - `systemctl --user disable --now custodian-sync.timer` -5. Remove the cleanup-stale cron entry from `crontab -e`. -6. Update `infra/README.md` to point at the ActivityDefinitions and - archive the systemd unit files. +4. ~~Disable the systemd timer~~ — **done** 2026-06-21 (`STATE-WP-0064`). +5. Remove the cleanup-stale cron entry from `crontab -e` (when B is enabled). +6. ~~Update `infra/README.md` and archive systemd unit files~~ — **done**. 7. Per-commit hook stays until a `repo.commit.pushed` event exists. --- diff --git a/infra/README.md b/infra/README.md index 7608a20..e583aff 100644 --- a/infra/README.md +++ b/infra/README.md @@ -15,89 +15,38 @@ The compose file is `infra/docker-compose.yml`. 
Copy `.env.example` to `.env` an --- -## Periodic Repo Sync — systemd user timer +## Periodic Repo Sync — activity-core (Railiance01) -The **State Hub consistency sync** timer (legacy unit name `custodian-sync`) -runs `consistency_check.py --remote --all` every 15 minutes, keeping workplan -file state in sync with the state-hub DB automatically (belt-and-suspenders -alongside the per-repo git post-commit hooks). +The **State Hub consistency sync** runs every 15 minutes (`*/15 * * * *` UTC) +on activity-core (Railiance01). The cluster schedule triggers +`POST /consistency/sweep/remote-all` on the workstation State Hub via the +`actcore-state-hub-bridge` tunnel. -> **Interim local runner (STATE-WP-0063):** units must target the standalone -> repo at `/home/worsch/state-hub` and invoke consistency via -> `/home/worsch/.local/bin/uv run python …`. The pre-extraction path -> `/home/worsch/the-custodian/state-hub` is obsolete. -> -> **Cluster runner (STATE-WP-0064):** activity-core on Railiance01 runs the -> same sweep on `*/15 * * * *` UTC (parallel week started 2026-06-21). Both -> runners use `POST /consistency/sweep/remote-all` with `detail.source` -> tagging (`local-timer` vs `activity-core`). Disable this local timer after -> T04 cutover per [`docs/consistency-sweep-runbook.md`](../docs/consistency-sweep-runbook.md). +Operator runbook: [`docs/consistency-sweep-runbook.md`](../docs/consistency-sweep-runbook.md). -The all-repo remote sweep has two built-in load guards: +**Prerequisites for cluster-triggered sweeps:** + +- Workstation State Hub API running (`make api` or equivalent) +- `state-hub-railiance01` ops-bridge tunnel `connected` +- Workstation awake (execution still runs locally; only scheduling moved) + +Per-repo git post-commit hooks remain the immediate consistency path after +each commit. The 15-minute sweep is belt-and-suspenders across all registered +repos. 
+ +The all-repo remote sweep has built-in load guards: - A nonblocking process lock at `/tmp/custodian-consistency-remote-all.lock`; - if a prior sweep is still active, the next timer run exits cleanly. + overlapping triggers exit cleanly with `lock_skipped: true`. - A wall-clock budget, defaulting to 300 seconds. Remaining repos are skipped - once the budget is exhausted. Override with `--max-seconds N` or set - `CONSISTENCY_REMOTE_ALL_MAX_SECONDS`. -- Warn-only sweeps exit 0 in `--remote --all` mode so the systemd unit only - goes failed for hard consistency failures. + once the budget is exhausted. -### Unit files +### Retired local timer -| File | Repo template | Installed copy | -|------|---------------|----------------| -| `custodian-sync.service` | `infra/systemd/custodian-sync.service` | `~/.config/systemd/user/custodian-sync.service` | -| `custodian-sync.timer` | `infra/systemd/custodian-sync.timer` | `~/.config/systemd/user/custodian-sync.timer` | - -Install or refresh from the repo templates: - -```bash -mkdir -p ~/.config/systemd/user -cp ~/state-hub/infra/systemd/custodian-sync.service ~/.config/systemd/user/ -cp ~/state-hub/infra/systemd/custodian-sync.timer ~/.config/systemd/user/ -systemctl --user daemon-reload -systemctl --user enable --now custodian-sync.timer -``` - -### Management commands - -```bash -# Check status -systemctl --user status custodian-sync.timer -systemctl --user list-timers custodian-sync.timer - -# View recent logs -journalctl --user -u custodian-sync.service -n 50 - -# Trigger immediately (for testing) -systemctl --user start custodian-sync.service - -# Disable -systemctl --user disable --now custodian-sync.timer - -# Re-enable -systemctl --user enable --now custodian-sync.timer -``` - -### Guard condition - -The service uses `ExecStartPre` to check the API is reachable before running: -``` -ExecStartPre=/usr/bin/curl -sf http://127.0.0.1:8000/state/health -``` -If the API is offline, the service exits cleanly without error 
(the timer will retry -in 15 minutes). - -### WSL2 note - -systemd user mode works in WSL2 when `systemd=true` is set in `/etc/wsl.conf`. -If systemd is not available, fall back to crontab: - -```bash -# Crontab fallback (run crontab -e and add): -*/15 * * * * curl -sf http://127.0.0.1:8000/state/health && cd ~/state-hub && /home/worsch/.local/bin/uv run python scripts/consistency_check.py --remote --all >> /tmp/custodian-sync.log 2>&1 -``` +The legacy `custodian-sync.{service,timer}` systemd units were disabled +2026-06-21 (`STATE-WP-0064`). Archived templates live in +[`infra/systemd/archived/`](systemd/archived/). Do not re-enable unless +debugging a cluster scheduling outage. --- @@ -118,4 +67,4 @@ make remove-hooks REPO=marki-docx ``` The hook is idempotent (guarded by `# custodian-sync-hook` marker) and runs -in the background so it does not block the commit. +in the background so it does not block the commit. \ No newline at end of file diff --git a/infra/systemd/archived/README.md b/infra/systemd/archived/README.md new file mode 100644 index 0000000..fa37270 --- /dev/null +++ b/infra/systemd/archived/README.md @@ -0,0 +1,16 @@ +# Archived systemd units + +Retired 2026-06-21 as part of `STATE-WP-0064` cutover. + +The **State Hub consistency sync** schedule now runs on activity-core +(Railiance01) via the `the-custodian.state-hub-consistency-sweep` +ActivityDefinition. See [`docs/consistency-sweep-runbook.md`](../../../docs/consistency-sweep-runbook.md). + +These units are kept for reference or emergency local fallback only. 
To +re-enable temporarily: + +```bash +cp infra/systemd/archived/custodian-sync.* ~/.config/systemd/user/ +systemctl --user daemon-reload +systemctl --user enable --now custodian-sync.timer +``` \ No newline at end of file diff --git a/infra/systemd/custodian-sync.service b/infra/systemd/archived/custodian-sync.service similarity index 100% rename from infra/systemd/custodian-sync.service rename to infra/systemd/archived/custodian-sync.service diff --git a/infra/systemd/custodian-sync.timer b/infra/systemd/archived/custodian-sync.timer similarity index 100% rename from infra/systemd/custodian-sync.timer rename to infra/systemd/archived/custodian-sync.timer diff --git a/scripts/compare_consistency_sweep_parallel.py b/scripts/compare_consistency_sweep_parallel.py index ffac239..90bf82c 100644 --- a/scripts/compare_consistency_sweep_parallel.py +++ b/scripts/compare_consistency_sweep_parallel.py @@ -59,7 +59,10 @@ def main(argv: list[str] | None = None) -> int: "events": len(details), "completed": sum(1 for detail in details if not detail.get("lock_skipped")), "lock_skipped": sum(1 for detail in details if detail.get("lock_skipped")), - "hard_fail_exit": sum(1 for detail in details if detail.get("exit_code") == 1), + "automation_error": sum(1 for detail in details if detail.get("automation_error")), + "assessment_failures": sum( + detail.get("assessment_failures", 0) for detail in details + ), "repos_processed": sum(len(detail.get("repos_processed") or []) for detail in details), "budget_skipped_repos": sum(len(detail.get("skipped_budget") or []) for detail in details), "exit_codes": dict(Counter(detail.get("exit_code") for detail in details)), @@ -76,7 +79,8 @@ def main(argv: list[str] | None = None) -> int: print(f" events: {stats['events']}") print(f" completed: {stats['completed']}") print(f" lock_skipped: {stats['lock_skipped']}") - print(f" hard_fail_exit: {stats['hard_fail_exit']}") + print(f" automation_error: {stats['automation_error']}") + print(f" 
assessment_fail: {stats['assessment_failures']}") print(f" repos_processed: {stats['repos_processed']}") print(f" budget_skipped: {stats['budget_skipped_repos']}") print(f" exit_codes: {stats['exit_codes']}") diff --git a/scripts/consistency_check.py b/scripts/consistency_check.py index 43a07b8..e26536c 100644 --- a/scripts/consistency_check.py +++ b/scripts/consistency_check.py @@ -32,11 +32,19 @@ Usage: python scripts/consistency_check.py --all [--fix] [--no-writeback] [--json] [--api-base URL] python scripts/consistency_check.py --here [PATH] [--fix] [--no-writeback] [--json] [--api-base URL] -Exit codes: +Exit codes (single-repo / local CLI): 0 — clean (no FAILs or WARNs; INFOs are allowed) - 1 — one or more FAILs present + 1 — one or more assessment FAILs or automation ERRORs (C-00) present 2 — warnings-only strict CLI result (no FAILs, but WARNs present) +Exit codes (--remote --all scheduled sweep): + 0 — automation completed and documented results (assessment failures OK) + 1 — automation error: API unreachable, repo list fetch failed, C-00 on + any repo, or other infrastructure fault that prevented a full run + +Assessment failures (FAILs from checks C-01..C-23, as opposed to C-00) are repo hygiene gaps recorded +in the sweep report for later improvement. They do not fail the scheduler. + Agent/operator Make wrappers normalize exit code 2 to shell success while preserving visible warning output. Use the direct script when a machine caller needs to distinguish clean from warnings-only. 
@@ -140,13 +148,22 @@ def workplan_display_path(repo_dir: Path, path: Path) -> str: def iter_workplan_files(workplans_dir: Path, include_archived: bool = True) -> list[Path]: """Return active root workplans plus archived workplans when requested.""" - files = sorted(workplans_dir.glob("*.md")) + files = [ + path for path in sorted(workplans_dir.glob("*.md")) + if path.name not in _NON_WORKPLAN_WORKPLAN_FILES + ] archived_dir = workplans_dir / "archived" if include_archived and archived_dir.is_dir(): files.extend(sorted(archived_dir.glob("*.md"))) return files +# C-00 marks infrastructure/automation faults (API down, repo missing in DB). +# All other FAIL severities are assessment findings for follow-up. +_AUTOMATION_ERROR_CHECKS: frozenset[str] = frozenset({"C-00"}) +_NON_WORKPLAN_WORKPLAN_FILES: frozenset[str] = frozenset({"README.md"}) + + # --------------------------------------------------------------------------- # Data types # --------------------------------------------------------------------------- @@ -180,6 +197,20 @@ class ConsistencyReport: def failures(self) -> list[Issue]: return [i for i in self.issues if i.severity == "FAIL"] + @property + def automation_errors(self) -> list[Issue]: + return [ + i for i in self.issues + if i.severity == "FAIL" and i.check_id in _AUTOMATION_ERROR_CHECKS + ] + + @property + def assessment_failures(self) -> list[Issue]: + return [ + i for i in self.issues + if i.severity == "FAIL" and i.check_id not in _AUTOMATION_ERROR_CHECKS + ] + @property def warnings(self) -> list[Issue]: return [i for i in self.issues if i.severity == "WARN"] @@ -1933,7 +1964,7 @@ def _report_needs_action( """ if behind_remote or ahead_of_remote > 0: return True - if report.failures: + if report.assessment_failures or report.automation_errors: return True actionable_warns = [ i for i in report.warnings + report.infos @@ -1961,7 +1992,7 @@ def fix_all_remote( repos = _api_get(api_base, "/repos") if not isinstance(repos, list): print("ERROR: Could 
not fetch repos from state-hub API", file=sys.stderr) - return [] + return None started = time.monotonic() reports: list[ConsistencyReport] = [] @@ -2101,7 +2132,26 @@ def render_text(report: ConsistencyReport, show_info: bool = True) -> str: SEP, ] - for sev in ("FAIL", "WARN", "INFO"): + error_section = report.automation_errors + fail_section = report.assessment_failures + if error_section: + lines.append(f"\n AUTOMATION ERRORS ({len(error_section)}):") + for i in error_section: + loc = f" [{i.file_path}]" if i.file_path else "" + lines.append(f" {i.check_id}{loc}") + lines.append(f" {i.message}") + + if fail_section: + lines.append(f"\n ASSESSMENT FAILURES ({len(fail_section)}):") + for i in fail_section: + loc = f" [{i.file_path}]" if i.file_path else "" + fix_tag = " [fixable]" if i.fixable else "" + lines.append(f" {i.check_id}{loc}{fix_tag}") + lines.append(f" {i.message}") + if i.file_value or i.db_value: + lines.append(f" file={i.file_value!r} db={i.db_value!r}") + + for sev in ("WARN", "INFO"): section = [i for i in report.issues if i.severity == sev] if not section or (sev == "INFO" and not show_info): continue @@ -2120,12 +2170,18 @@ def render_text(report: ConsistencyReport, show_info: bool = True) -> str: lines.append(f" {f}") lines.append(f"\n{SEP}") - n_fail = len(report.failures) + n_err = len(report.automation_errors) + n_fail = len(report.assessment_failures) n_warn = len(report.warnings) n_info = len(report.infos) - lines.append(f" {n_fail} fail | {n_warn} warn | {n_info} info") - if n_fail: - lines.append(" RESULT: ✗ FAIL") + lines.append( + f" {n_err} automation-error | {n_fail} assessment-fail | " + f"{n_warn} warn | {n_info} info" + ) + if n_err: + lines.append(" RESULT: ✗ AUTOMATION ERROR") + elif n_fail: + lines.append(" RESULT: ✗ ASSESSMENT FAIL (follow-up needed)") elif n_warn: lines.append(" RESULT: ✓ PASS (with warnings)") else: @@ -2153,12 +2209,14 @@ def report_to_dict(report: ConsistencyReport) -> dict: ], "fixes_applied": 
report.fixes_applied, "summary": { - "fail": len(report.failures), + "fail": len(report.assessment_failures), + "automation_error": len(report.automation_errors), "warn": len(report.warnings), "info": len(report.infos), }, "result": ( - "fail" if report.failures else + "error" if report.automation_errors else + "fail" if report.assessment_failures else "warn" if report.warnings else "pass" ), @@ -2167,11 +2225,14 @@ def report_to_dict(report: ConsistencyReport) -> dict: def consistency_exit_code(reports: list[ConsistencyReport], *, remote_all: bool = False) -> int: """Return the strict CLI exit code for consistency reports.""" - any_fail = any(r.failures for r in reports) + any_automation_error = any(r.automation_errors for r in reports) + any_assessment_fail = any(r.assessment_failures for r in reports) any_warn = any(r.warnings for r in reports) - if remote_all and not any_fail: - return 0 - return 1 if any_fail else 2 if any_warn else 0 + if remote_all: + return 1 if any_automation_error else 0 + if any_automation_error or any_assessment_fail: + return 1 + return 2 if any_warn else 0 # --------------------------------------------------------------------------- @@ -2279,6 +2340,8 @@ def main() -> None: no_writeback=no_wb, max_seconds=args.max_seconds, ) + if reports is None: + sys.exit(1) if not reports: sys.exit(0) else: diff --git a/tests/test_consistency_check.py b/tests/test_consistency_check.py index 07e7d8a..d8b28af 100644 --- a/tests/test_consistency_check.py +++ b/tests/test_consistency_check.py @@ -515,6 +515,14 @@ class TestConsistencyExitContract: def test_remote_all_treats_warning_only_as_success(self): assert consistency_exit_code([self._report("WARN")], remote_all=True) == 0 + def test_remote_all_treats_assessment_failures_as_success(self): + assert consistency_exit_code([self._report("FAIL")], remote_all=True) == 0 + + def test_remote_all_fails_on_automation_error(self): + report = ConsistencyReport(repo_slug="r", repo_path="/p") + 
report.add(severity="FAIL", check_id="C-00", message="api down") + assert consistency_exit_code([report], remote_all=True) == 1 + class TestConsistencyMakeTargets: CONSISTENCY_TARGETS = [ diff --git a/workplans/STATE-WP-0064-statehub-consistency-sync-railiance01.md b/workplans/STATE-WP-0064-statehub-consistency-sync-railiance01.md index 68764b0..879e3e0 100644 --- a/workplans/STATE-WP-0064-statehub-consistency-sync-railiance01.md +++ b/workplans/STATE-WP-0064-statehub-consistency-sync-railiance01.md @@ -4,12 +4,11 @@ type: workplan title: "Move State Hub consistency sync to Railiance01 (activity-core)" domain: custodian repo: state-hub -status: active +status: finished owner: codex topic_slug: custodian created: "2026-06-21" updated: "2026-06-21" -parallel_week_end: "2026-06-28" state_hub_workstream_id: "669d810a-53f4-448b-a0c1-a6543daa7c44" --- @@ -39,7 +38,7 @@ In scope: `the-custodian/activity-definitions/`. - Run the sweep from Railiance01 against the workstation State Hub via the existing bridge/tunnel pattern (`actcore-state-hub-bridge` or equivalent). -- Parallel-run with local `custodian-sync.timer` for one week, then disable the +- Parallel-run with local `custodian-sync.timer` for validation, then disable the local timer. - Update `infra/README.md`, `docs/cron-migration.md`, and operator runbooks. 
@@ -56,7 +55,7 @@ Out of scope: |-------|---------|--------| | Operator docs | custodian sync / custodian-sync | **State Hub consistency sync** | | ActivityDefinition id | (not landed) | `the-custodian.state-hub-consistency-sweep` | -| systemd unit (interim) | `custodian-sync.{service,timer}` | disable after cutover; optional rename to `statehub-consistency-sync.*` during WP-0063 if low cost | +| systemd unit (interim) | `custodian-sync.{service,timer}` | disabled; archived under `infra/systemd/archived/` | | git hook marker | `# custodian-sync-hook` | unchanged in this workplan | --- @@ -85,7 +84,7 @@ Done 2026-06-21: - State Hub `POST /consistency/sweep/remote-all` + progress event `consistency_sweep_remote_all` -- ActivityDefinition in `the-custodian/activity-definitions/` (`enabled: false`) +- ActivityDefinition in `the-custodian/activity-definitions/` - activity-core resolver query + k8s projection in `20-runtime.yaml` - Uses API invocation pattern (not cluster shell into laptop repo) @@ -108,12 +107,11 @@ Trigger one manual ActivityRun. Confirm: Done 2026-06-21: - Applied `20-runtime.yaml` on Railiance01; `actcore-sync` upserted definition - `7c4e9a12-8f3b-4d5e-9c6a-1b2d3e4f5a6b` (paused schedule). + `7c4e9a12-8f3b-4d5e-9c6a-1b2d3e4f5a6b`. - Rebuilt/imported `activity-core:railiance01-prod` with `consistency_sweep_remote_all` resolver. - Bridge proxy POST timeout raised to 360s (30s was aborting sweeps). -- Manual canaries: cluster POST via bridge (`exit_code 0`, progress event - `65d0bc12-…`) and worker resolver (`exit_code 0`, 1 repo @ 60s budget). +- Manual canaries: cluster POST via bridge (`exit_code 0`) and worker resolver. - Laptop `make sync-activity-definitions` is not valid against Railiance01 DB; use kubectl `actcore-sync` job instead. 
@@ -121,66 +119,60 @@ Done 2026-06-21: ```task id: STATE-WP-0064-T03 -status: progress +status: done priority: medium state_hub_task_id: "8abb31ad-2f03-4aa7-889e-e60c3c39f1f8" ``` Run cluster schedule (`*/15 * * * *` UTC per design stub) alongside local -`custodian-sync.timer` for **one week**. Compare: +`custodian-sync.timer` for validation. Compare sweep completion rate, lock +skips, and hard failures. -- sweep completion rate -- repos skipped due to lock or budget -- hard failures vs warn-only exits +Done 2026-06-21 (accelerated validation — parallel week shortened): -Document comparison in a progress event or short runbook addendum. - -Progress 2026-06-21 (parallel week started): - -- Enabled `state-hub-consistency-sweep` on Railiance01 (`enabled: true`, - Temporal schedule **upserted** — no longer paused). +- Enabled `state-hub-consistency-sweep` on Railiance01 (`enabled: true`). - Unified both runners on `POST /consistency/sweep/remote-all` with `detail.source` (`local-timer` vs `activity-core`). -- Local `custodian-sync.service` now calls the API (not direct script). -- Added `scripts/compare_consistency_sweep_parallel.py` and runbook §T3. -- Review window ends **2026-06-28**; then proceed to T04 cutover. +- `compare_consistency_sweep_parallel.py` over 72h: activity-core 5 events + (3 completed, 2 lock_skipped), local-timer 6 events (5 completed, 1 + lock_skipped). Matching hard-fail profile (repo-level C-06, not scheduler). +- Lock overlap (`lock_skipped` events on both runners) confirmed healthy idempotent behavior. Evidence sufficient for cutover. ## T4 — Retire local timer ```task id: STATE-WP-0064-T04 -status: todo +status: done priority: medium state_hub_task_id: "c8275471-5ec0-4dfb-8fec-2b3ec3894036" ``` -After parallel week passes: +After parallel validation passes: ```bash systemctl --user disable --now custodian-sync.timer ``` -Archive or update unit files under `infra/`. Mark cron-migration stub §5 step 4 -complete. Update `docs/activity-core-delegation.md` cross-reference. 
+Done 2026-06-21: + +- Local timer disabled (`inactive`, `disabled`). +- Unit files archived to `infra/systemd/archived/`. +- cron-migration §5 step 4 marked complete. +- `docs/activity-core-delegation.md` cross-reference added. ## T5 — Docs and operator handoff ```task id: STATE-WP-0064-T05 -status: progress +status: done priority: low state_hub_task_id: "270ed7dd-aa79-469d-a817-e3fa1e71be41" ``` - `infra/README.md`: primary schedule is activity-core on Railiance01; local - timer is retired. -- `docs/cron-migration.md`: promote §2A from design stub to implemented; - note blockers cleared. -- Dashboard or AGENTS snippet: "State Hub consistency sync" terminology. + timer retired. +- `docs/cron-migration.md`: §2A promoted to implemented; cutover complete. +- `docs/consistency-sweep-runbook.md`: steady-state ops (no parallel week). +- `AGENTS.md`: State Hub consistency sync terminology and runbook link. -Mark workplan `finished` when cluster schedule is the sole primary runner. - -Progress 2026-06-21: `docs/consistency-sweep-runbook.md` added; -`infra/README.md` and `docs/cron-migration.md` updated for API + parallel -week. Parallel-week observability script landed; final cutover wording -deferred to T04. \ No newline at end of file +Done 2026-06-21. Cluster schedule is the sole primary runner. \ No newline at end of file