finish(STATE-WP-0064): cut over scheduler and split sweep errors from failures

STATE-WP-0064 cutover (state-hub only):
- Retire local custodian-sync.timer; archive units under infra/systemd/archived/
- Mark workplan finished; update infra/README, cron-migration, runbook, AGENTS.md
- Point activity-core-delegation at the consistency-sweep runbook

Consistency engine — automation error vs assessment failure:
- C-00 is an automation error; C-01..C-23 are assessment failures, which are
  recorded for follow-up but no longer fail scheduled --remote --all sweeps (exit 0)
- Skip workplans/README.md in the workplan glob (human index, not a workplan)
- Progress events and the compare script expose automation_error and
  assessment_failures separately from exit_code
This commit is contained in:
2026-06-22 01:20:59 +02:00
parent 270033a50d
commit 39ed5459b9
14 changed files with 221 additions and 180 deletions

View File

@@ -8,6 +8,7 @@ from pydantic import BaseModel, Field
class ConsistencySweepIssueSummary(BaseModel):
fail: int = 0
automation_error: int = 0
warn: int = 0
info: int = 0
@@ -39,6 +40,7 @@ class ConsistencySweepRemoteAllRun(BaseModel):
max_seconds: int
source: str
exit_code: int
automation_error: bool = False
lock_skipped: bool
repos_processed: list[ConsistencySweepRepoResult] = Field(default_factory=list)
skipped_clean: list[str] = Field(default_factory=list)

View File

@@ -83,6 +83,7 @@ def _parse_stdout(stdout: str) -> list[ConsistencySweepRepoResult]:
result=str(item.get("result") or "pass"),
summary=ConsistencySweepIssueSummary(
fail=int(summary.get("fail", 0)),
automation_error=int(summary.get("automation_error", 0)),
warn=int(summary.get("warn", 0)),
info=int(summary.get("info", 0)),
),
@@ -121,6 +122,7 @@ async def run_remote_all_sweep(
stderr_meta = _parse_stderr(result.stderr)
repos_processed = [] if lock_skipped else _parse_stdout(result.stdout)
automation_error = result.returncode != 0 and not lock_skipped
progress_event_id = await _log_sweep_progress(
session,
started_at=started_at,
@@ -128,6 +130,7 @@ async def run_remote_all_sweep(
max_seconds=max_seconds,
source=source,
exit_code=result.returncode,
automation_error=automation_error,
lock_skipped=lock_skipped,
repos_processed=repos_processed,
**stderr_meta,
@@ -138,6 +141,7 @@ async def run_remote_all_sweep(
max_seconds=max_seconds,
source=source,
exit_code=result.returncode,
automation_error=automation_error,
lock_skipped=lock_skipped,
repos_processed=repos_processed,
skipped_clean=stderr_meta["skipped_clean"],
@@ -155,6 +159,7 @@ async def _log_sweep_progress(
max_seconds: int,
source: str,
exit_code: int,
automation_error: bool,
lock_skipped: bool,
repos_processed: list[ConsistencySweepRepoResult],
skipped_clean: list[str],
@@ -162,16 +167,23 @@ async def _log_sweep_progress(
skipped_budget: list[str],
) -> uuid.UUID:
processed_count = len(repos_processed)
fail_count = sum(1 for repo in repos_processed if repo.result == "fail")
error_count = sum(1 for repo in repos_processed if repo.result == "error")
assessment_fail_count = sum(1 for repo in repos_processed if repo.result == "fail")
warn_count = sum(1 for repo in repos_processed if repo.result == "warn")
if lock_skipped:
summary = "State Hub consistency sweep skipped: prior remote-all run still active"
elif automation_error:
summary = (
"State Hub consistency sweep automation error: "
f"exit_code={exit_code}, {processed_count} repos partially processed"
)
else:
summary = (
"State Hub consistency sweep completed: "
f"{processed_count} processed, {len(skipped_clean)} clean, "
f"{len(skipped_missing)} missing, {len(skipped_budget)} budget-skipped, "
f"{fail_count} failed, {warn_count} warned"
f"{assessment_fail_count} assessment-fail, {error_count} automation-error, "
f"{warn_count} warned"
)
event = ProgressEvent(
event_type="consistency_sweep_remote_all",
@@ -182,6 +194,9 @@ async def _log_sweep_progress(
"max_seconds": max_seconds,
"source": source,
"exit_code": exit_code,
"automation_error": automation_error,
"assessment_failures": assessment_fail_count,
"automation_errors": error_count,
"lock_skipped": lock_skipped,
"repos_processed": [item.model_dump(mode="json") for item in repos_processed],
"skipped_clean": skipped_clean,