finish(STATE-WP-0064): cut over scheduler and split sweep errors from failures

STATE-WP-0064 cutover (state-hub only):
- Retire local custodian-sync.timer; archive units under infra/systemd/archived/
- Mark workplan finished; update infra/README, cron-migration, runbook, AGENTS.md
- Point activity-core-delegation at the consistency-sweep runbook

Consistency engine — automation error vs assessment failure:
- C-00 is an automation error; C-01..C-23 assessment failures are recorded
  for follow-up but no longer fail --remote --all scheduled sweeps (exit 0)
- Skip workplans/README.md in the workplan glob (human index, not a workplan)
- Progress events and compare script expose automation_error and
  assessment_failures separately from exit_code
This commit is contained in:
2026-06-22 01:20:59 +02:00
parent 270033a50d
commit 39ed5459b9
14 changed files with 221 additions and 180 deletions

View File

@@ -59,7 +59,10 @@ def main(argv: list[str] | None = None) -> int:
"events": len(details),
"completed": sum(1 for detail in details if not detail.get("lock_skipped")),
"lock_skipped": sum(1 for detail in details if detail.get("lock_skipped")),
"hard_fail_exit": sum(1 for detail in details if detail.get("exit_code") == 1),
"automation_error": sum(1 for detail in details if detail.get("automation_error")),
"assessment_failures": sum(
detail.get("assessment_failures", 0) for detail in details
),
"repos_processed": sum(len(detail.get("repos_processed") or []) for detail in details),
"budget_skipped_repos": sum(len(detail.get("skipped_budget") or []) for detail in details),
"exit_codes": dict(Counter(detail.get("exit_code") for detail in details)),
@@ -76,7 +79,8 @@ def main(argv: list[str] | None = None) -> int:
print(f" events: {stats['events']}")
print(f" completed: {stats['completed']}")
print(f" lock_skipped: {stats['lock_skipped']}")
print(f" hard_fail_exit: {stats['hard_fail_exit']}")
print(f" automation_error: {stats['automation_error']}")
print(f" assessment_fail: {stats['assessment_failures']}")
print(f" repos_processed: {stats['repos_processed']}")
print(f" budget_skipped: {stats['budget_skipped_repos']}")
print(f" exit_codes: {stats['exit_codes']}")

View File

@@ -32,11 +32,19 @@ Usage:
python scripts/consistency_check.py --all [--fix] [--no-writeback] [--json] [--api-base URL]
python scripts/consistency_check.py --here [PATH] [--fix] [--no-writeback] [--json] [--api-base URL]
Exit codes:
Exit codes (single-repo / local CLI):
0 — clean (no FAILs or WARNs; INFOs are allowed)
1 — one or more FAILs present
1 — one or more assessment FAILs or automation ERRORs (C-00) present
2 — warnings-only strict CLI result (no FAILs, but WARNs present)
Exit codes (--remote --all scheduled sweep):
0 — automation completed and documented results (assessment failures OK)
1 — automation error: API unreachable, repo list fetch failed, C-00 on
any repo, or other infrastructure fault that prevented a full run
Assessment failures (FAILs from C-01..C-23, i.e. every check other than C-00)
are repo hygiene gaps recorded in the sweep report for later improvement.
They do not fail the scheduler.
Agent/operator Make wrappers normalize exit code 2 to shell success while
preserving visible warning output. Use the direct script when a machine caller
needs to distinguish clean from warnings-only.
@@ -140,13 +148,22 @@ def workplan_display_path(repo_dir: Path, path: Path) -> str:
def iter_workplan_files(workplans_dir: Path, include_archived: bool = True) -> list[Path]:
    """Return active root workplans plus archived workplans when requested.

    Args:
        workplans_dir: Directory whose top-level ``*.md`` files are listed.
        include_archived: When true, ``*.md`` files from the ``archived``
            subdirectory are appended, provided that directory exists.

    Returns:
        Root-level files in sorted order, minus any whose name appears in
        ``_NON_WORKPLAN_WORKPLAN_FILES``, followed by the archived files in
        sorted order.
    """
    # Glob the root directory once; the name filter drops files that live in
    # the workplans directory without being workplans themselves.
    files = [
        path for path in sorted(workplans_dir.glob("*.md"))
        if path.name not in _NON_WORKPLAN_WORKPLAN_FILES
    ]
    archived_dir = workplans_dir / "archived"
    if include_archived and archived_dir.is_dir():
        files.extend(sorted(archived_dir.glob("*.md")))
    return files
# C-00 marks infrastructure/automation faults (API down, repo missing in DB).
# All other FAIL severities are assessment findings for follow-up.
_AUTOMATION_ERROR_CHECKS: frozenset[str] = frozenset({"C-00"})
# File names that live in the workplans directory but are not workplans
# (README.md is the human index); the root workplan glob filters them out.
_NON_WORKPLAN_WORKPLAN_FILES: frozenset[str] = frozenset({"README.md"})
# ---------------------------------------------------------------------------
# Data types
# ---------------------------------------------------------------------------
@@ -180,6 +197,20 @@ class ConsistencyReport:
def failures(self) -> list[Issue]:
return [i for i in self.issues if i.severity == "FAIL"]
@property
def automation_errors(self) -> list[Issue]:
    """Return the FAIL issues produced by automation-error checks.

    An issue is selected when its severity is ``"FAIL"`` and its
    ``check_id`` is a member of ``_AUTOMATION_ERROR_CHECKS``. The order of
    ``self.issues`` is preserved.
    """
    selected = []
    for issue in self.issues:
        if issue.severity != "FAIL":
            continue
        if issue.check_id in _AUTOMATION_ERROR_CHECKS:
            selected.append(issue)
    return selected
@property
def assessment_failures(self) -> list[Issue]:
    """Return the FAIL issues that are not automation errors.

    An issue is selected when its severity is ``"FAIL"`` and its
    ``check_id`` is absent from ``_AUTOMATION_ERROR_CHECKS``. The order of
    ``self.issues`` is preserved.
    """
    selected = []
    for issue in self.issues:
        if issue.severity != "FAIL":
            continue
        if issue.check_id not in _AUTOMATION_ERROR_CHECKS:
            selected.append(issue)
    return selected
@property
def warnings(self) -> list[Issue]:
    """Return every recorded issue whose severity is ``"WARN"``, in order."""
    warn_issues = []
    for issue in self.issues:
        if issue.severity == "WARN":
            warn_issues.append(issue)
    return warn_issues
@@ -1933,7 +1964,7 @@ def _report_needs_action(
"""
if behind_remote or ahead_of_remote > 0:
return True
if report.failures:
if report.assessment_failures or report.automation_errors:
return True
actionable_warns = [
i for i in report.warnings + report.infos
@@ -1961,7 +1992,7 @@ def fix_all_remote(
repos = _api_get(api_base, "/repos")
if not isinstance(repos, list):
print("ERROR: Could not fetch repos from state-hub API", file=sys.stderr)
return []
return None
started = time.monotonic()
reports: list[ConsistencyReport] = []
@@ -2101,7 +2132,26 @@ def render_text(report: ConsistencyReport, show_info: bool = True) -> str:
SEP,
]
for sev in ("FAIL", "WARN", "INFO"):
error_section = report.automation_errors
fail_section = report.assessment_failures
if error_section:
lines.append(f"\n AUTOMATION ERRORS ({len(error_section)}):")
for i in error_section:
loc = f" [{i.file_path}]" if i.file_path else ""
lines.append(f" {i.check_id}{loc}")
lines.append(f" {i.message}")
if fail_section:
lines.append(f"\n ASSESSMENT FAILURES ({len(fail_section)}):")
for i in fail_section:
loc = f" [{i.file_path}]" if i.file_path else ""
fix_tag = " [fixable]" if i.fixable else ""
lines.append(f" {i.check_id}{loc}{fix_tag}")
lines.append(f" {i.message}")
if i.file_value or i.db_value:
lines.append(f" file={i.file_value!r} db={i.db_value!r}")
for sev in ("WARN", "INFO"):
section = [i for i in report.issues if i.severity == sev]
if not section or (sev == "INFO" and not show_info):
continue
@@ -2120,12 +2170,18 @@ def render_text(report: ConsistencyReport, show_info: bool = True) -> str:
lines.append(f" {f}")
lines.append(f"\n{SEP}")
n_fail = len(report.failures)
n_err = len(report.automation_errors)
n_fail = len(report.assessment_failures)
n_warn = len(report.warnings)
n_info = len(report.infos)
lines.append(f" {n_fail} fail | {n_warn} warn | {n_info} info")
if n_fail:
lines.append(" RESULT: ✗ FAIL")
lines.append(
f" {n_err} automation-error | {n_fail} assessment-fail | "
f"{n_warn} warn | {n_info} info"
)
if n_err:
lines.append(" RESULT: ✗ AUTOMATION ERROR")
elif n_fail:
lines.append(" RESULT: ✗ ASSESSMENT FAIL (follow-up needed)")
elif n_warn:
lines.append(" RESULT: ✓ PASS (with warnings)")
else:
@@ -2153,12 +2209,14 @@ def report_to_dict(report: ConsistencyReport) -> dict:
],
"fixes_applied": report.fixes_applied,
"summary": {
"fail": len(report.failures),
"fail": len(report.assessment_failures),
"automation_error": len(report.automation_errors),
"warn": len(report.warnings),
"info": len(report.infos),
},
"result": (
"fail" if report.failures else
"error" if report.automation_errors else
"fail" if report.assessment_failures else
"warn" if report.warnings else
"pass"
),
@@ -2167,11 +2225,14 @@ def report_to_dict(report: ConsistencyReport) -> dict:
def consistency_exit_code(reports: list[ConsistencyReport], *, remote_all: bool = False) -> int:
    """Return the strict CLI exit code for consistency reports.

    Args:
        reports: Reports to summarise; each is read through its
            ``automation_errors``, ``assessment_failures`` and ``warnings``
            attributes.
        remote_all: True for the scheduled ``--remote --all`` sweep, where
            only automation errors fail the run.

    Returns:
        With ``remote_all``: 1 if any report has an automation error, else 0
        (assessment failures and warnings do not fail the sweep).
        Otherwise: 1 if any report has an automation error or an assessment
        failure, 2 if there are only warnings, 0 when clean.
    """
    any_automation_error = any(r.automation_errors for r in reports)
    any_assessment_fail = any(r.assessment_failures for r in reports)
    any_warn = any(r.warnings for r in reports)
    if remote_all:
        # Scheduled sweeps record assessment failures for follow-up; only an
        # infrastructure/automation fault should fail the scheduler.
        return 1 if any_automation_error else 0
    if any_automation_error or any_assessment_fail:
        return 1
    return 2 if any_warn else 0
# ---------------------------------------------------------------------------
@@ -2279,6 +2340,8 @@ def main() -> None:
no_writeback=no_wb,
max_seconds=args.max_seconds,
)
if reports is None:
sys.exit(1)
if not reports:
sys.exit(0)
else: