finish(STATE-WP-0064): cut over scheduler and split sweep errors from failures

STATE-WP-0064 cutover (state-hub only):
- Retire local custodian-sync.timer; archive units under infra/systemd/archived/
- Mark workplan finished; update infra/README, cron-migration, runbook, AGENTS.md
- Point activity-core-delegation at the consistency-sweep runbook

Consistency engine — automation error vs assessment failure:
- C-00 is an automation error; C-01..C-23 assessment failures are recorded for follow-up but no longer fail --remote --all scheduled sweeps (exit 0)
- Skip workplans/README.md in the workplan glob (human index, not a workplan)
- Progress events and compare script expose automation_error and assessment_failures separately from exit_code
2026-06-22 01:20:59 +02:00
parent 270033a50d
commit 39ed5459b9
14 changed files with 221 additions and 180 deletions
--- a/scripts/compare_consistency_sweep_parallel.py
+++ b/scripts/compare_consistency_sweep_parallel.py
@@ -59,7 +59,10 @@ def main(argv: list[str] | None = None) -> int:
            "events": len(details),
            "completed": sum(1 for detail in details if not detail.get("lock_skipped")),
            "lock_skipped": sum(1 for detail in details if detail.get("lock_skipped")),
-            "hard_fail_exit": sum(1 for detail in details if detail.get("exit_code") == 1),
+            "automation_error": sum(1 for detail in details if detail.get("automation_error")),
+            "assessment_failures": sum(
+                detail.get("assessment_failures", 0) for detail in details
+            ),
            "repos_processed": sum(len(detail.get("repos_processed") or []) for detail in details),
            "budget_skipped_repos": sum(len(detail.get("skipped_budget") or []) for detail in details),
            "exit_codes": dict(Counter(detail.get("exit_code") for detail in details)),
@@ -76,7 +79,8 @@ def main(argv: list[str] | None = None) -> int:
        print(f"  events:          {stats['events']}")
        print(f"  completed:       {stats['completed']}")
        print(f"  lock_skipped:    {stats['lock_skipped']}")
-        print(f"  hard_fail_exit:  {stats['hard_fail_exit']}")
+        print(f"  automation_error: {stats['automation_error']}")
+        print(f"  assessment_fail: {stats['assessment_failures']}")
        print(f"  repos_processed: {stats['repos_processed']}")
        print(f"  budget_skipped:  {stats['budget_skipped_repos']}")
        print(f"  exit_codes:      {stats['exit_codes']}")
--- a/scripts/consistency_check.py
+++ b/scripts/consistency_check.py
@@ -32,11 +32,19 @@ Usage:
    python scripts/consistency_check.py --all [--fix] [--no-writeback] [--json] [--api-base URL]
    python scripts/consistency_check.py --here [PATH] [--fix] [--no-writeback] [--json] [--api-base URL]

-Exit codes:
+Exit codes (single-repo / local CLI):
    0 — clean (no FAILs or WARNs; INFOs are allowed)
-    1 — one or more FAILs present
+    1 — one or more assessment FAILs or automation ERRORs (C-00) present
    2 — warnings-only strict CLI result (no FAILs, but WARNs present)

+Exit codes (--remote --all scheduled sweep):
+    0 — automation completed and results were recorded (assessment failures allowed)
+    1 — automation error: API unreachable, repo list fetch failed, C-00 on
+        any repo, or other infrastructure fault that prevented a full run
+
+Assessment failures (FAILs from checks C-01..C-23) are repo hygiene gaps recorded
+in the sweep report for later improvement. They do not fail the scheduler.
+
 Agent/operator Make wrappers normalize exit code 2 to shell success while
 preserving visible warning output. Use the direct script when a machine caller
 needs to distinguish clean from warnings-only.
@@ -140,13 +148,22 @@ def workplan_display_path(repo_dir: Path, path: Path) -> str:

 def iter_workplan_files(workplans_dir: Path, include_archived: bool = True) -> list[Path]:
    """Return active root workplans plus archived workplans when requested."""
-    files = sorted(workplans_dir.glob("*.md"))
+    files = [
+        path for path in sorted(workplans_dir.glob("*.md"))
+        if path.name not in _NON_WORKPLAN_WORKPLAN_FILES
+    ]
    archived_dir = workplans_dir / "archived"
    if include_archived and archived_dir.is_dir():
        files.extend(sorted(archived_dir.glob("*.md")))
    return files


+# C-00 marks infrastructure/automation faults (API down, repo missing in DB).
+# FAIL issues from every other check are assessment findings for follow-up.
+_AUTOMATION_ERROR_CHECKS: frozenset[str] = frozenset({"C-00"})
+_NON_WORKPLAN_WORKPLAN_FILES: frozenset[str] = frozenset({"README.md"})
+
+
 # ---------------------------------------------------------------------------
 # Data types
 # ---------------------------------------------------------------------------
@@ -180,6 +197,20 @@ class ConsistencyReport:
    def failures(self) -> list[Issue]:
        return [i for i in self.issues if i.severity == "FAIL"]

+    @property
+    def automation_errors(self) -> list[Issue]:
+        return [
+            i for i in self.issues
+            if i.severity == "FAIL" and i.check_id in _AUTOMATION_ERROR_CHECKS
+        ]
+
+    @property
+    def assessment_failures(self) -> list[Issue]:
+        return [
+            i for i in self.issues
+            if i.severity == "FAIL" and i.check_id not in _AUTOMATION_ERROR_CHECKS
+        ]
+
    @property
    def warnings(self) -> list[Issue]:
        return [i for i in self.issues if i.severity == "WARN"]
@@ -1933,7 +1964,7 @@ def _report_needs_action(
    """
    if behind_remote or ahead_of_remote > 0:
        return True
-    if report.failures:
+    if report.assessment_failures or report.automation_errors:
        return True
    actionable_warns = [
        i for i in report.warnings + report.infos
@@ -1961,7 +1992,7 @@ def fix_all_remote(
    repos = _api_get(api_base, "/repos")
    if not isinstance(repos, list):
        print("ERROR: Could not fetch repos from state-hub API", file=sys.stderr)
-        return []
+        return None

    started = time.monotonic()
    reports: list[ConsistencyReport] = []
@@ -2101,7 +2132,26 @@ def render_text(report: ConsistencyReport, show_info: bool = True) -> str:
        SEP,
    ]

-    for sev in ("FAIL", "WARN", "INFO"):
+    error_section = report.automation_errors
+    fail_section = report.assessment_failures
+    if error_section:
+        lines.append(f"\n  AUTOMATION ERRORS ({len(error_section)}):")
+        for i in error_section:
+            loc = f"  [{i.file_path}]" if i.file_path else ""
+            lines.append(f"    {i.check_id}{loc}")
+            lines.append(f"      {i.message}")
+
+    if fail_section:
+        lines.append(f"\n  ASSESSMENT FAILURES ({len(fail_section)}):")
+        for i in fail_section:
+            loc = f"  [{i.file_path}]" if i.file_path else ""
+            fix_tag = " [fixable]" if i.fixable else ""
+            lines.append(f"    {i.check_id}{loc}{fix_tag}")
+            lines.append(f"      {i.message}")
+            if i.file_value or i.db_value:
+                lines.append(f"      file={i.file_value!r}  db={i.db_value!r}")
+
+    for sev in ("WARN", "INFO"):
        section = [i for i in report.issues if i.severity == sev]
        if not section or (sev == "INFO" and not show_info):
            continue
@@ -2120,12 +2170,18 @@ def render_text(report: ConsistencyReport, show_info: bool = True) -> str:
            lines.append(f"    {f}")

    lines.append(f"\n{SEP}")
-    n_fail = len(report.failures)
+    n_err = len(report.automation_errors)
+    n_fail = len(report.assessment_failures)
    n_warn = len(report.warnings)
    n_info = len(report.infos)
-    lines.append(f"  {n_fail} fail | {n_warn} warn | {n_info} info")
-    if n_fail:
-        lines.append("  RESULT: ✗ FAIL")
+    lines.append(
+        f"  {n_err} automation-error | {n_fail} assessment-fail | "
+        f"{n_warn} warn | {n_info} info"
+    )
+    if n_err:
+        lines.append("  RESULT: ✗ AUTOMATION ERROR")
+    elif n_fail:
+        lines.append("  RESULT: ✗ ASSESSMENT FAIL (follow-up needed)")
    elif n_warn:
        lines.append("  RESULT: ✓ PASS (with warnings)")
    else:
@@ -2153,12 +2209,14 @@ def report_to_dict(report: ConsistencyReport) -> dict:
        ],
        "fixes_applied": report.fixes_applied,
        "summary": {
-            "fail": len(report.failures),
+            "fail": len(report.assessment_failures),
+            "automation_error": len(report.automation_errors),
            "warn": len(report.warnings),
            "info": len(report.infos),
        },
        "result": (
-            "fail" if report.failures else
+            "error" if report.automation_errors else
+            "fail" if report.assessment_failures else
            "warn" if report.warnings else
            "pass"
        ),
@@ -2167,11 +2225,14 @@ def report_to_dict(report: ConsistencyReport) -> dict:

 def consistency_exit_code(reports: list[ConsistencyReport], *, remote_all: bool = False) -> int:
    """Return the strict CLI exit code for consistency reports."""
-    any_fail = any(r.failures for r in reports)
+    any_automation_error = any(r.automation_errors for r in reports)
+    any_assessment_fail = any(r.assessment_failures for r in reports)
    any_warn = any(r.warnings for r in reports)
-    if remote_all and not any_fail:
-        return 0
-    return 1 if any_fail else 2 if any_warn else 0
+    if remote_all:
+        return 1 if any_automation_error else 0
+    if any_automation_error or any_assessment_fail:
+        return 1
+    return 2 if any_warn else 0


 # ---------------------------------------------------------------------------
@@ -2279,6 +2340,8 @@ def main() -> None:
                no_writeback=no_wb,
                max_seconds=args.max_seconds,
            )
+        if reports is None:
+            sys.exit(1)
        if not reports:
            sys.exit(0)
    else: