Load limiting safeguards

This commit is contained in:
2026-05-06 04:04:53 +02:00
parent 47f6971c56
commit 2484ed2815
22 changed files with 374 additions and 144 deletions

View File

@@ -38,11 +38,14 @@ Exit codes:
from __future__ import annotations
import argparse
import os
import json
import re
import socket
import subprocess
import sys
import time
from contextlib import contextmanager
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
@@ -72,6 +75,7 @@ VALID_WP_STATUSES = {"active", "completed", "archived"}
VALID_TASK_STATUSES = {"todo", "in_progress", "blocked", "done", "cancelled"}
VALID_TASK_PRIORITIES = {"low", "medium", "high", "critical"}
VALID_DEP_RELATIONSHIPS = {"blocks", "starts_after", "informs", "soft_dependency"}
DEFAULT_REMOTE_ALL_MAX_SECONDS = int(os.environ.get("CONSISTENCY_REMOTE_ALL_MAX_SECONDS", "300"))
# Workplan files use task-style vocabulary ("done"); the DB workstream API uses
# "completed". This map translates file values to DB values before comparison
@@ -161,6 +165,37 @@ class ConsistencyReport:
return [i for i in self.issues if i.severity == "INFO"]
@contextmanager
def run_lock(name: str):
    """Hold a nonblocking process lock for long-running consistency modes.

    The lock is an advisory ``flock`` on ``custodian-<name>.lock`` inside the
    directory named by ``CONSISTENCY_LOCK_DIR`` (default ``/tmp``).

    Yields:
        ``True`` when the lock was acquired (or when ``fcntl`` is not
        importable, in which case no locking is performed), ``False`` when
        another holder already has the lock. The caller decides what to do
        on ``False``; this function never blocks waiting for the lock.

    While held, the lock file contains ``"<pid> <UTC ISO timestamp>Z"`` of
    the holder.
    """
    try:
        import fcntl
    except ImportError:
        # No fcntl available: run unguarded instead of refusing to run.
        yield True
        return

    from datetime import timezone

    lock_path = Path(os.environ.get("CONSISTENCY_LOCK_DIR", "/tmp")) / f"custodian-{name}.lock"
    lock_path.parent.mkdir(parents=True, exist_ok=True)
    # Open with "a+" (create, but do NOT truncate): a contender that fails to
    # get the lock must not wipe the pid/timestamp written by the holder.
    with lock_path.open("a+", encoding="utf-8") as handle:
        try:
            fcntl.flock(handle, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except BlockingIOError:
            # Someone else holds the lock; report it and leave their file alone.
            yield False
            return
        try:
            # We own the lock now, so it is safe to replace the file content.
            handle.seek(0)
            handle.truncate()
            # Same "naive ISO + Z" format as before, without the deprecated
            # datetime.utcnow().
            stamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
            handle.write(f"{os.getpid()} {stamp}Z\n")
            handle.flush()
            yield True
        finally:
            # Best-effort explicit unlock; closing the handle releases it anyway.
            try:
                fcntl.flock(handle, fcntl.LOCK_UN)
            except OSError:
                pass
# ---------------------------------------------------------------------------
# YAML / frontmatter parsing
# ---------------------------------------------------------------------------
@@ -1591,6 +1626,7 @@ def _report_needs_action(
def fix_all_remote(
api_base: str,
no_writeback: bool = False,
max_seconds: int = DEFAULT_REMOTE_ALL_MAX_SECONDS,
) -> list[ConsistencyReport]:
"""Pull-then-fix all registered repos that need attention.
@@ -1608,12 +1644,19 @@ def fix_all_remote(
print("ERROR: Could not fetch repos from state-hub API", file=sys.stderr)
return []
started = time.monotonic()
reports: list[ConsistencyReport] = []
skipped_clean: list[str] = []
skipped_missing: list[str] = []
skipped_budget: list[str] = []
for repo in repos:
slug = repo["slug"]
if max_seconds > 0 and time.monotonic() - started > max_seconds:
skipped_budget.append(slug)
skipped_budget.extend(r.get("slug", "?") for r in repos[repos.index(repo) + 1:])
break
# Resolve path using the same priority as check_repo
path = resolve_repo_path(repo)
if not path or not Path(path).is_dir():
@@ -1646,7 +1689,12 @@ def fix_all_remote(
print(f" CLEAN (skipped): {', '.join(skipped_clean)}")
if skipped_missing:
print(f" NOT ON THIS HOST (skipped): {', '.join(skipped_missing)}")
if skipped_clean or skipped_missing:
if skipped_budget:
print(
f" BUDGET EXHAUSTED after {max_seconds}s (skipped): "
f"{', '.join(skipped_budget)}"
)
if skipped_clean or skipped_missing or skipped_budget:
print()
return reports
@@ -1803,6 +1851,9 @@ def main() -> None:
help="Pull each repo before fixing; when used with --all, skips repos "
"that are already clean (no actionable issues, not behind remote). "
"Implies --fix.")
parser.add_argument("--max-seconds", type=int, default=DEFAULT_REMOTE_ALL_MAX_SECONDS,
help="Wall-clock budget for --remote --all before remaining repos are skipped "
f"(default: {DEFAULT_REMOTE_ALL_MAX_SECONDS}; 0 disables)")
parser.add_argument("--no-writeback", action="store_true", dest="no_writeback",
help="Disable DB→file status writeback (C-15) while keeping other fixes")
parser.add_argument("--archive-closed", action="store_true",
@@ -1849,7 +1900,18 @@ def main() -> None:
reports[0].fixes_applied.extend(f"archive: {m}" for m in moved)
# --remote --all: smart pull+fix across all repos
elif args.remote and args.all:
reports = fix_all_remote(args.api_base, no_writeback=no_wb)
with run_lock("consistency-remote-all") as acquired:
if not acquired:
print(
"SKIP: another fix-consistency-remote --all run is already active",
file=sys.stderr,
)
sys.exit(0)
reports = fix_all_remote(
args.api_base,
no_writeback=no_wb,
max_seconds=args.max_seconds,
)
if not reports:
sys.exit(0)
else:
@@ -1915,6 +1977,8 @@ def main() -> None:
any_fail = any(r.failures for r in reports)
any_warn = any(r.warnings for r in reports)
if args.remote and args.all and not any_fail:
sys.exit(0)
sys.exit(1 if any_fail else 2 if any_warn else 0)

View File

@@ -104,6 +104,10 @@ ExecStart=… consistency_check.py --remote --all
2. Skips repos that are already clean (no issues, not behind, not ahead)
3. For repos needing action: `git pull --ff-only` first, then `fix_repo()` (which ends with T04 push)
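It also holds `/tmp/custodian-consistency-remote-all.lock` (directory overridable via `CONSISTENCY_LOCK_DIR`) for the duration of
the sweep and defaults to a 300-second wall-clock budget (`--max-seconds`, or `CONSISTENCY_REMOTE_ALL_MAX_SECONDS`; `0` disables it). If the lock is already held, the new run exits immediately; the budget is checked before each repo, and once it is exceeded the remaining repos are skipped. These guards keep a
slow or stalled sweep from overlapping with the next 15-minute timer activation.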
Previously `--all --fix` was used, which skipped the pull step and the clean-repo skip logic.
---