generated from coulomb/repo-seed
Load limiting safeguards
This commit is contained in:
@@ -38,11 +38,14 @@ Exit codes:
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import json
|
||||
import re
|
||||
import socket
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from contextlib import contextmanager
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
@@ -72,6 +75,7 @@ VALID_WP_STATUSES = {"active", "completed", "archived"}
|
||||
VALID_TASK_STATUSES = {"todo", "in_progress", "blocked", "done", "cancelled"}
|
||||
VALID_TASK_PRIORITIES = {"low", "medium", "high", "critical"}
|
||||
VALID_DEP_RELATIONSHIPS = {"blocks", "starts_after", "informs", "soft_dependency"}
|
||||
DEFAULT_REMOTE_ALL_MAX_SECONDS = int(os.environ.get("CONSISTENCY_REMOTE_ALL_MAX_SECONDS", "300"))
|
||||
|
||||
# Workplan files use task-style vocabulary ("done"); the DB workstream API uses
|
||||
# "completed". This map translates file values to DB values before comparison
|
||||
@@ -161,6 +165,37 @@ class ConsistencyReport:
|
||||
return [i for i in self.issues if i.severity == "INFO"]
|
||||
|
||||
|
||||
@contextmanager
def run_lock(name: str):
    """Hold a nonblocking process lock for long-running consistency modes.

    The lock is an exclusive ``flock`` on ``custodian-<name>.lock`` inside the
    directory named by the ``CONSISTENCY_LOCK_DIR`` environment variable
    (default ``/tmp``).

    Yields:
        ``True`` when the lock was acquired, or when ``fcntl`` cannot be
        imported (no locking is performed in that case). ``False`` when the
        lock is already held through another open file handle.

    While the lock is held the file contains ``"<pid> <UTC ISO timestamp>Z"``
    describing the holder.
    """
    try:
        import fcntl
    except ImportError:
        # No flock support on this platform: run unguarded rather than fail.
        yield True
        return

    # Local import: the module only imports ``datetime`` from ``datetime``.
    from datetime import timezone

    lock_path = Path(os.environ.get("CONSISTENCY_LOCK_DIR", "/tmp")) / f"custodian-{name}.lock"
    lock_path.parent.mkdir(parents=True, exist_ok=True)

    # "a+" creates the file if needed but, unlike "w", does not truncate it on
    # open. A contender that fails to get the lock must not erase the
    # PID/timestamp written by the process that currently holds it.
    handle = lock_path.open("a+", encoding="utf-8")
    acquired = False
    try:
        try:
            fcntl.flock(handle, fcntl.LOCK_EX | fcntl.LOCK_NB)
            acquired = True
        except BlockingIOError:
            # Lock is held elsewhere; report that to the caller below.
            pass

        if acquired:
            # Only the lock holder may rewrite the file contents.
            handle.seek(0)
            handle.truncate()
            # Aware "now" converted back to naive keeps the original
            # "<isoformat>Z" layout without the deprecated utcnow().
            stamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
            handle.write(f"{os.getpid()} {stamp}Z\n")
            handle.flush()

        # Yield outside the except handler so exceptions raised in the
        # caller's with-body are not chained to the lock-contention error.
        yield acquired
    finally:
        if acquired:
            try:
                fcntl.flock(handle, fcntl.LOCK_UN)
            except OSError:
                # Best effort: closing the handle below releases the lock too.
                pass
        handle.close()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# YAML / frontmatter parsing
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -1591,6 +1626,7 @@ def _report_needs_action(
|
||||
def fix_all_remote(
|
||||
api_base: str,
|
||||
no_writeback: bool = False,
|
||||
max_seconds: int = DEFAULT_REMOTE_ALL_MAX_SECONDS,
|
||||
) -> list[ConsistencyReport]:
|
||||
"""Pull-then-fix all registered repos that need attention.
|
||||
|
||||
@@ -1608,12 +1644,19 @@ def fix_all_remote(
|
||||
print("ERROR: Could not fetch repos from state-hub API", file=sys.stderr)
|
||||
return []
|
||||
|
||||
started = time.monotonic()
|
||||
reports: list[ConsistencyReport] = []
|
||||
skipped_clean: list[str] = []
|
||||
skipped_missing: list[str] = []
|
||||
skipped_budget: list[str] = []
|
||||
|
||||
for repo in repos:
|
||||
slug = repo["slug"]
|
||||
if max_seconds > 0 and time.monotonic() - started > max_seconds:
|
||||
skipped_budget.append(slug)
|
||||
skipped_budget.extend(r.get("slug", "?") for r in repos[repos.index(repo) + 1:])
|
||||
break
|
||||
|
||||
# Resolve path using the same priority as check_repo
|
||||
path = resolve_repo_path(repo)
|
||||
if not path or not Path(path).is_dir():
|
||||
@@ -1646,7 +1689,12 @@ def fix_all_remote(
|
||||
print(f" CLEAN (skipped): {', '.join(skipped_clean)}")
|
||||
if skipped_missing:
|
||||
print(f" NOT ON THIS HOST (skipped): {', '.join(skipped_missing)}")
|
||||
if skipped_clean or skipped_missing:
|
||||
if skipped_budget:
|
||||
print(
|
||||
f" BUDGET EXHAUSTED after {max_seconds}s (skipped): "
|
||||
f"{', '.join(skipped_budget)}"
|
||||
)
|
||||
if skipped_clean or skipped_missing or skipped_budget:
|
||||
print()
|
||||
|
||||
return reports
|
||||
@@ -1803,6 +1851,9 @@ def main() -> None:
|
||||
help="Pull each repo before fixing; when used with --all, skips repos "
|
||||
"that are already clean (no actionable issues, not behind remote). "
|
||||
"Implies --fix.")
|
||||
parser.add_argument("--max-seconds", type=int, default=DEFAULT_REMOTE_ALL_MAX_SECONDS,
|
||||
help="Wall-clock budget for --remote --all before remaining repos are skipped "
|
||||
f"(default: {DEFAULT_REMOTE_ALL_MAX_SECONDS}; 0 disables)")
|
||||
parser.add_argument("--no-writeback", action="store_true", dest="no_writeback",
|
||||
help="Disable DB→file status writeback (C-15) while keeping other fixes")
|
||||
parser.add_argument("--archive-closed", action="store_true",
|
||||
@@ -1849,7 +1900,18 @@ def main() -> None:
|
||||
reports[0].fixes_applied.extend(f"archive: {m}" for m in moved)
|
||||
# --remote --all: smart pull+fix across all repos
|
||||
elif args.remote and args.all:
|
||||
reports = fix_all_remote(args.api_base, no_writeback=no_wb)
|
||||
with run_lock("consistency-remote-all") as acquired:
|
||||
if not acquired:
|
||||
print(
|
||||
"SKIP: another fix-consistency-remote --all run is already active",
|
||||
file=sys.stderr,
|
||||
)
|
||||
sys.exit(0)
|
||||
reports = fix_all_remote(
|
||||
args.api_base,
|
||||
no_writeback=no_wb,
|
||||
max_seconds=args.max_seconds,
|
||||
)
|
||||
if not reports:
|
||||
sys.exit(0)
|
||||
else:
|
||||
@@ -1915,6 +1977,8 @@ def main() -> None:
|
||||
|
||||
any_fail = any(r.failures for r in reports)
|
||||
any_warn = any(r.warnings for r in reports)
|
||||
if args.remote and args.all and not any_fail:
|
||||
sys.exit(0)
|
||||
sys.exit(1 if any_fail else 2 if any_warn else 0)
|
||||
|
||||
|
||||
|
||||
@@ -104,6 +104,10 @@ ExecStart=… consistency_check.py --remote --all
|
||||
2. Skips repos that are already clean (no issues, not behind, not ahead)
|
||||
3. For repos needing action: `git pull --ff-only` first, then `fix_repo()` (which ends with T04 push)
|
||||
|
||||
It also holds `/tmp/custodian-consistency-remote-all.lock` for the duration of
|
||||
the sweep and defaults to a 300-second wall-clock budget. These guards keep a
|
||||
slow or stalled sweep from overlapping with the next 15-minute timer activation.
|
||||
|
||||
Previously `--all --fix` was used, which skipped the pull step and the clean-repo skip logic.
|
||||
|
||||
---
|
||||
|
||||
Reference in New Issue
Block a user