feat(consistency): T04 push seal — closed-loop writeback for automated commits

Root cause of the 501-commit pile-up in inter-hub: fix_repo() created
git commits (brief updates, T03 writebacks) but never pushed them, so
the 15-minute timer accumulated local commits indefinitely. Once real
development landed on remote the repos diverged with no self-healing path.

Changes
-------
repo_sync.py (new module)
  Extracts all git lifecycle primitives: pull_ff, push_ff,
  count_remote_ahead (C-16 input), count_local_ahead (C-17/T04 input).
  Module docstring documents the push-seal invariant and stable state.

consistency_check.py
  - Imports primitives from repo_sync; thin _detect_behind_remote wrapper
    preserves backward compat for existing callers and tests.
  - C-17 backlog guard: if local has unpushed commits from a prior failed
    push, retry before making more; skip all writes if push still fails.
  - T04 push seal: unconditional push_ff() at end of every fix_repo() run.
  - _report_needs_action: ahead_of_remote param so repos with unpushed
    backlogs are not silently skipped as "clean" by fix_all_remote().
  - Domain-slug fallback: brief no longer degrades to "(unknown)" when all
    workplans are completed — falls back to any workstream for domain context.
  - Service switched from --all --fix to --remote --all (pulls before
    fixing, skips already-clean repos).

push-seal.md (new)
  Capability documentation: the problem, the invariant, all three checks
  (C-16/C-17/T04), stable-state description, API reference, and test map.

test_repo_sync.py (new, 32 tests)
  Full coverage of all four primitives via real git repos (tmp_path).
  Includes C-17 scenario, push-seal invariant, and four end-to-end
  loop-stability tests.

test_consistency_check.py
  Four new _report_needs_action cases for the ahead_of_remote parameter.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-04-21 01:43:40 +02:00
parent 34acc1cdcf
commit 6cbf2d2c56
5 changed files with 797 additions and 46 deletions

View File

@@ -21,6 +21,7 @@ Checks:
C-14 ghost-duplicate WARN No Active topic workstream with no repo_id matches a file-backed title — probable ghost from premature create_workstream() call
C-15 task-db-ahead WARN Yes DB task status is ahead of file — regression prevented; writeback syncs file
C-16 repo-behind-remote WARN No Local repo is behind remote tracking branch — --fix skipped to avoid clobbering remote progress
C-17 repo-ahead-push-failed WARN No Local repo has unpushed commits and push failed — writes skipped to prevent runaway divergence
Usage:
python scripts/consistency_check.py --repo SLUG [--fix] [--no-writeback] [--json] [--api-base URL]
@@ -826,52 +827,26 @@ def _check_ghost_duplicates(
# ---------------------------------------------------------------------------
# Git helpers (T02 pull gate, T03 writeback)
# Git sync (T02T04: pull gate, writeback, push seal) — see repo_sync.py
# ---------------------------------------------------------------------------
# repo_sync.py owns the push-seal invariant and all git lifecycle primitives.
# The aliases below keep internal call sites and existing tests unchanged.
def _git_pull(repo_path: str) -> tuple[bool, str]:
"""Run ``git pull --ff-only`` on *repo_path*.
Returns ``(success, message)`` where *message* describes the outcome.
Never raises — errors are returned as ``(False, "<reason>")``.
"""
try:
result = subprocess.run(
["git", "-C", repo_path, "pull", "--ff-only"],
capture_output=True, text=True, timeout=30,
)
if result.returncode == 0:
out = result.stdout.strip()
return True, out or "already up to date"
return False, result.stderr.strip() or "pull failed"
except Exception as exc:
return False, str(exc)
from repo_sync import ( # noqa: E402 (import after top-level imports intentional)
count_local_ahead as _detect_ahead_of_remote,
count_remote_ahead as _count_remote_ahead,
pull_ff as _git_pull,
push_ff as _git_push,
)
def _detect_behind_remote(repo_path: str) -> bool:
"""Return True if the remote tracking branch has commits the local repo lacks.
"""True if remote has commits the local repo lacks (C-16 predicate).
"Ahead" (local has unpushed commits) is NOT considered behind.
"Diverged" is treated as behind (remote progress could be lost).
Best-effort: returns False on any error (offline, no remote, etc.) so that
check-only mode is never blocked by network issues.
Delegates to repo_sync.count_remote_ahead, which fetches before counting.
Returns False on any error so C-16 is never spuriously triggered.
"""
try:
subprocess.run(
["git", "-C", repo_path, "fetch", "--quiet", "origin"],
capture_output=True, timeout=15,
)
# Count commits in remote that are not in local.
# git rev-list HEAD..@{u} → commits remote has that local lacks.
result = subprocess.run(
["git", "-C", repo_path, "rev-list", "--count", "HEAD..@{u}"],
capture_output=True, text=True, timeout=5,
)
if result.returncode != 0:
return False # no upstream configured or other error
return int(result.stdout.strip() or "0") > 0
except Exception:
return False
return _count_remote_ahead(repo_path) > 0
def _patch_task_status_in_file(
@@ -970,10 +945,15 @@ def _write_custodian_brief(api_base: str, repo_slug: str, repo_path: str) -> boo
repo_id: str = repo.get("id", "")
domain_slug: str = ""
# Resolve domain slug via the topic linked to any workstream
# Resolve domain slug: prefer active workstreams, fall back to any workstream
# so that a fully-completed repo doesn't degrade to "(unknown)".
workstreams = _api_get(api_base, "/workstreams", {"repo_id": repo_id, "status": "active"}) or []
if isinstance(workstreams, list) and workstreams:
topic = _api_get(api_base, f"/topics/{workstreams[0].get('topic_id', '')}")
_ws_for_domain = workstreams if (isinstance(workstreams, list) and workstreams) else []
if not _ws_for_domain:
all_ws = _api_get(api_base, "/workstreams", {"repo_id": repo_id}) or []
_ws_for_domain = all_ws if isinstance(all_ws, list) else []
if _ws_for_domain:
topic = _api_get(api_base, f"/topics/{_ws_for_domain[0].get('topic_id', '')}")
if topic:
domain_slug = topic.get("domain_slug", "")
@@ -1128,6 +1108,28 @@ def fix_repo(
)
return report
# C-17: backlog guard — if local has unpushed commits from a prior failed push,
# try to push them before making more. Skipping writes prevents runaway divergence.
if repo_path:
ahead = _detect_ahead_of_remote(repo_path)
if ahead > 0:
push_ok, push_msg = _git_push(repo_path)
if not push_ok:
report.add(
severity="WARN", check_id="C-17",
message=(
f"Repo '{repo_slug}' has {ahead} unpushed commit(s) and push "
f"failed ({push_msg}) — skipping writes to prevent runaway divergence"
),
fixable=False,
)
report.fixes_applied.append(
"C-17: all write operations skipped — unpushed commits, push failed"
)
return report
# Push succeeded — local is now in sync; proceed normally
report.fixes_applied.append(f"C-17 cleared: pushed {ahead} backlogged commit(s)")
fixable = [i for i in report.issues if i.fixable]
for issue in fixable:
@@ -1339,6 +1341,16 @@ def fix_repo(
if brief_written:
report.fixes_applied.append("brief: .custodian-brief.md updated")
# Push all commits made this run (writebacks + brief) to close the loop.
# This ensures the next run starts with local == remote, making the
# timer idempotent and preventing commit pile-up.
if repo_path:
push_ok, push_msg = _git_push(repo_path)
if push_ok:
report.fixes_applied.append(f"push: {push_msg}")
else:
report.fixes_applied.append(f"push WARN: {push_msg}")
return report
@@ -1348,15 +1360,18 @@ def fix_repo(
_BACKGROUND_CHECKS: frozenset[str] = frozenset({"C-08"})
def _report_needs_action(report: ConsistencyReport, behind_remote: bool) -> bool:
def _report_needs_action(
report: ConsistencyReport, behind_remote: bool, ahead_of_remote: int = 0
) -> bool:
"""Return True if the repo warrants a pull+fix cycle.
A repo is considered clean (no action needed) when:
- It is not behind its remote tracking branch, AND
- It has no unpushed local commits (from a prior failed push), AND
- It has no FAIL issues, AND
- Every WARN/INFO issue is in the background-noise set (C-08).
"""
if behind_remote:
if behind_remote or ahead_of_remote > 0:
return True
if report.failures:
return True
@@ -1399,11 +1414,14 @@ def fix_all_remote(
skipped_missing.append(slug)
continue
# Read-only pass: detect issues and remote staleness
# Read-only pass: detect issues and remote staleness.
# _detect_behind_remote does a fetch, so _detect_ahead_of_remote
# after it sees an up-to-date tracking branch.
pre_report = check_repo(api_base, slug)
behind = _detect_behind_remote(path)
ahead = _detect_ahead_of_remote(path)
if not _report_needs_action(pre_report, behind):
if not _report_needs_action(pre_report, behind, ahead):
skipped_clean.append(slug)
continue

156
scripts/push-seal.md Normal file
View File

@@ -0,0 +1,156 @@
# Push Seal (T04)
**Module:** `repo_sync.py`
**Check IDs:** C-16 (pull gate), C-17 (backlog guard), T04 (push seal)
**Tests:** `tests/test_repo_sync.py`
---
## The problem it solves
The consistency engine auto-commits two kinds of changes to managed repos:
- **Writebacks (T03):** task status patched into workplan files when the DB is ahead of the file
- **Brief updates:** `.custodian-brief.md` regenerated when workstream progress changes
Before T04, these commits were created but never pushed. The `custodian-sync.service` timer fires every 15 minutes. Each firing that detected any change would create a new commit. Over time — especially when a repo had no active workstreams and the brief kept oscillating — hundreds of auto-commits piled up locally while the remote received none. When real development commits landed on the remote, the repo became diverged and unrecoverable without manual intervention.
**Root causes:**
1. Commits were made without a corresponding push — an open loop.
2. C-16 (pull gate) only fires when remote is *ahead* of local, not when local is ahead. Once the first auto-commit ran, local was already ahead and C-16 never fired again.
3. The `.custodian-brief.md` domain-slug lookup produced `(unknown)` for fully-completed repos (no active workstreams → no topic lookup → empty string), causing the brief to be considered "changed" on every run.
---
## The invariant
> Every `fix_repo()` run that creates commits must push them to remote before returning, so the next run starts with `local ≡ remote`.
When this holds, the timer loop is **idempotent**: a clean repo with no unpushed commits is detected as clean and skipped entirely — no git activity, no divergence.
---
## Three interlocking checks
### C-16 — pull gate (T02, pre-existing)
Fires when `count_remote_ahead(repo_path) > 0`.
Remote has commits local lacks. Writing more commits on top would lose remote progress. All write operations are skipped; the `--remote` flag in the service pulls before fixing.
### C-17 — backlog guard (new)
Fires when `count_local_ahead(repo_path) > 0` AND `push_ff()` fails.
Local has unpushed commits from a prior run where the push failed. Before making more commits this run, the guard attempts to push the backlog. If push fails (e.g. the repo has diverged), all write operations for this run are skipped. This prevents the commit count from growing further during an already-broken state.
If push succeeds, the guard clears itself (`"C-17 cleared: pushed N backlogged commit(s)"`) and the run proceeds normally.
### T04 — push seal (new)
At the end of every `fix_repo()` run, after all writebacks and the brief update, `push_ff()` is called unconditionally.
```
fix_repo()
├── C-16 check → skip all writes if behind remote
├── C-17 guard → retry backlog push; skip all writes if push still fails
├── apply fixable issues (C-04, C-05, C-09, C-10, C-11, C-12, C-13, C-15)
├── _write_custodian_brief() → commit if content changed
└── push_ff() ← T04: seal the loop
```
A push on a repo with no new commits is a no-op that returns success. So T04 adds no overhead for clean runs.
---
## Stable state
Once T04 is in effect, the timer converges to this stable state:
```
repo clean AND local ≡ remote
→ _report_needs_action(report, behind=0, ahead=0) → False
→ repo skipped (logged as CLEAN)
→ no git activity
→ state unchanged on next fire
```
The only way to leave stable state is:
- A real fix is needed (task drift, brief content change, etc.) → run completes with a push → returns to stable state
- Remote receives new commits → C-16 pulls → returns to stable state
- Push fails (network, diverge) → C-17 fires next run → retries push → returns to stable state or escalates to a visible WARN
---
## Domain-slug fallback (related fix)
The brief regeneration was also fixed to avoid oscillating between `inter_hub` and `(unknown)` for repos with no active workstreams.
**Before:** domain slug was resolved only via active workstreams. A repo with all workplans completed had no active workstreams → topic lookup returned nothing → `domain_slug = ""` → brief rendered `(unknown)` → brief was considered "changed" on every run → spurious commit every 15 minutes.
**After:** if no active workstreams exist, the lookup falls back to any workstream (completed or archived) on the same repo. Domain context is preserved permanently.
---
## Service configuration
```ini
# /home/worsch/.config/systemd/user/custodian-sync.service
ExecStart=… consistency_check.py --remote --all
```
`--remote --all` uses `fix_all_remote()`, which:
1. Runs `check_repo()` + `count_remote_ahead()` + `count_local_ahead()` for each repo
2. Skips repos that are already clean (no issues, not behind, not ahead)
3. For repos needing action: `git pull --ff-only` first, then `fix_repo()` (which ends with T04 push)
Previously `--all --fix` was used, which skipped the pull step and the clean-repo skip logic.
---
## Public API (`repo_sync.py`)
| Function | Returns | Purpose |
|---|---|---|
| `pull_ff(repo_path)` | `(bool, str)` | `git pull --ff-only`; T02 pull gate |
| `push_ff(repo_path)` | `(bool, str)` | `git push`; T04 push seal |
| `count_remote_ahead(repo_path)` | `int` | commits remote has that local lacks; C-16 input |
| `count_local_ahead(repo_path)` | `int` | commits local has that remote lacks; C-17 / T04 input |
All functions are **best-effort**: errors return `0` / `(False, "…")` rather than raising. The consistency engine must never be blocked by transient network issues.
Note: `push_ff` does not pass `--ff-only` to git — that flag applies to `git pull`, not `git push`. `git push` already rejects non-fast-forward updates by default. The function name signals the semantic guarantee (no force-push), not a CLI flag.
---
## Compatibility aliases in `consistency_check.py`
The original inline git functions have been replaced by imports from `repo_sync`, with thin wrappers for backward compatibility:
```python
from repo_sync import count_local_ahead, count_remote_ahead, pull_ff, push_ff
_detect_ahead_of_remote = count_local_ahead
_git_pull = pull_ff
_git_push = push_ff
def _detect_behind_remote(repo_path: str) -> bool:
return count_remote_ahead(repo_path) > 0
```
Existing code and tests that import `_detect_behind_remote` or `_git_pull` from `consistency_check` continue to work unchanged.
---
## Test coverage (`tests/test_repo_sync.py`)
All tests use real git repos via `tmp_path` — no mocks, no subprocess patches. A `git_pair` fixture provides a bare remote + one clone with an initial pushed commit. A `_make_second_clone` helper creates a second worker clone for divergence scenarios.
| Class | What it covers |
|---|---|
| `TestErrorResilience` | All four functions return safe defaults for non-git paths, non-existent paths, and repos with no upstream |
| `TestCountLocalAhead` | 0 when in sync; N after N local commits; 0 after push |
| `TestCountRemoteAhead` | 0 when in sync; 0 when local is ahead (not behind); N when remote has N new commits; nonzero when diverged |
| `TestPushFf` | Success when ahead; no-op success when in sync; push-seal invariant (ahead=0 after push); idempotent; fails when diverged; C-17 scenario (diverged push rejection does not grow the backlog) |
| `TestPullFf` | Up-to-date no-op; pulls new remote commits; resolves behind state to 0; fails when diverged; fails with no remote |
| `TestPushSealLoop` | Full fix-run cycle leaves repo clean; multiple writebacks sealed in one push; no-commit run needs no push; runaway prevention on failed push |

118
scripts/repo_sync.py Normal file
View File

@@ -0,0 +1,118 @@
"""repo_sync.py — T04 Push Seal: closed-loop git sync for consistency runs.
The **push-seal invariant**
---------------------------
Every ``fix_repo()`` run that creates commits must push them to remote before
returning. This guarantees the next run starts with ``local ≡ remote``,
making the 15-minute timer idempotent and preventing the open-loop commit
pile-up that occurs when automated writebacks accumulate without being pushed.
Three checks enforce the invariant end-to-end:
C-16 repo-behind-remote local is behind remote; skip writes, pull first
C-17 repo-ahead-push-failed local has unpushed commits and push failed; skip writes
T04 push-seal at end of every fix run, push all commits made this run
Stable state the timer converges to
-------------------------------------
repo clean AND local ≡ remote
→ ``_report_needs_action`` returns False
→ repo skipped entirely
→ no git activity, no divergence
Public API
----------
pull_ff(repo_path) git pull --ff-only → (ok: bool, message: str)
push_ff(repo_path) git push --ff-only → (ok: bool, message: str)
count_remote_ahead(path) commits remote has that local lacks (C-16 input)
count_local_ahead(path) commits local has that remote lacks (C-17 / T04 input)
All functions are **best-effort**: errors return 0 / (False, "") rather than
raising, so the consistency engine is never blocked by transient network issues.
The caller decides whether a 0 / False result is a problem.
"""
from __future__ import annotations
import subprocess
def pull_ff(repo_path: str) -> tuple[bool, str]:
"""Run ``git pull --ff-only`` on *repo_path*.
Returns ``(success, message)`` describing the outcome. Never raises.
"""
try:
result = subprocess.run(
["git", "-C", repo_path, "pull", "--ff-only"],
capture_output=True, text=True, timeout=30,
)
if result.returncode == 0:
return True, result.stdout.strip() or "already up to date"
return False, result.stderr.strip() or "pull failed"
except Exception as exc:
return False, str(exc)
def push_ff(repo_path: str) -> tuple[bool, str]:
"""Run ``git push`` on *repo_path*.
``git push`` already rejects non-fast-forward updates by default (the
remote refuses them), so no ``--force`` flag is needed or used. The
function name signals the *semantic* guarantee — we never force-push —
not a literal CLI flag.
Returns ``(success, message)`` describing the outcome. Never raises.
"""
try:
result = subprocess.run(
["git", "-C", repo_path, "push"],
capture_output=True, text=True, timeout=30,
)
if result.returncode == 0:
return True, result.stdout.strip() or "pushed"
return False, result.stderr.strip() or "push failed"
except Exception as exc:
return False, str(exc)
def count_remote_ahead(repo_path: str) -> int:
"""Return how many commits remote has that local lacks (the *behind* count).
Runs ``git fetch --quiet origin`` first so the result reflects the actual
current remote state, not a stale tracking branch. Returns 0 on any
error (network down, no upstream configured, etc.) so C-16 is never
spuriously triggered by connectivity issues.
"""
try:
subprocess.run(
["git", "-C", repo_path, "fetch", "--quiet", "origin"],
capture_output=True, timeout=15,
)
result = subprocess.run(
["git", "-C", repo_path, "rev-list", "--count", "HEAD..@{u}"],
capture_output=True, text=True, timeout=5,
)
if result.returncode != 0:
return 0
return int(result.stdout.strip() or "0")
except Exception:
return 0
def count_local_ahead(repo_path: str) -> int:
"""Return how many commits local has that remote lacks (the *ahead* count).
Uses the already-fetched tracking branch — call after ``count_remote_ahead``
(which runs fetch) for the most accurate result, or accept a one-run lag.
Returns 0 on any error so C-17 is never spuriously triggered.
"""
try:
result = subprocess.run(
["git", "-C", repo_path, "rev-list", "--count", "@{u}..HEAD"],
capture_output=True, text=True, timeout=5,
)
if result.returncode != 0:
return 0
return int(result.stdout.strip() or "0")
except Exception:
return 0