Add Railiance Stage 1 run command

2026-06-27 16:24:17 +02:00
parent f7c8670d19
commit f68e1b36da
7 changed files with 376 additions and 5 deletions
--- a/bin/railiance
+++ b/bin/railiance
@@ -17,6 +17,7 @@ Commands:
  cloudinit     Emit minimal cloud-init user-data
  init-repo     Idempotently furnish repo housekeeping
  create-overlay Scaffold a Railiance overlay repo for an upstream app
+  run           Run Stage 1 local validation from railiance/app.toml
  build-spore   Build a distributable "Spore" bundle
  seed-local    Run the seed script on this machine
  checklist     Pre-VM checklist
@@ -41,6 +42,7 @@ case "$cmd" in
  cloudinit) cat "$ROOT/cloudinit/user-data.yaml" ;;
  init-repo) bash "$ROOT/tools/furnish_railiance_repo.sh" ;;
  create-overlay) bash "$ROOT/tools/create_railiance_overlay_repo.sh" "$@" ;;
+  run) exec railiance-run "$@" ;;
  build-spore) bash "$ROOT/tools/build_spore.sh" ;;
  seed-local) bash "$ROOT/tools/seed_node.sh" ;;
  checklist)
--- a/docs/README.md
+++ b/docs/README.md
@@ -76,6 +76,7 @@ From two bare Linux servers, a Git repo, and valid credentials, you can rebuild
 - [Deployment lifecycle](deployment-lifecycle.md)
 - [Railiance app.toml contract](app-toml-contract.md)
 - [Railiance overlay repo pattern](overlay-repo-pattern.md)
+- [Railiance run command](railiance-run-command.md)

 ## 👥 Contributing

--- a/docs/railiance-run-command.md
+++ b/docs/railiance-run-command.md
@@ -0,0 +1,52 @@
+# Railiance Run Command
+
+`bin/railiance run` executes Stage 1 local validation for a repository that
+contains `railiance/app.toml`.
+
+The command is intentionally local and conservative:
+
+- reads `railiance/app.toml` using the `railiance.app.v1` contract;
+- runs `[stages.stage1].commands` from the app directory;
+- evaluates Stage 1 check ids listed in `[stages.stage1].checks` when they can
+  be checked locally;
+- emits a machine-readable `railiance.run-result.v1` JSON result;
+- records command references, exit codes, durations, and output byte counts,
+  but not shell text or command stdout/stderr content;
+- strips credentials, query strings, and fragments from URLs before reporting HTTP
+  check results.
+
+## Usage
+
+```bash
+bin/railiance run /path/to/app-or-overlay --pretty
+bin/railiance run . --json-out .railiance/stage1-result.json
+```
+
+The process exits `0` only when all Stage 1 commands and required checks pass.
+Optional checks may be skipped without failing the run. For example, an optional
+local health endpoint can be declared before a local server command exists.
+
+## Supported Local Checks
+
+- `command`: runs the check `run` command in the app directory.
+- `http`: calls the declared URL and compares the HTTP status.
+- `helm`: runs `helm template` when Helm is installed. Required Helm checks fail
+  if Helm is unavailable; optional Helm checks are skipped.
+
+Other check types are reported as skipped or failed depending on whether the
+check is required. Stage 2 and Stage 3 checks are never executed by
+`railiance run`.
+
+## Result Shape
+
+The JSON result includes:
+
+- app identity and source revision;
+- contract path and app directory;
+- command/check status summaries using contract references instead of raw shell
+  commands;
+- expected evidence labels from Stage 1;
+- timing and exit status metadata.
+
+The result is suitable for later promotion gates and State Hub progress notes,
+without embedding secrets or verbose logs.
--- a/tools/README_tools.md
+++ b/tools/README_tools.md
@@ -59,6 +59,10 @@ This model emphasizes:

 ---

+### `railiance-run`
+- Executes Stage 1 local validation from `railiance/app.toml`.
+- Emits a `railiance.run-result.v1` JSON result without command logs or secrets.
+
 ### `create_railiance_overlay_repo.sh`
 - Scaffolds a local Railiance overlay repo for a third-party upstream app.
 - Records upstream identity without vendoring upstream code.
--- a/tools/cmd/railiance-run
+++ b/tools/cmd/railiance-run
@@ -0,0 +1,301 @@
+#!/usr/bin/env python3
+"""Railiance Stage 1 local validation command."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import shutil
+import subprocess
+import sys
+import time
+import tomllib
+import urllib.error
+import urllib.request
+import urllib.parse
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+
+# Contract schema identifier that load_contract requires in app.toml's schema_version.
+SUPPORTED_SCHEMA = "railiance.app.v1"
+
+
+def utc_now() -> str:
+    return datetime.now(UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z")
+
+
+def load_contract(app_dir: Path) -> tuple[Path, dict[str, Any]]:
+    path = app_dir / "railiance" / "app.toml"
+    if not path.exists():
+        raise SystemExit(f"Missing Railiance contract: {path}")
+    with path.open("rb") as handle:
+        data = tomllib.load(handle)
+    if data.get("schema_version") != SUPPORTED_SCHEMA:
+        raise SystemExit(
+            f"Unsupported schema_version {data.get('schema_version')!r}; expected {SUPPORTED_SCHEMA}"
+        )
+    return path, data
+
+
+def command_result(
+    command: str, cwd: Path, timeout_seconds: int | None, command_ref: str
+) -> dict[str, Any]:
+    started = time.monotonic()
+    timeout = timeout_seconds or 900
+    try:
+        completed = subprocess.run(
+            command,
+            cwd=cwd,
+            shell=True,
+            text=True,
+            capture_output=True,
+            timeout=timeout,
+            check=False,
+        )
+        status = "passed" if completed.returncode == 0 else "failed"
+        return {
+            "command_ref": command_ref,
+            "status": status,
+            "exit_code": completed.returncode,
+            "duration_seconds": round(time.monotonic() - started, 3),
+            "stdout_bytes": len(completed.stdout.encode()),
+            "stderr_bytes": len(completed.stderr.encode()),
+        }
+    except subprocess.TimeoutExpired as exc:
+        return {
+            "command_ref": command_ref,
+            "status": "failed",
+            "exit_code": None,
+            "duration_seconds": round(time.monotonic() - started, 3),
+            "error": f"timeout after {timeout}s",
+            "stdout_bytes": len((exc.stdout or "").encode()) if isinstance(exc.stdout, str) else 0,
+            "stderr_bytes": len((exc.stderr or "").encode()) if isinstance(exc.stderr, str) else 0,
+        }
+
+
+def check_required(check: dict[str, Any]) -> bool:
+    return bool(check.get("required", True))
+
+
+def skipped(check: dict[str, Any], reason: str) -> dict[str, Any]:
+    required = check_required(check)
+    return {
+        "id": check.get("id"),
+        "type": check.get("type"),
+        "required": required,
+        "status": "failed" if required else "skipped",
+        "reason": reason,
+    }
+
+
+def scrub_url(url: str) -> str:
+    try:
+        parts = urllib.parse.urlsplit(url)
+    except ValueError:
+        return "<invalid-url>"
+    netloc = parts.netloc.rsplit("@", 1)[-1]
+    return urllib.parse.urlunsplit((parts.scheme, netloc, parts.path, "", ""))
+
+
+def run_http_check(check: dict[str, Any]) -> dict[str, Any]:
+    started = time.monotonic()
+    url = str(check.get("url", ""))
+    timeout = int(check.get("timeout_seconds", 10))
+    expected_status = int(check.get("expected_status", 200))
+    required = check_required(check)
+    try:
+        with urllib.request.urlopen(url, timeout=timeout) as response:
+            status_code = response.getcode()
+    except (urllib.error.URLError, TimeoutError, ValueError) as exc:
+        return {
+            "id": check.get("id"),
+            "type": "http",
+            "required": required,
+            "status": "failed" if required else "skipped",
+            "url": scrub_url(url),
+            "duration_seconds": round(time.monotonic() - started, 3),
+            "reason": str(exc),
+        }
+    status = "passed" if status_code == expected_status else "failed"
+    return {
+        "id": check.get("id"),
+        "type": "http",
+        "required": required,
+        "status": status if required or status == "passed" else "skipped",
+        "url": scrub_url(url),
+        "expected_status": expected_status,
+        "actual_status": status_code,
+        "duration_seconds": round(time.monotonic() - started, 3),
+    }
+
+
+def run_helm_check(check: dict[str, Any], app_dir: Path, release: str) -> dict[str, Any]:
+    if shutil.which("helm") is None:
+        return skipped(check, "helm is not installed")
+    chart = str(check.get("chart", ""))
+    values = str(check.get("values", ""))
+    mode = str(check.get("mode", "template"))
+    if mode not in {"template", "server-dry-run"}:
+        return skipped(check, f"unsupported helm mode for Stage 1: {mode}")
+    command = f"helm template {release} {chart}"
+    if values:
+        command += f" -f {values}"
+    result = command_result(
+        command, app_dir, int(check.get("timeout_seconds", 120)), f"checks.{check.get('id')}.helm"
+    )
+    return {
+        "id": check.get("id"),
+        "type": "helm",
+        "required": check_required(check),
+        "status": result["status"],
+        "mode": mode,
+        "command_ref": result.get("command_ref"),
+        "exit_code": result.get("exit_code"),
+        "duration_seconds": result.get("duration_seconds"),
+        "stdout_bytes": result.get("stdout_bytes"),
+        "stderr_bytes": result.get("stderr_bytes"),
+    }
+
+
+def run_check(check: dict[str, Any], app_dir: Path, release: str) -> dict[str, Any]:
+    check_type = check.get("type")
+    if check.get("stage") != "stage1":
+        return skipped(check, "not a Stage 1 check")
+    if check_type == "command":
+        command = str(check.get("run", ""))
+        if not command:
+            return skipped(check, "command check has no run field")
+        result = command_result(
+            command, app_dir, int(check.get("timeout_seconds", 900)), f"checks.{check.get('id')}.command"
+        )
+        return {
+            "id": check.get("id"),
+            "type": "command",
+            "required": check_required(check),
+            **result,
+        }
+    if check_type == "http":
+        return run_http_check(check)
+    if check_type == "helm":
+        return run_helm_check(check, app_dir, release)
+    if check_type == "manual":
+        return skipped(check, "manual check cannot be satisfied by railiance run")
+    return skipped(check, f"unsupported local check type: {check_type}")
+
+
+def required_failures(items: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    return [item for item in items if item.get("required", True) and item.get("status") != "passed"]
+
+
+def build_result(app_dir: Path, contract_path: Path, data: dict[str, Any]) -> dict[str, Any]:
+    stage = data.get("stages", {}).get("stage1", {})
+    if not stage.get("enabled", False):
+        raise SystemExit("Stage 1 is disabled in railiance/app.toml")
+
+    app = data.get("app", {})
+    source = data.get("source", {})
+    started_at = utc_now()
+    started_monotonic = time.monotonic()
+
+    stage_commands = list(stage.get("commands", []))
+    command_results = [
+        command_result(command, app_dir, None, f"stages.stage1.commands[{index}]")
+        for index, command in enumerate(stage_commands)
+    ]
+
+    check_ids = list(stage.get("checks", []))
+    all_checks = {check.get("id"): check for check in data.get("checks", [])}
+    check_results = []
+    for check_id in check_ids:
+        check = all_checks.get(check_id)
+        if check is None:
+            check_results.append(
+                {
+                    "id": check_id,
+                    "type": None,
+                    "required": True,
+                    "status": "failed",
+                    "reason": "check id is referenced by Stage 1 but not defined",
+                }
+            )
+            continue
+        check_results.append(run_check(check, app_dir, str(stage.get("release", app.get("id", "app")))))
+
+    command_failures = [item for item in command_results if item.get("status") != "passed"]
+    check_failures = required_failures(check_results)
+    status = "passed" if not command_failures and not check_failures else "failed"
+
+    return {
+        "schema_version": "railiance.run-result.v1",
+        "status": status,
+        "stage": "stage1",
+        "started_at": started_at,
+        "finished_at": utc_now(),
+        "duration_seconds": round(time.monotonic() - started_monotonic, 3),
+        "app": {
+            "id": app.get("id"),
+            "name": app.get("name"),
+            "repo": app.get("repo"),
+            "owner": app.get("owner"),
+            "criticality": app.get("criticality"),
+        },
+        "source": {
+            "revision": source.get("revision"),
+            "artifact": source.get("artifact"),
+            "digest_policy": source.get("digest_policy"),
+        },
+        "contract": str(contract_path),
+        "app_dir": str(app_dir),
+        "release": stage.get("release"),
+        "namespace": stage.get("namespace"),
+        "requires_approval": bool(stage.get("requires_approval", False)),
+        "evidence_expected": list(stage.get("evidence", [])),
+        "commands": command_results,
+        "checks": check_results,
+        "summary": {
+            "commands_total": len(command_results),
+            "commands_failed": len(command_failures),
+            "checks_total": len(check_results),
+            "required_checks_failed": len(check_failures),
+        },
+    }
+
+
+def parse_args(argv: list[str]) -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Run Railiance Stage 1 local validation from railiance/app.toml."
+    )
+    parser.add_argument(
+        "app_dir",
+        nargs="?",
+        default=".",
+        help="Application or overlay repository directory (default: current directory).",
+    )
+    parser.add_argument(
+        "--json-out",
+        help="Optional path to write the machine-readable run result.",
+    )
+    parser.add_argument(
+        "--pretty",
+        action="store_true",
+        help="Pretty-print JSON output to stdout.",
+    )
+    return parser.parse_args(argv)
+
+
+def main(argv: list[str]) -> int:
+    args = parse_args(argv)
+    app_dir = Path(args.app_dir).resolve()
+    contract_path, data = load_contract(app_dir)
+    result = build_result(app_dir, contract_path, data)
+    rendered = json.dumps(result, indent=2 if args.pretty else None, sort_keys=True)
+    print(rendered)
+    if args.json_out:
+        output_path = Path(args.json_out)
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        output_path.write_text(rendered + "\n", encoding="utf-8")
+    return 0 if result["status"] == "passed" else 1
+
+
+# Script entry point: main()'s return value becomes the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main(sys.argv[1:]))
--- a/tools/create_railiance_overlay_repo.sh
+++ b/tools/create_railiance_overlay_repo.sh
@@ -178,8 +178,8 @@ enabled = true
 namespace = "local"
 release = "${APP_ID}-local"
 commands = ["./tests/stage1.sh"]
-checks = ["helm-template", "local-health"]
-evidence = ["helm template success", "local health check or explicit not-run note"]
+checks = ["stage1-script", "local-health"]
+evidence = ["Stage 1 script result", "local health check or explicit not-run note"]
 requires_approval = false

 [stages.stage2]
@@ -204,12 +204,21 @@ requires_approval = true
 promotion_mode = "release-replace"
 previous_stable = "helm:${APP_ID}:previous"

+[[checks]]
+id = "stage1-script"
+type = "command"
+stage = "stage1"
+description = "Run generated Stage 1 validation script."
+required = true
+run = "./tests/stage1.sh"
+timeout_seconds = 300
+
 [[checks]]
 id = "helm-template"
 type = "helm"
 stage = "stage1"
-description = "Render Helm templates locally."
-required = true
+description = "Render Helm templates locally when Helm is available."
+required = false
 chart = "charts/${APP_ID}"
 values = "values/stage1.yaml"
 mode = "template"
--- a/workplans/RAIL-BS-WP-0006-staged-promotion-lifecycle.md
+++ b/workplans/RAIL-BS-WP-0006-staged-promotion-lifecycle.md
@@ -135,7 +135,7 @@ logic into the upstream repository.

 ```task
 id: RAIL-BS-WP-0006-T04
-status: todo
+status: done
 priority: high
 state_hub_task_id: "95c3311b-04bb-4c83-bda3-47958217b665"
 ```
@@ -152,6 +152,8 @@ Expected behavior:

 **Done when:** at least one representative app can complete Stage 1 locally.

+2026-06-27: Added `tools/cmd/railiance-run`, the `bin/railiance run` dispatcher entry, and `docs/railiance-run-command.md`. The command reads `railiance/app.toml`, runs Stage 1 commands and local checks, and emits `railiance.run-result.v1` JSON without command logs or secret values. Updated the overlay generator so a generated Forgejo overlay completes Stage 1 locally in this environment; Helm rendering is optional when Helm is unavailable.
+
 ---

 ### T05 - Canary Helm chart template