# state-hub/scripts/pull_image.py

#!/usr/bin/env python3
"""
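Pull a Docker Hub image via the registry v2 API using Python's SSL (OpenSSL)
and write it to a tarball that can be imported with `docker load`. The
download bypasses Docker's Go TLS client entirely. For multi-arch images the
linux/amd64 variant is selected.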

Usage: python pull_image.py <image:tag> [output.tar]
  e.g: python pull_image.py postgres:16-alpine postgres.tar
"""
import hashlib
import http.client
import json
import os
import sys
import tarfile
import tempfile
import time
import urllib.error
import urllib.request


def get_token(repo: str) -> str:
    """Request a pull-scoped bearer token for *repo* from Docker Hub's auth service.

    No credentials are sent, so the token is an anonymous one.

    Args:
        repo: Repository path such as ``library/postgres``.

    Returns:
        The value of the ``token`` field in the JSON response.
    """
    token_url = (
        "https://auth.docker.io/token"
        f"?service=registry.docker.io&scope=repository:{repo}:pull"
    )
    with urllib.request.urlopen(token_url, timeout=30) as response:
        payload = json.loads(response.read())
    return payload["token"]


class _StripAuthOnRedirect(urllib.request.HTTPRedirectHandler):
    """Redirect handler that re-issues the request without credentials.

    The follow-up request is built from scratch for the new URL, so the
    Authorization header (like every other header) is dropped. Only the
    headers named in ``_FORWARDED`` are copied over; Range is needed so that
    chunked downloads keep working after the redirect.
    """

    _FORWARDED = ("Range",)

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        redirected = urllib.request.Request(newurl)
        for name in self._FORWARDED:
            # Request stores header names in str.capitalize() form.
            value = req.get_header(name.capitalize())
            if not value:
                continue
            redirected.add_header(name, value)
        return redirected


def _opener():
    """Build a urllib opener whose redirect handling is _StripAuthOnRedirect."""
    redirect_handler = _StripAuthOnRedirect()
    return urllib.request.build_opener(redirect_handler)


def registry_get(url: str, token: str, headers: dict | None = None) -> bytes:
    """Fetch *url* with Bearer authentication and return the raw response body.

    The request goes through the opener from ``_opener()``, so any redirect
    (e.g. to S3/CDN blob storage) is followed without the Authorization header.

    Args:
        url: Registry URL to GET.
        token: Bearer token sent in the Authorization header.
        headers: Optional extra request headers; they are merged over the
            Authorization header.

    Returns:
        The complete response body.
    """
    request_headers = {"Authorization": f"Bearer {token}"}
    if headers:
        request_headers.update(headers)
    request = urllib.request.Request(url, headers=request_headers)
    with _opener().open(request, timeout=60) as response:
        body = response.read()
    return body


_REGISTRY_URL = "https://registry-1.docker.io"

# Single-image manifest formats this script knows how to unpack.
_IMAGE_MANIFEST_TYPES = (
    "application/vnd.docker.distribution.manifest.v2+json",
    "application/vnd.oci.image.manifest.v1+json",
)
# Multi-arch wrappers; advertised on the tag lookup so the registry can return
# the list/index itself and the platform is chosen here.
_MANIFEST_LIST_TYPES = (
    "application/vnd.docker.distribution.manifest.list.v2+json",
    "application/vnd.oci.image.index.v1+json",
)

# 2MB per Range request, so a corrupted transfer only costs one chunk.
_CHUNK_SIZE = 2 * 1024 * 1024
# Consecutive failures tolerated for a single chunk before giving up.
_MAX_CHUNK_ATTEMPTS = 10


def _verify_digest(expected: str, hasher, what: str) -> None:
    """Raise RuntimeError unless *hasher* matches the ``algo:hex`` digest *expected*."""
    actual = f"{hasher.name}:{hasher.hexdigest()}"
    if actual != expected:
        raise RuntimeError(
            f"{what}: digest mismatch (expected {expected}, got {actual})"
        )


def _fetch_range(url: str, token: str, start: int, end: int) -> bytes:
    """GET bytes ``start..end`` (inclusive) of *url* and return a non-empty body."""
    request = urllib.request.Request(
        url,
        headers={
            "Authorization": f"Bearer {token}",
            "Range": f"bytes={start}-{end}",
        },
    )
    with _opener().open(request, timeout=60) as resp:
        body = resp.read()
        if resp.status == 200:
            # The server ignored Range and sent the blob from byte 0; keep
            # only the part that has not been written yet.
            body = body[start:]
    if not body:
        # An empty body would make no progress; treat it as a failed attempt.
        raise OSError("empty response body")
    return body


def _download_layer(repo_name: str, digest: str, size: int, token: str,
                    dest_path: str) -> str:
    """Download one layer blob to *dest_path* in Range-request chunks.

    Each chunk is retried up to ``_MAX_CHUNK_ATTEMPTS`` times; a 401 response
    triggers a token refresh instead of a sleep. The finished file is checked
    against *digest*.

    Returns:
        The bearer token to use for subsequent requests (refreshed if needed).

    Raises:
        RuntimeError: a chunk kept failing, or the digest/size did not match.
    """
    blob_url = f"{_REGISTRY_URL}/v2/{repo_name}/blobs/{digest}"
    hasher = hashlib.new(digest.partition(":")[0])
    downloaded = 0
    with open(dest_path, "wb") as out:
        while downloaded < size:
            end = min(downloaded + _CHUNK_SIZE, size) - 1
            failures = 0
            while True:
                try:
                    data = _fetch_range(blob_url, token, downloaded, end)
                    break
                except (OSError, http.client.HTTPException) as exc:
                    failures += 1
                    if failures >= _MAX_CHUNK_ATTEMPTS:
                        print()
                        raise RuntimeError(
                            f"Giving up on layer {digest} at byte {downloaded} "
                            f"after {failures} failed attempts: {exc}"
                        ) from exc
                    print(f"\r  retry at {downloaded//1024//1024}MB ({exc})...", end="", flush=True)
                    if isinstance(exc, urllib.error.HTTPError) and exc.code == 401:
                        # Registry tokens are short-lived; get a fresh one.
                        token = get_token(repo_name)
                    else:
                        time.sleep(1)
            out.write(data)
            hasher.update(data)
            downloaded += len(data)
            pct = downloaded * 100 // size if size else 0
            print(f"\r  {downloaded//1024//1024}MB / {size//1024//1024}MB ({pct}%)", end="", flush=True)
    print()
    if downloaded != size:
        raise RuntimeError(
            f"layer {digest}: expected {size} bytes, received {downloaded}"
        )
    _verify_digest(digest, hasher, f"layer {digest}")
    return token


def pull_image(image: str, output_tar: str) -> None:
    """Download *image* from Docker Hub and write a `docker load` tarball.

    The manifest is fetched for the given tag (``latest`` when omitted); for a
    multi-arch manifest list / OCI index the linux/amd64 entry is used. The
    config blob and every layer are downloaded, checked against their
    registry digests, and bundled with a ``manifest.json`` into *output_tar*.

    Args:
        image: ``name[:tag]`` or ``namespace/name[:tag]``; a bare name is
            looked up under ``library/``.
        output_tar: Path of the tar archive to create.

    Raises:
        RuntimeError: no linux/amd64 manifest exists, a layer chunk could not
            be downloaded, or a downloaded blob failed digest verification.
    """
    if ":" in image:
        repo_name, tag = image.rsplit(":", 1)
    else:
        repo_name, tag = image, "latest"

    if "/" not in repo_name:
        repo_name = f"library/{repo_name}"

    print(f"Authenticating for {repo_name}:{tag} ...")
    token = get_token(repo_name)

    print("Fetching manifest ...")
    manifest = json.loads(registry_get(
        f"{_REGISTRY_URL}/v2/{repo_name}/manifests/{tag}",
        token,
        headers={"Accept": ",".join(_IMAGE_MANIFEST_TYPES + _MANIFEST_LIST_TYPES)},
    ))

    # Manifest list / OCI index (multi-arch): pick linux/amd64.
    if "manifests" in manifest:
        print("Manifest list detected — selecting linux/amd64 ...")
        for entry in manifest["manifests"]:
            plat = entry.get("platform") or {}
            if plat.get("os") == "linux" and plat.get("architecture") == "amd64":
                manifest = json.loads(registry_get(
                    f"{_REGISTRY_URL}/v2/{repo_name}/manifests/{entry['digest']}",
                    token,
                    headers={"Accept": ",".join(_IMAGE_MANIFEST_TYPES)},
                ))
                break
        else:
            raise RuntimeError(
                f"No linux/amd64 manifest found for {repo_name}:{tag}"
            )

    config_digest = manifest["config"]["digest"]
    layers = manifest["layers"]

    with tempfile.TemporaryDirectory() as tmpdir:
        print("Downloading config ...")
        config_data = registry_get(
            f"{_REGISTRY_URL}/v2/{repo_name}/blobs/{config_digest}",
            token,
        )
        _verify_digest(
            config_digest,
            hashlib.new(config_digest.partition(":")[0], config_data),
            "config blob",
        )
        # File names use only the hex part of the digest (no "algo:" prefix).
        config_filename = config_digest.partition(":")[2] + ".json"
        with open(os.path.join(tmpdir, config_filename), "wb") as f:
            f.write(config_data)

        layer_dir_names = []
        for i, layer in enumerate(layers):
            digest = layer["digest"]
            size = layer["size"]
            short = digest[7:19]
            print(f"Downloading layer {i+1}/{len(layers)} ({short}..., {size//1024//1024}MB) ...")

            dir_name = f"layer_{i}"
            layer_dir = os.path.join(tmpdir, dir_name)
            os.makedirs(layer_dir)

            # The token may be refreshed mid-download; carry it forward.
            token = _download_layer(
                repo_name, digest, size, token,
                os.path.join(layer_dir, "layer.tar"),
            )

            with open(os.path.join(layer_dir, "VERSION"), "w", encoding="utf-8") as f:
                f.write("1.0")
            with open(os.path.join(layer_dir, "json"), "w", encoding="utf-8") as f:
                json.dump({"id": digest.partition(":")[2]}, f)

            layer_dir_names.append(dir_name)

        manifest_json = [
            {
                "Config": config_filename,
                # Official images are tagged without the implicit "library/" prefix.
                "RepoTags": [f"{repo_name.removeprefix('library/')}:{tag}"],
                "Layers": [f"{name}/layer.tar" for name in layer_dir_names],
            }
        ]
        with open(os.path.join(tmpdir, "manifest.json"), "w", encoding="utf-8") as f:
            json.dump(manifest_json, f)

        print(f"Building {output_tar} ...")
        with tarfile.open(output_tar, "w") as tar:
            for name in (config_filename, "manifest.json"):
                tar.add(os.path.join(tmpdir, name), arcname=name)
            for dir_name in layer_dir_names:
                for fname in ("layer.tar", "VERSION", "json"):
                    tar.add(
                        os.path.join(tmpdir, dir_name, fname),
                        arcname=f"{dir_name}/{fname}",
                    )

    print(f"Done. Load with: docker load -i {output_tar}")


if __name__ == "__main__":
    # The image reference is required; the output path is optional.
    if len(sys.argv) <= 1:
        print("Usage: pull_image.py <image:tag> [output.tar]")
        sys.exit(1)
    image = sys.argv[1]
    if len(sys.argv) >= 3:
        output = sys.argv[2]
    else:
        # Derive a default file name from the image reference by replacing
        # ':' and '/' with underscores.
        output = image.replace(":", "_").replace("/", "_") + ".tar"
    pull_image(image, output)