feat: snapshot/restore checkpoints (SAND-WP-0007)

Add workspace checkpoint API with SnapshotStore, extension hooks on
compose-ssh and saas-stub, manager orchestration, CLI/HTTP surface,
profile.compose-checkpoint, and docs/tests.
This commit is contained in:
2026-06-24 07:57:40 +02:00
parent 2760ef2373
commit 952cebf2e9
21 changed files with 966 additions and 34 deletions

View File

@@ -6,17 +6,20 @@ from sandboxer.extensions.registry import load_extension, resolve_backend
from sandboxer.lifecycle.state_hub import emit_lifecycle_event, event_type_for_state
from sandboxer.lifecycle.store import SandboxStore, utcnow
from sandboxer.models import (
Consumer,
MeterRecord,
Reachability,
SandboxCreateRequest,
SandboxState,
SandboxStatus,
SnapshotRecord,
)
from sandboxer.payments.credits import CreditsStore
from sandboxer.payments.metering import estimate_cost, settle_usage
from sandboxer.placement import resolve_host
from sandboxer.profiles.loader import load_profile
from sandboxer.routing.resolver import resolve_extension
from sandboxer.snapshots.store import SnapshotStore
from sandboxer.telemetry.export import export_telemetry
from sandboxer.telemetry.introspection import (
build_introspection_report,
@@ -30,9 +33,27 @@ class SandboxManager:
self,
store: SandboxStore | None = None,
credits: CreditsStore | None = None,
snapshots: SnapshotStore | None = None,
) -> None:
self.store = store or SandboxStore()
self.credits = credits or CreditsStore()
self.snapshots = snapshots or SnapshotStore()
@staticmethod
def _handle_from_status(status: SandboxStatus) -> dict[str, str]:
    """Build the backend handle dict for a stored sandbox status.

    The handle combines the sandbox id and host, the ``remote_dir`` and
    ``compose_project`` read from ``status.reachability`` (empty strings
    when no reachability is set), and a fixed set of connection-related
    entries read from ``status.inputs`` (empty string when a key is absent).
    """
    reach = status.reachability
    handle = {
        "sandbox_id": status.sandbox_id,
        "host": status.host or "",
        "remote_dir": reach.remote_dir if reach else "",
        "compose_project": reach.compose_project if reach else "",
    }
    # Entries copied verbatim from the recorded inputs, in handle order.
    for key in (
        "compose_file",
        "ssh_user",
        "compose_cmd",
        "ssh_port",
        "vm_target",
        "vm_host",
        "endpoint",
    ):
        handle[key] = status.inputs.get(key, "")
    return handle
def _resolved_host(self, profile, extension, host_override: str | None) -> str:
if extension.capabilities.pricing_model == "metered":
@@ -157,19 +178,7 @@ class SandboxManager:
self.store.save(status)
emit_lifecycle_event(status, event_type=event_type_for_state(status.state))
handle = {
"sandbox_id": status.sandbox_id,
"host": status.host or "",
"remote_dir": status.reachability.remote_dir if status.reachability else "",
"compose_project": status.reachability.compose_project if status.reachability else "",
"compose_file": status.inputs.get("compose_file", ""),
"ssh_user": status.inputs.get("ssh_user", ""),
"compose_cmd": status.inputs.get("compose_cmd", ""),
"ssh_port": status.inputs.get("ssh_port", ""),
"vm_target": status.inputs.get("vm_target", ""),
"vm_host": status.inputs.get("vm_host", ""),
"endpoint": status.inputs.get("endpoint", ""),
}
handle = self._handle_from_status(status)
backend.teardown(handle)
status.state = SandboxState.DESTROYED
@@ -218,4 +227,140 @@ class SandboxManager:
)
if existing.state != SandboxState.DESTROYED:
self.destroy(sandbox_id)
return self.create(request, host=existing.host)
return self.create(request, host=existing.host)
def snapshot(self, sandbox_id: str, *, name: str | None = None) -> SnapshotRecord:
    """Capture a checkpoint of a ready sandbox and persist its record.

    Args:
        sandbox_id: Identifier of the sandbox to snapshot.
        name: Optional label stored on the snapshot record.

    Returns:
        The ``SnapshotRecord`` that was saved to the snapshot store.

    Raises:
        KeyError: If no sandbox with ``sandbox_id`` is in the store.
        RuntimeError: If the sandbox is not in the READY state, or the
            extension backend reports that it does not support snapshots.
    """
    status = self.store.get(sandbox_id)
    if not status:
        raise KeyError(f"Sandbox not found: {sandbox_id}")
    if status.state != SandboxState.READY:
        raise RuntimeError(
            f"Sandbox must be ready to snapshot, got {status.state.value}"
        )

    extension = load_extension(status.extension_id)
    backend = resolve_backend(extension)
    if not backend.supports_snapshots():
        raise RuntimeError(f"Extension {extension.id} does not support snapshots")

    handle = self._handle_from_status(status)
    meta = backend.snapshot(handle)

    # The backend has already produced the snapshot at this point, so the
    # size parsing must never raise: tolerate an int, a decimal string, or
    # anything else (recorded as "unknown size").
    size_raw = meta.get("size_bytes", "")
    size_bytes: int | None = None
    if isinstance(size_raw, bool):
        # bool is an int subclass, but True/False is not a meaningful size.
        size_bytes = None
    elif isinstance(size_raw, int):
        size_bytes = size_raw if size_raw >= 0 else None
    elif isinstance(size_raw, str) and size_raw.isdecimal():
        # isdecimal() (unlike isdigit()) only accepts characters int() can parse.
        size_bytes = int(size_raw)

    record = SnapshotRecord(
        snapshot_id=meta["snapshot_id"],
        sandbox_id=sandbox_id,
        profile_id=status.profile_id,
        extension_id=status.extension_id,
        host=status.host or meta.get("host", ""),
        artifact_path=meta.get("artifact_path", ""),
        handle=handle,
        inputs=dict(status.inputs),  # copy so later status edits don't leak in
        consumer=status.consumer,
        name=name,
        size_bytes=size_bytes,
        created_at=utcnow(),
    )
    self.snapshots.save(record)
    emit_lifecycle_event(
        status,
        summary=f"Snapshot {record.snapshot_id} created from sandbox {sandbox_id}",
        event_type="milestone",
    )
    return record
def get_snapshot(self, snapshot_id: str) -> SnapshotRecord | None:
    """Look up a snapshot record by id in the snapshot store.

    Returns whatever the store's ``get`` yields for ``snapshot_id``.
    """
    found = self.snapshots.get(snapshot_id)
    return found
def list_snapshots(self, *, sandbox_id: str | None = None) -> list[SnapshotRecord]:
    """Return snapshot records ordered newest first by ``created_at``.

    Args:
        sandbox_id: When truthy, only records whose ``sandbox_id`` matches
            are returned; otherwise every stored record is returned.
    """

    def newest_first(records):
        return sorted(records, key=lambda rec: rec.created_at, reverse=True)

    everything = self.snapshots.list_all()
    if not sandbox_id:
        return newest_first(everything)
    return newest_first(rec for rec in everything if rec.sandbox_id == sandbox_id)
def restore(
    self,
    snapshot_id: str,
    *,
    host: str | None = None,
    consumer: Consumer | None = None,
) -> SandboxStatus:
    """Provision a new sandbox from a stored snapshot.

    Args:
        snapshot_id: Identifier of the snapshot record to restore from.
        host: Optional host override; falls back to the host stored on the
            snapshot, then to ``resolve_host(profile)``.
        consumer: Optional consumer override; falls back to the consumer
            stored on the snapshot.

    Returns:
        The saved ``SandboxStatus`` in the READY state.

    Raises:
        KeyError: If the snapshot id is not in the snapshot store.
        RuntimeError: If the extension backend does not support snapshots.
        ValueError: If neither the caller nor the snapshot supplies a consumer.
        Exception: Any error raised while restoring or waiting for readiness
            is re-raised after the status is marked FAILED.
    """
    record = self.snapshots.get(snapshot_id)
    if not record:
        raise KeyError(f"Snapshot not found: {snapshot_id}")

    profile = load_profile(record.profile_id)
    extension = load_extension(record.extension_id)
    backend = resolve_backend(extension)
    if not backend.supports_snapshots():
        raise RuntimeError(f"Extension {extension.id} does not support restore")

    # Explicit override first, then the snapshot's host, then placement.
    target_host = host or record.host or resolve_host(profile)

    owner = consumer or record.consumer
    if not owner:
        raise ValueError("consumer required for restore (not stored on snapshot)")

    started_at = utcnow()
    status = SandboxStatus(
        sandbox_id="pending",  # placeholder until the backend returns a handle
        profile_id=record.profile_id,
        extension_id=record.extension_id,
        state=SandboxState.REQUESTED,
        consumer=owner,
        host=target_host,
        inputs=dict(record.inputs),
        created_at=started_at,
        updated_at=started_at,
    )
    emit_lifecycle_event(status, event_type=event_type_for_state(status.state))

    status.state = SandboxState.PROVISIONING
    status.updated_at = utcnow()
    emit_lifecycle_event(status, event_type=event_type_for_state(status.state))

    # Entries from the stored handle are applied last, so they take
    # precedence over the three base keys when names collide.
    snapshot_meta = {
        "snapshot_id": record.snapshot_id,
        "artifact_path": record.artifact_path,
        "host": record.host,
    }
    snapshot_meta.update(record.handle)

    try:
        handle = backend.restore_from_snapshot(
            profile, snapshot_meta, record.inputs, target_host
        )
        status.sandbox_id = handle["sandbox_id"]
        # Mirror the connection details of the new handle into the inputs.
        for key in (
            "compose_file",
            "ssh_user",
            "compose_cmd",
            "ssh_port",
            "vm_target",
            "vm_host",
            "endpoint",
        ):
            status.inputs[key] = handle.get(key, "")
        status.inputs["restored_from"] = record.snapshot_id

        reach = backend.wait_ready(handle)
        status.reachability = Reachability(**reach)
        status.state = SandboxState.READY
        ready_time = utcnow()
        status.ready_at = ready_time
        status.updated_at = ready_time
        self.store.save(status)
        emit_lifecycle_event(
            status,
            summary=f"Sandbox restored from snapshot {snapshot_id}",
            event_type=event_type_for_state(status.state),
        )
        return status
    except Exception as exc:
        status.state = SandboxState.FAILED
        status.error = str(exc)
        status.updated_at = utcnow()
        # Only persist the failure once the backend has assigned a real id.
        if status.sandbox_id != "pending":
            self.store.save(status)
        emit_lifecycle_event(
            status,
            summary=f"Snapshot restore failed: {exc}",
            event_type=event_type_for_state(status.state),
        )
        raise