Vast.ai Data Handling
Overview
Manage training data and model artifacts securely on Vast.ai GPU instances. Covers data transfer, encryption, checkpoint management, and cleanup. Critical consideration: Vast.ai instances run on shared hardware operated by third-party hosts.
Prerequisites
- Vast.ai instance with SSH access
- Cloud storage (S3, GCS) for persistent artifacts
- Understanding of data sensitivity classification
Instructions
Step 1: Data Transfer Patterns
# Small datasets (<5GB): Direct SCP
scp -P $PORT -r ./data/ root@$HOST:/workspace/data/
# Large datasets (5-50GB): Compressed transfer
tar czf - ./data/ | ssh -p $PORT root@$HOST "tar xzf - -C /workspace/"
# Very large datasets (>50GB): Cloud storage staging
# Upload to S3/GCS first, then download on instance
ssh -p $PORT root@$HOST "aws s3 sync s3://bucket/dataset/ /workspace/data/"
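For scripted workflows, the size-based choice above can be wrapped in a small helper. The sketch below is illustrative only: transfer_dataset is a hypothetical name, the thresholds and /workspace paths simply mirror the tiers shown above, and host, port, and bucket are placeholders for your instance and storage.
import subprocess
from pathlib import Path

def transfer_dataset(local_dir, host, port, bucket=None):
    """Hypothetical helper: pick a transfer method based on total dataset size."""
    size_gb = sum(p.stat().st_size for p in Path(local_dir).rglob("*") if p.is_file()) / 1e9
    if size_gb < 5:
        # Small: direct SCP
        subprocess.run(["scp", "-P", str(port), "-r", local_dir,
                        f"root@{host}:/workspace/data/"], check=True)
    elif size_gb < 50:
        # Medium: stream a compressed tarball over SSH
        subprocess.run(
            f"tar czf - {local_dir} | ssh -p {port} root@{host} 'tar xzf - -C /workspace/'",
            shell=True, check=True)
    else:
        # Large: stage through cloud storage, then pull on the instance
        subprocess.run(["aws", "s3", "sync", local_dir, f"s3://{bucket}/dataset/"], check=True)
        subprocess.run(["ssh", "-p", str(port), f"root@{host}",
                        f"aws s3 sync s3://{bucket}/dataset/ /workspace/data/"], check=True)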
Step 2: Encrypted Data Transfer
import subprocess, os

def encrypt_and_upload(local_path, host, port, remote_path, passphrase):
    """Encrypt data before transferring to a Vast.ai instance."""
    encrypted = f"{local_path}.enc"
    # Encrypt with AES-256
    subprocess.run([
        "openssl", "enc", "-aes-256-cbc", "-salt", "-pbkdf2",
        "-in", local_path, "-out", encrypted,
        "-pass", f"pass:{passphrase}",
    ], check=True)
    # Transfer encrypted file
    subprocess.run([
        "scp", "-P", str(port), encrypted,
        f"root@{host}:{remote_path}.enc",
    ], check=True)
    # Decrypt on instance
    subprocess.run([
        "ssh", "-p", str(port), f"root@{host}",
        f"openssl enc -aes-256-cbc -d -pbkdf2 "
        f"-in {remote_path}.enc -out {remote_path} "
        f"-pass pass:{passphrase} && rm {remote_path}.enc",
    ], check=True)
    os.remove(encrypted)
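A minimal usage sketch for the function above, assuming an instance reachable at a placeholder address and the passphrase supplied via an environment variable (DATA_PASSPHRASE is a hypothetical name):
import os

encrypt_and_upload(
    local_path="./data/train.jsonl",            # placeholder dataset file
    host="203.0.113.10",                        # instance IP from the Vast.ai console
    port=41234,                                 # SSH port shown for the instance
    remote_path="/workspace/data/train.jsonl",
    passphrase=os.environ["DATA_PASSPHRASE"],   # avoid hard-coding secrets
)
Note that passing the passphrase with pass: makes it visible in the remote host's process list while openssl runs; openssl's -pass env: or -pass file: options avoid putting the secret on the command line.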
Step 3: Checkpoint to Cloud Storage
import torch, boto3, os

class CloudCheckpointManager:
    def __init__(self, s3_bucket, prefix, save_every=500):
        self.s3 = boto3.client("s3")
        self.bucket = s3_bucket
        self.prefix = prefix
        self.save_every = save_every

    def save(self, model, optimizer, step, loss):
        if step % self.save_every != 0:
            return
        local_path = f"/tmp/ckpt-{step}.pt"
        torch.save({
            "step": step, "loss": loss,
            "model": model.state_dict(),
            "optimizer": optimizer.state_dict(),
        }, local_path)
        self.s3.upload_file(local_path, self.bucket,
                            f"{self.prefix}/ckpt-{step}.pt")