#!/bin/bash
# Distributed training launch script for Phi-4 training
# This script uses torchrun to launch multi-GPU training
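#
# Usage (a sketch; the filename launch_distributed.sh is illustrative, not
# taken from the repository):
#   ./launch_distributed.sh 8   # train on 8 GPUs
#   ./launch_distributed.sh     # train on the default of 4 GPUs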

# Set the number of GPUs to use (defaults to 4 if not passed as the first argument)
NUM_GPUS=${1:-4}
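# Note: the default above is hardcoded to 4. As an alternative (a sketch,
# assuming nvidia-smi is available on the host), the count could be detected
# automatically from the attached GPUs:
#   NUM_GPUS=${1:-$(nvidia-smi --list-gpus | wc -l)}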

# Check if torchrun is available
if ! command -v torchrun &> /dev/null; then
    echo "torchrun command not found. Make sure PyTorch is installed properly."
    echo "Try: pip install torch>=2.0.0"
    exit 1
fi

echo "Launching distributed training with $NUM_GPUS GPUs..."

# Launch the distributed training
torchrun --nproc_per_node="$NUM_GPUS" \
    --master_port=29500 \
    run_transformers_training.py \
    --config transformers_config.json
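
# Note: for multi-node training, torchrun also accepts rendezvous flags.
# A sketch (NUM_NODES, NODE_RANK, and MASTER_ADDR are placeholders you would
# set per node; they are not defined in this script):
#   torchrun --nnodes=$NUM_NODES --node_rank=$NODE_RANK \
#       --master_addr=$MASTER_ADDR --master_port=29500 \
#       --nproc_per_node=$NUM_GPUS \
#       run_transformers_training.py --config transformers_config.json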

# Capture the exit status immediately; $? is overwritten by the [ test itself,
# so reading it again inside the else branch would report the wrong code.
EXIT_CODE=$?
if [ $EXIT_CODE -eq 0 ]; then
    echo "Distributed training completed successfully!"
else
    echo "Distributed training failed with exit code $EXIT_CODE"
fi

# Propagate the training result so callers (e.g. CI) see failures.
exit $EXIT_CODE