Spaces:
Runtime error
Runtime error
File size: 814 Bytes
678c431 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 |
#!/bin/bash
# Distributed training launch script for Phi-4 training.
# Uses torchrun to launch multi-GPU training of run_transformers_training.py.
#
# Usage: launch.sh [NUM_GPUS]
#   NUM_GPUS - number of GPU processes to spawn (default: 4)
#
# Exits non-zero if torchrun is missing or training fails, propagating
# torchrun's exit code so CI/pipelines can detect failure.
set -u

# Number of GPUs to use (first positional arg; defaults to 4).
NUM_GPUS=${1:-4}

# Fail fast with install guidance if torchrun is not on PATH.
if ! command -v torchrun >/dev/null 2>&1; then
  echo "torchrun command not found. Make sure PyTorch is installed properly." >&2
  echo "Try: pip install torch>=2.0.0" >&2
  exit 1
fi

echo "Launching distributed training with $NUM_GPUS GPUs..."

# Launch the distributed training. Capture torchrun's exit status into a
# variable immediately: the original read $? inside the else branch, where it
# had already been overwritten by the [ ] test and always printed "1".
rc=0
torchrun --nproc_per_node="$NUM_GPUS" \
  --master_port=29500 \
  run_transformers_training.py \
  --config transformers_config.json || rc=$?

# Report the outcome using the saved status.
if [ "$rc" -eq 0 ]; then
  echo "Distributed training completed successfully!"
else
  echo "Distributed training failed with exit code $rc" >&2
fi

# Propagate torchrun's exit code (the original always exited 0, hiding
# failures from callers such as CI runners).
exit "$rc"