hf-train-frontend / run_distributed.sh
#!/bin/bash
# Distributed training launch script for Phi-4 training
# This script uses torchrun to launch multi-GPU training
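# Usage: ./run_distributed.sh [NUM_GPUS]   e.g. ./run_distributed.sh 2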
# Set the number of GPUs to use (defaults to 4 if no argument is given)
NUM_GPUS=${1:-4}
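# Optional: derive the GPU count from the machine instead of the fixed default.
# Sketch only; assumes nvidia-smi is on PATH. Uncomment to use.
# NUM_GPUS=${1:-$(nvidia-smi --list-gpus | wc -l)}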
# Check if torchrun is available
if ! command -v torchrun &> /dev/null; then
echo "torchrun command not found. Make sure PyTorch is installed properly."
echo "Try: pip install torch>=2.0.0"
exit 1
fi
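# Optional sanity check (a sketch; assumes `python` resolves to the training
# environment's interpreter and that PyTorch is importable): warn if fewer
# GPUs are visible than requested before launching.
AVAILABLE_GPUS=$(python -c "import torch; print(torch.cuda.device_count())" 2>/dev/null || echo 0)
if [ "$AVAILABLE_GPUS" -lt "$NUM_GPUS" ]; then
    echo "Warning: requested $NUM_GPUS GPUs but only $AVAILABLE_GPUS detected."
fi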
echo "Launching distributed training with $NUM_GPUS GPUs..."
# Launch the distributed training
torchrun --nproc_per_node="$NUM_GPUS" \
    --master_port=29500 \
    run_transformers_training.py \
    --config transformers_config.json
# Check exit status of the training run (captured immediately so later
# commands don't overwrite $?)
EXIT_CODE=$?
if [ $EXIT_CODE -eq 0 ]; then
    echo "Distributed training completed successfully!"
else
    echo "Distributed training failed with exit code $EXIT_CODE"
    exit $EXIT_CODE
fi