#!/bin/bash
# Distributed training launch script for Phi-4 training
# This script uses torchrun to launch multi-GPU training

# Set the number of GPUs to use (first positional argument; defaults to 4)
NUM_GPUS=${1:-4}

# Check if torchrun is available
if ! command -v torchrun &> /dev/null; then
    echo "torchrun command not found. Make sure PyTorch is installed properly."
    echo "Try: pip install \"torch>=2.0.0\""
    exit 1
fi

echo "Launching distributed training with $NUM_GPUS GPUs..."

# Launch the distributed training
torchrun --nproc_per_node="$NUM_GPUS" \
    --master_port=29500 \
    run_transformers_training.py \
    --config transformers_config.json

# Capture the exit status immediately, before any other command overwrites $?
EXIT_CODE=$?
if [ $EXIT_CODE -eq 0 ]; then
    echo "Distributed training completed successfully!"
else
    echo "Distributed training failed with exit code $EXIT_CODE"
fi

# Propagate the training exit status to the caller (useful for CI / job schedulers)
exit $EXIT_CODE
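
# Optional: to default to all available GPUs instead of a fixed 4, the
# NUM_GPUS assignment above could be replaced with a detection step like the
# sketch below. This assumes nvidia-smi is installed and on PATH; adjust the
# detection command for other accelerator setups.
#
#   DETECTED_GPUS=$(nvidia-smi --list-gpus | wc -l)
#   NUM_GPUS=${1:-$DETECTED_GPUS}
#
# Example invocations (substitute this file's actual name for ./launch.sh):
#   ./launch.sh        # use the default of 4 GPUs
#   ./launch.sh 8      # use 8 GPUs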