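# NOTE(review): "Spaces: Runtime error" -- this appears to be hosting-page status
# text captured together with the file; it is not part of the script itself.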
# Distributed training launch script for Phi-4 training.
# Uses torchrun to launch multi-GPU training on this node.
#
# Usage: sh <this-script> [NUM_GPUS]
#   NUM_GPUS  value passed to torchrun --nproc_per_node (default: 4)
#
# Exit status: 1 when torchrun is not installed, otherwise the exit status
# returned by torchrun itself.
#
# Only POSIX sh constructs are used, because no shebang is visible for this
# script and the interpreter therefore cannot be assumed to be bash.

set -u

# Number of processes to launch: first positional argument, 4 when omitted.
NUM_GPUS=${1:-4}

# Fail early, with an actionable hint, when torchrun is not on PATH.
if ! command -v torchrun >/dev/null 2>&1; then
  echo "torchrun command not found. Make sure PyTorch is installed properly." >&2
  # The requirement is quoted so that copy-pasting the hint does not turn
  # ">=2.0.0" into a shell output redirection.
  echo "Try: pip install 'torch>=2.0.0'" >&2
  exit 1
fi

echo "Launching distributed training with $NUM_GPUS GPUs..."

# Launch the distributed training.
torchrun --nproc_per_node="$NUM_GPUS" \
  --master_port=29500 \
  run_transformers_training.py \
  --config transformers_config.json
# Save the status right away: any later command, including the test in the
# `if` below, overwrites $?.
status=$?

if [ "$status" -eq 0 ]; then
  echo "Distributed training completed successfully!"
else
  echo "Distributed training failed with exit code $status" >&2
fi

# Propagate torchrun's result so callers (CI jobs, wrapper scripts) can
# detect a failed run instead of always seeing success.
exit "$status"