#!/usr/bin/env bash
# Launch autoregressive pretraining (sdlm.run_pretrain_ar) with HF Accelerate.
#
# Bug fixed: the command had been collapsed onto one line, turning each
# `\`-newline continuation into a mid-line `\ ` (an escaped space). Every one
# of those injected a bogus single-space positional argument into the command,
# breaking argument parsing. Restored to one-argument-per-line with real
# line continuations.
#
# NOTE(review): --max_seq_length 1 and --num_diffusion_steps 0 look like
# smoke-test values — confirm before using this for a real training run.
set -euo pipefail

accelerate launch \
  --mixed_precision bf16 \
  -m sdlm.run_pretrain_ar \
  `# --- batch / sequence ---` \
  --per_device_train_batch_size 1 \
  --per_device_eval_batch_size 1 \
  --max_seq_length 1 \
  --pad_to_max_length \
  --gradient_accumulation_steps 1 \
  `# --- run mode / logging ---` \
  --do_train \
  --do_eval \
  --log_level info \
  --evaluation_strategy steps \
  --report_to tensorboard \
  --logging_steps 50 \
  `# --- optimizer / schedule ---` \
  --lr_scheduler_type constant_with_warmup \
  --learning_rate 1e-5 \
  --max_steps 10000000 \
  --warmup_steps 5000 \
  --optim adamw_torch_fused \
  `# --- data ---` \
  --dataset_name emozilla/dolma-v1_7-305B \
  --streaming \
  --dataloader_num_workers 8 \
  --remove_unused_columns true \
  --dispatch_batches false \
  --shuffle true \
  --preprocessing_num_workers 16 \
  `# --- model / precision ---` \
  --model_name_or_path mistralai/Mistral-7B-v0.1 \
  --model_revision 26bca36bde8333b5d7f72e9ed20ccda6a618af24 \
  --bf16 \
  --gradient_checkpointing \
  --use_flash_attention2 \
  --ddp_find_unused_parameters false \
  --without_compute_metrics true \
  --tokenizer_padding_side "left" \
  --num_diffusion_steps 0 \
  `# --- eval / checkpointing ---` \
  --eval_steps 10 \
  --save_steps 50 \
  --save_total_limit 1 \
  --max_eval_samples 16 \
  --output_dir outputs/test \
  --overwrite_output_dir true