hf-train-frontend / hardware_config.json
George-API's picture
Upload folder using huggingface_hub
a57357b verified
raw
history blame
1.31 kB
{
"hardware_name": "2xA10G",
"specs": {
"gpu_count": 2,
"gpu_type": "A10G",
"vram_per_gpu": 24,
"total_vram": 48,
"vcpu_count": 24,
"ram": 92
},
"training_optimizations": {
"per_device_batch_size": 16,
"gradient_accumulation_steps": 4,
"effective_batch_size": 128,
"memory_optimizations": {
"use_gradient_checkpointing": true,
"pin_memory": true,
"num_workers": 2
},
"distributed_settings": {
"device_map": "auto",
"ddp_find_unused_parameters": false
}
},
"memory_breakdown": {
"model_size": "~3.5GB (pre-quantized 4-bit)",
"optimizer_states": "~1GB",
"batch_memory_per_gpu": "~2GB",
"peak_memory_estimate": "18-20GB per GPU",
"safe_headroom": "4-6GB per GPU (of 24GB)"
},
"compute_environment": "A10G_CLOUD",
"distributed_type": "DATA_PARALLEL",
"mixed_precision": "bf16",
"num_gpus": 2,
"training_parameters": {
"per_device_train_batch_size": 16,
"gradient_accumulation_steps": 4,
"dataloader_num_workers": 2,
"dataloader_pin_memory": true,
"gradient_checkpointing": true,
"max_grad_norm": 1.0
},
"memory_optimization": {
"offload_to_cpu": false,
"use_flash_attention": true,
"use_gradient_checkpointing": true
}
}