George-API committed (verified)
Commit adb15f9 · 1 Parent(s): 7e5a6ad

Upload folder using huggingface_hub

hardware_config.json CHANGED
@@ -1,56 +1,56 @@
(The removed and re-added contents of this file are identical in the diff; they are shown once below.)
{
    "hardware_name": "4xL4",
    "specs": {
        "gpu_count": 4,
        "gpu_type": "L4",
        "vram_per_gpu": 24,
        "total_vram": 96,
        "vcpu_count": 48,
        "ram": 186
    },
    "training_optimizations": {
        "per_device_batch_size": 24,
        "gradient_accumulation_steps": 2,
        "effective_batch_size": 192,
        "memory_optimizations": {
            "use_gradient_checkpointing": true,
            "pin_memory": true,
            "num_workers": 4,
            "use_flash_attention": true
        },
        "distributed_settings": {
            "device_map": "auto",
            "ddp_find_unused_parameters": false,
            "use_fsdp": true,
            "fsdp_config": {
                "sharding_strategy": "FULL_SHARD",
                "mixed_precision": "BF16",
                "activation_checkpointing": true
            }
        }
    },
    "memory_breakdown": {
        "model_size": "~3.5GB (pre-quantized 4-bit)",
        "optimizer_states": "~1GB",
        "batch_memory_per_gpu": "~3GB",
        "peak_memory_estimate": "~18GB",
        "safe_headroom": "~6GB"
    },
    "compute_environment": "L4_CLOUD",
    "distributed_type": "FSDP",
    "mixed_precision": "bf16",
    "num_gpus": 4,
    "training_parameters": {
        "per_device_train_batch_size": 24,
        "gradient_accumulation_steps": 2,
        "dataloader_num_workers": 4,
        "dataloader_pin_memory": true,
        "gradient_checkpointing": true,
        "max_grad_norm": 1.0
    },
    "memory_optimization": {
        "offload_to_cpu": false,
        "use_flash_attention": true,
        "use_gradient_checkpointing": true
    }
}
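For reference, run_transformers_training.py (below) consumes this file by overlaying the training_optimizations values onto the transformers training config. A minimal standalone sketch of that merge step, assuming both JSON files sit in the working directory (file and key names come from this commit; the helper function itself is illustrative, not part of the repo):

import json

def apply_hardware_overrides(transformers_path="transformers_config.json",
                             hardware_path="hardware_config.json"):
    """Overlay hardware-specific batch settings onto the training config (sketch)."""
    with open(transformers_path) as f:
        model_config = json.load(f)
    with open(hardware_path) as f:
        hw_config = json.load(f)

    training_opts = hw_config.get("training_optimizations", {})
    per_device = training_opts.get("per_device_batch_size")
    grad_accum = training_opts.get("gradient_accumulation_steps")

    # Mirrors the override logic shown in run_transformers_training.py's main()
    if per_device and model_config.get("training"):
        model_config["training"]["per_device_train_batch_size"] = per_device
    if grad_accum and model_config.get("training"):
        model_config["training"]["gradient_accumulation_steps"] = grad_accum
    return model_config

With the values above this yields per_device_train_batch_size=24 and gradient_accumulation_steps=2 before any later adjustments in the script.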
run_transformers_training.py CHANGED
@@ -39,6 +39,21 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
 
+# Set other loggers to WARNING to reduce noise and ensure our logs are visible
+logging.getLogger("transformers").setLevel(logging.WARNING)
+logging.getLogger("datasets").setLevel(logging.WARNING)
+logging.getLogger("accelerate").setLevel(logging.WARNING)
+logging.getLogger("torch").setLevel(logging.WARNING)
+logging.getLogger("bitsandbytes").setLevel(logging.WARNING)
+
+# Define a clean logging function for HF Space compatibility
+def log_info(message):
+    """Log information in a format compatible with Hugging Face Spaces"""
+    # Just use the logger, but ensure consistent formatting
+    logger.info(message)
+    # Also ensure output is flushed immediately for streaming
+    sys.stdout.flush()
+
 # Check for BitsAndBytes
 try:
     from transformers import BitsAndBytesConfig
@@ -142,6 +157,7 @@ def load_model_and_tokenizer(config):
         raise ValueError("Model name not found in configuration. Please check your transformers_config.json file.")
 
     logger.info("Using Unsloth optimizations with pre-quantized model")
+
     # Check for flash attention without importing it directly
     use_flash_attention = config.get("use_flash_attention", True)
     try:
@@ -153,11 +169,29 @@ def load_model_and_tokenizer(config):
 
     # First detect if we have a GPU
     if torch.cuda.is_available():
-        logger.info(f"CUDA available, found {torch.cuda.device_count()} GPU(s)")
-        device_map = "auto"
+        gpu_count = torch.cuda.device_count()
+        logger.info(f"CUDA available, found {gpu_count} GPU(s)")
+
+        # Log GPU info
+        for i in range(gpu_count):
+            logger.info(f"GPU {i}: {torch.cuda.get_device_name(i)}")
+            logger.info(f"Memory: {torch.cuda.get_device_properties(i).total_memory / 1024**3:.2f} GB")
+
+        # Create an optimized device map for better balance
+        if gpu_count > 1:
+            logger.info(f"Creating balanced device map for {gpu_count} GPUs")
+            # Use auto mapping but with memory tracking
+            device_map = "auto"
+            # Set max memory for better balancing
+            max_memory = {i: f"{int(torch.cuda.get_device_properties(i).total_memory * 0.85 / 1024**3)}GiB" for i in range(gpu_count)}
+            logger.info(f"Max memory settings: {max_memory}")
+        else:
+            device_map = "auto"
+            max_memory = None
     else:
         logger.warning("No CUDA available, falling back to CPU")
         device_map = {"": "cpu"}  # Force CPU placement
+        max_memory = None
 
     # Set default dtype for better numerics
     if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
@@ -175,11 +209,15 @@ def load_model_and_tokenizer(config):
 
     # Load model with proper error handling for out-of-memory
     try:
+        # Improved memory settings for multi-GPU setup
+        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+
         model, tokenizer = FastLanguageModel.from_pretrained(
             model_name=model_name,
             max_seq_length=config.get("max_seq_length", 2048) or config.get("tokenizer", {}).get("max_seq_length", 2048),
             dtype=dtype,
             device_map=device_map,
+            max_memory=max_memory,
             # Don't explicitly use flash attention config here, let Unsloth handle it
         )
     except RuntimeError as e:
@@ -355,7 +393,7 @@ def format_phi_chat(messages, dataset_config):
 
         role = message.get("role", "").lower()
         content = message.get("content", "")
-        
+
         # Format based on role
         if role == "human" or role == "user":
             template = roles.get("user", roles.get("human", "Human: {content}\n\n"))
@@ -413,9 +451,9 @@ class SimpleDataCollator:
                         return_tensors=None,
                         add_generation_prompt=False
                     )
-                except:
+                except Exception as chat_error:
                     # Fallback if apply_chat_template fails
-                    logger.warning(f"Chat template application failed for example {paper_id}, using basic tokenization")
+                    logger.warning(f"Chat template application failed for example {paper_id}: {str(chat_error)[:100]}")
 
                     # Create a basic representation of the conversation
                     conversation_text = ""
@@ -494,6 +532,70 @@ class SimpleDataCollator:
 
         return batch
 
+class LoggingCallback(TrainerCallback):
+    def __init__(self):
+        self.last_log_time = time.time()
+        self.last_memory_log_time = time.time()
+
+    def on_step_end(self, args, state, control, **kwargs):
+        # Log every 50 steps or every 5 minutes, whichever comes first
+        current_time = time.time()
+
+        # Log loss every 50 steps or 5 minutes
+        if (state.global_step % 50 == 0) or (current_time - self.last_log_time > 300):
+            if state.log_history:
+                loss = state.log_history[-1].get('loss', 'N/A')
+                # Use simple formatting for better HF Space log compatibility
+                log_info(f"Step {state.global_step}: Loss {loss}")
+            else:
+                log_info(f"Step {state.global_step}: No loss data available")
+            self.last_log_time = current_time
+
+        # Log memory usage every 15 minutes
+        if current_time - self.last_memory_log_time > 900:  # 15 minutes
+            if torch.cuda.is_available():
+                memory_info = []
+                for i in range(torch.cuda.device_count()):
+                    allocated = torch.cuda.memory_allocated(i) / 1024**2
+                    reserved = torch.cuda.memory_reserved(i) / 1024**2
+                    memory_info.append(f"GPU {i}: {allocated:.1f}MB/{reserved:.1f}MB")
+
+                # Log in compact format for better visibility
+                log_info(f"Memory usage - {', '.join(memory_info)}")
+            self.last_memory_log_time = current_time
+
+    def on_train_begin(self, args, state, control, **kwargs):
+        log_info("=== Training is starting ===")
+
+        # Log important training parameters for visibility
+        log_info(f"Batch size: {args.per_device_train_batch_size} × {args.gradient_accumulation_steps} steps × {max(1, torch.cuda.device_count())} GPUs")
+        log_info(f"Learning rate: {args.learning_rate}")
+        log_info(f"Epochs: {args.num_train_epochs}")
+
+        # Log memory information in compact format
+        if torch.cuda.is_available():
+            memory_info = []
+            for i in range(torch.cuda.device_count()):
+                allocated = torch.cuda.memory_allocated(i) / 1024**2
+                max_mem = torch.cuda.max_memory_allocated(i) / 1024**2
+                memory_info.append(f"GPU {i}: {allocated:.1f}MB (max: {max_mem:.1f}MB)")
+
+            log_info(f"Initial memory usage - {', '.join(memory_info)}")
+
+    def on_train_end(self, args, state, control, **kwargs):
+        log_info("=== Training completed ===")
+        if torch.cuda.is_available():
+            memory_info = []
+            for i in range(torch.cuda.device_count()):
+                allocated = torch.cuda.memory_allocated(i) / 1024**2
+                max_mem = torch.cuda.max_memory_allocated(i) / 1024**2
+                memory_info.append(f"GPU {i}: {allocated:.1f}MB (max: {max_mem:.1f}MB)")
+
+            log_info(f"Final memory usage - {', '.join(memory_info)}")
+
+        log_info(f"Total steps: {state.global_step}")
+        log_info(f"Final loss: {state.log_history[-1].get('loss', 'N/A') if state.log_history else 'N/A'}")
+
 def check_dependencies():
     """Check if all required dependencies are installed."""
     missing_packages = []
@@ -525,7 +627,7 @@ def check_dependencies():
 
 def main():
     # Set up logging
-    logger.info("Starting training process")
+    log_info("Starting Phi-4 fine-tuning process")
 
     # Parse arguments
     args = parse_args()
@@ -566,8 +668,9 @@ def main():
         logger.error("Please ensure 'name' is specified under 'model' in transformers_config.json")
         return 1
 
-    logger.info(f"Model name: {model_config.get('model', {}).get('name') or model_config.get('model_name_or_path') or model_config.get('model_name')}")
-    logger.info("All configurations loaded successfully")
+    model_name = model_config.get("model", {}).get("name") or model_config.get("model_name_or_path") or model_config.get("model_name")
+    log_info(f"Using model: {model_name}")
+    log_info("All configurations loaded successfully")
 
     # Extract specific configs
     model_config = configs["transformers"]
@@ -582,11 +685,11 @@ def main():
 
     if per_device_batch_size and model_config.get("training"):
         model_config["training"]["per_device_train_batch_size"] = per_device_batch_size
-        logger.info(f"Applied hardware-specific batch size: {per_device_batch_size}")
+        log_info(f"Applied hardware-specific batch size: {per_device_batch_size}")
 
     if gradient_accumulation and model_config.get("training"):
         model_config["training"]["gradient_accumulation_steps"] = gradient_accumulation
-        logger.info(f"Applied hardware-specific gradient accumulation: {gradient_accumulation}")
+        log_info(f"Applied hardware-specific gradient accumulation: {gradient_accumulation}")
 
     # Apply memory optimizations
     memory_opts = training_opts.get("memory_optimizations", {})
@@ -600,28 +703,39 @@ def main():
     # Set random seed for reproducibility
     seed = model_config.get("seed", 42)
     set_seed(seed)
-    logger.info(f"Set random seed to {seed}")
+    log_info(f"Set random seed to {seed} for reproducibility")
 
     # Check CUDA and set environment variables for better memory management
     if torch.cuda.is_available():
         # Empty CUDA cache
         torch.cuda.empty_cache()
-        # Set memory management env vars (optional)
-        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
-        # Log memory information
+
+        # Set memory management env vars for better fragmentation handling
+        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128,expandable_segments:True"
+
+        # Log initial memory information in a compact form
+        gpu_info = []
         for i in range(torch.cuda.device_count()):
-            logger.info(f"GPU {i}: {torch.cuda.get_device_name(i)}")
-            logger.info(f"Memory Allocated: {torch.cuda.memory_allocated(i) / 1024**2:.2f} MB")
-            logger.info(f"Memory Reserved: {torch.cuda.memory_reserved(i) / 1024**2:.2f} MB")
+            name = torch.cuda.get_device_name(i)
+            allocated = torch.cuda.memory_allocated(i) / 1024**3
+            total = torch.cuda.get_device_properties(i).total_memory / 1024**3
+            gpu_info.append(f"GPU {i}: {name} ({allocated:.1f}GB/{total:.1f}GB)")
+
+        log_info(f"Hardware: {torch.cuda.device_count()} GPUs detected")
+        log_info(f"GPU details: {', '.join(gpu_info)}")
+    else:
+        log_info("No GPU detected, using CPU (training will be very slow)")
 
     try:
+        log_info("Loading model and tokenizer...")
         model, tokenizer = load_model_and_tokenizer(model_config)
-        logger.info("Model and tokenizer loaded successfully")
+        log_info("Model and tokenizer loaded successfully")
 
         # Load dataset with proper mapping
         try:
+            log_info(f"Loading dataset from {dataset_config.get('dataset', {}).get('name', '')}")
             dataset = load_dataset_with_mapping(dataset_config)
-            logger.info("Dataset loaded and prepared successfully")
+            log_info(f"Dataset loaded with {len(dataset)} examples")
         except Exception as e:
             logger.error(f"Error loading dataset: {e}")
             return 1
@@ -629,38 +743,64 @@ def main():
         # Create data collator
        data_collator = SimpleDataCollator(tokenizer, dataset_config)
 
-        # Simple logging callback
-        class LoggingCallback(TrainerCallback):
-            def __init__(self):
-                self.last_log_time = time.time()
-
-            def on_step_end(self, args, state, control, **kwargs):
-                # Log every 50 steps or every 5 minutes, whichever comes first
-                current_time = time.time()
-                if (state.global_step % 50 == 0) or (current_time - self.last_log_time > 300):
-                    logger.info(f"Step {state.global_step}: Loss {state.log_history[-1]['loss'] if state.log_history else 'N/A'}")
-                    self.last_log_time = current_time
-
-            def on_train_begin(self, args, state, control, **kwargs):
-                logger.info("Training is starting...")
-                # Log memory information
-                if torch.cuda.is_available():
-                    for i in range(torch.cuda.device_count()):
-                        logger.info(f"GPU {i} Memory Allocated: {torch.cuda.memory_allocated(i) / 1024**2:.2f} MB")
-
         # Verify precision settings - ensure only one of bf16/fp16 is set, with bf16 taking precedence
         use_bf16 = model_config.get("bf16", False) or model_config.get("torch_dtype", "") == "bfloat16"
         use_fp16 = model_config.get("fp16", False) and not use_bf16  # Only use fp16 if bf16 is not set
 
-        logger.info(f"Using precision: {'bf16' if use_bf16 else 'fp16' if use_fp16 else 'full precision'}")
+        log_info(f"Using precision: {'bf16' if use_bf16 else 'fp16' if use_fp16 else 'full precision'}")
+
+        # Get per device batch size - temporarily reduce if necessary for multi-GPU setup
+        per_device_batch_size = model_config.get("training", {}).get("per_device_train_batch_size", 24)
+        gradient_accumulation_steps = model_config.get("training", {}).get("gradient_accumulation_steps", 2)
+
+        # For multi-GPU setup, adjust for better balance
+        if torch.cuda.device_count() > 1:
+            log_info(f"Multi-GPU setup with {torch.cuda.device_count()} GPUs")
+            log_info(f"Training config: {per_device_batch_size} samples/GPU × {gradient_accumulation_steps} accumulation steps")
+
+        # Set up FSDP for multi-GPU training if available
+        fsdp_config = None
+        if torch.cuda.device_count() > 1:
+            try:
+                from torch.distributed.fsdp import (
+                    FullyShardedDataParallel as FSDP,
+                    MixedPrecision,
+                    BackwardPrefetch,
+                    ShardingStrategy,
+                    CPUOffload,
+                )
+                from torch.distributed.fsdp.wrap import (
+                    transformer_auto_wrap_policy,
+                    enable_wrap,
+                    wrap,
+                )
+
+                log_info("Using FSDP for distributed training")
+
+                # Configure FSDP
+                fsdp_config = {
+                    "fsdp_transformer_layer_cls_to_wrap": ["LlamaDecoderLayer"],
+                    "fsdp_offload_params": False,
+                    "fsdp_backward_prefetch": "BACKWARD_PRE",
+                    "fsdp_min_num_params": 1e6,
+                    "fsdp_sharding_strategy": 1,  # FULL_SHARD
+                }
+
+                if use_bf16 or use_fp16:
+                    precision_type = "bf16" if use_bf16 else "fp16"
+                    fsdp_config["fsdp_state_dict_type"] = "FULL_STATE_DICT"
+                    log_info(f"FSDP using mixed precision: {precision_type}")
+            except ImportError:
+                log_info("FSDP imports failed, falling back to standard DDP")
+                fsdp_config = None
 
         # Set up training arguments
-        logger.info("Setting up training arguments")
+        log_info("Setting up training arguments")
         training_args = TrainingArguments(
             output_dir=model_config.get("output_dir", "./results") or model_config.get("checkpointing", {}).get("output_dir", "./results"),
             num_train_epochs=model_config.get("training", {}).get("num_train_epochs", 3),
-            per_device_train_batch_size=model_config.get("training", {}).get("per_device_train_batch_size", 24),
-            gradient_accumulation_steps=model_config.get("training", {}).get("gradient_accumulation_steps", 2),
+            per_device_train_batch_size=per_device_batch_size,
+            gradient_accumulation_steps=gradient_accumulation_steps,
             learning_rate=model_config.get("training", {}).get("learning_rate", 2e-5),
             weight_decay=model_config.get("training", {}).get("weight_decay", 0.01),
             warmup_ratio=model_config.get("training", {}).get("warmup_ratio", 0.05),
@@ -682,15 +822,16 @@ def main():
             optim=model_config.get("training", {}).get("optim", "adamw_torch"),
             ddp_find_unused_parameters=False,  # Improve distributed training efficiency
             dataloader_drop_last=False,  # Process all examples
-            dataloader_num_workers=4,  # Sequential data loading
+            dataloader_num_workers=2,  # Reduced worker count
             no_cuda=False if torch.cuda.is_available() else True,  # Use CUDA if available
+            fsdp=fsdp_config,  # Add FSDP configuration if available
         )
 
         # Create sequential sampler to maintain original dataset order
         sequential_sampler = torch.utils.data.SequentialSampler(dataset)
 
         # Initialize trainer first
-        logger.info("Initializing Trainer")
+        log_info("Initializing Trainer")
         trainer = Trainer(
             model=model,
             args=training_args,
@@ -702,7 +843,7 @@ def main():
         # Then override the get_train_dataloader method
         def custom_get_train_dataloader():
             """Custom dataloader that preserves original dataset order"""
-            logger.info("Creating sequential dataloader to maintain original dataset order")
+            log_info("Creating sequential dataloader to maintain original dataset order")
 
             # Calculate batch size based on device availability
             if getattr(training_args, "no_cuda", False):
@@ -710,7 +851,7 @@ def main():
             else:
                 batch_size = max(training_args.per_device_train_batch_size * max(1, torch.cuda.device_count()), 1)
 
-            logger.info(f"Using sequential sampler with batch size {batch_size}")
+            log_info(f"Using sequential sampler with batch size {batch_size}")
 
             # Return DataLoader with sequential sampler
             return torch.utils.data.DataLoader(
@@ -727,28 +868,44 @@ def main():
         trainer.get_train_dataloader = custom_get_train_dataloader
 
         # Start training
-        logger.info("Starting training process")
+        log_info("=== Starting Training ===")
        try:
+            # Empty cache again right before training
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+                log_info("Cleared CUDA cache before training")
+
+            # Display compact training info
+            total_steps = int(len(dataset) / (per_device_batch_size * torch.cuda.device_count() * gradient_accumulation_steps) * training_args.num_train_epochs)
+            log_info(f"Training plan: {len(dataset)} examples over {training_args.num_train_epochs} epochs ≈ {total_steps} steps")
+
             trainer.train()
-            logger.info("Training completed successfully")
+            log_info("Training completed successfully!")
 
             # Save the final model
-            logger.info("Saving final model")
+            log_info("Saving final model...")
             trainer.save_model()
+            log_info(f"Model saved to {training_args.output_dir}")
 
             # Push to hub if enabled
             if model_config.get("huggingface_hub", {}).get("push_to_hub", False):
-                logger.info("Pushing model to Hugging Face Hub")
+                hub_id = model_config.get("huggingface_hub", {}).get("hub_model_id", "model")
+                log_info(f"Pushing model to Hugging Face Hub as {hub_id}...")
                 trainer.push_to_hub()
+                log_info("Model successfully pushed to Hub")
 
             return 0
        except Exception as e:
            logger.error(f"Training failed with error: {str(e)}")
-            # Log CUDA memory info if available
+            # Log CUDA memory info if available in compact format
            if torch.cuda.is_available():
+                memory_info = []
                for i in range(torch.cuda.device_count()):
-                    logger.info(f"GPU {i} Memory Allocated: {torch.cuda.memory_allocated(i) / 1024**2:.2f} MB")
-                    logger.info(f"GPU {i} Memory Reserved: {torch.cuda.memory_reserved(i) / 1024**2:.2f} MB")
+                    allocated = torch.cuda.memory_allocated(i) / 1024**2
+                    reserved = torch.cuda.memory_reserved(i) / 1024**2
+                    max_mem = torch.cuda.max_memory_allocated(i) / 1024**2
+                    memory_info.append(f"GPU {i}: {allocated:.1f}MB/{reserved:.1f}MB (max: {max_mem:.1f}MB)")
+                logger.error(f"GPU memory at failure: {', '.join(memory_info)}")
            raise
 
    except Exception as e:
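The new log_info helper plus the WARNING-level caps are the core of the logging change above. A self-contained sketch of the same pattern, runnable on its own, showing that library INFO messages are suppressed while the script's own messages still stream (the logger names are the ones set in the diff; the demo messages at the end are illustrative only):

import logging
import sys

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("run_transformers_training")

# Cap noisy third-party loggers, as the diff does
for noisy in ("transformers", "datasets", "accelerate", "torch", "bitsandbytes"):
    logging.getLogger(noisy).setLevel(logging.WARNING)

def log_info(message):
    """Log and flush immediately so streamed HF Space logs appear without buffering delay."""
    logger.info(message)
    sys.stdout.flush()

log_info("visible: script-level info message")
logging.getLogger("transformers").info("suppressed: library info message")
logging.getLogger("transformers").warning("visible: library warning")

The diff does not show where the module-level LoggingCallback is attached to the Trainer; with the standard transformers API that would normally happen via the Trainer's callbacks argument.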
temp_function_fixes.py ADDED
@@ -0,0 +1,230 @@
+def format_phi_chat(messages, dataset_config):
+    """Format messages according to phi-4's chat template and dataset config."""
+    formatted_chat = ""
+
+    # Get role templates from config
+    roles = dataset_config.get("data_formatting", {}).get("roles", {
+        "system": "System: {content}\n\n",
+        "human": "Human: {content}\n\n",
+        "user": "Human: {content}\n\n",
+        "assistant": "Assistant: {content}\n\n"
+    })
+
+    # Handle research introduction metadata first
+    metadata = next((msg for msg in messages if isinstance(msg, dict) and
+                     "[RESEARCH INTRODUCTION]" in msg.get("content", "")), None)
+    if metadata:
+        system_template = roles.get("system", "System: {content}\n\n")
+        formatted_chat = system_template.format(content=metadata['content'])
+        messages = [msg for msg in messages if msg != metadata]
+
+    # Process remaining messages
+    for message in messages:
+        if not isinstance(message, dict) or "content" not in message:
+            logger.warning(f"Skipping invalid message format: {message}")
+            continue
+
+        role = message.get("role", "").lower()
+        content = message.get("content", "")
+
+        # Format based on role
+        if role == "human" or role == "user":
+            template = roles.get("user", roles.get("human", "Human: {content}\n\n"))
+            formatted_chat += template.format(content=content)
+        elif role == "assistant" or role == "bot":
+            template = roles.get("assistant", "Assistant: {content}\n\n")
+            formatted_chat += template.format(content=content)
+        elif role == "system":
+            # For system messages, prepend them
+            template = roles.get("system", "System: {content}\n\n")
+            formatted_chat = template.format(content=content) + formatted_chat
+        else:
+            # Default to system for unknown roles
+            logger.warning(f"Unknown role '{role}' - treating as system message")
+            template = roles.get("system", "System: {content}\n\n")
+            formatted_chat += template.format(content=content)
+
+    return formatted_chat.strip()
+
+class SimpleDataCollator:
+    def __init__(self, tokenizer, dataset_config):
+        self.tokenizer = tokenizer
+        self.dataset_config = dataset_config
+        self.stats = {"processed": 0, "skipped": 0, "total_tokens": 0}
+        self.pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
+        self.max_seq_length = dataset_config.get("dataset", {}).get("processing", {}).get("max_seq_length", 2048)
+        logger.info(f"SimpleDataCollator initialized - using pre-audited dataset with max_seq_length={self.max_seq_length}")
+        logger.info("Using exact dataset structure without reformatting")
+
+        # Check if we're on GPU
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        logger.info(f"SimpleDataCollator using device: {self.device}")
+
+    def __call__(self, features):
+        """Process examples preserving exact JSONL structure"""
+        batch = {"input_ids": [], "attention_mask": [], "labels": []}
+
+        for example in features:
+            try:
+                # Get ID
+                paper_id = example.get("id", "")
+
+                # Get conversations - these should already contain role and content
+                conversations = example.get("conversations", [])
+                if not conversations:
+                    self.stats["skipped"] += 1
+                    continue
+
+                # Directly use the conversations array as input to the model's chat template
+                # This preserves the exact structure with roles and content as they are
+                try:
+                    # Let tokenizer handle the content with the model's chat template
+                    inputs = self.tokenizer.apply_chat_template(
+                        conversations,
+                        return_tensors=None,
+                        add_generation_prompt=False
+                    )
+                except Exception as chat_error:
+                    # Fallback if apply_chat_template fails
+                    logger.warning(f"Chat template application failed for example {paper_id}: {str(chat_error)[:100]}")
+
+                    # Create a basic representation of the conversation
+                    conversation_text = ""
+                    for msg in conversations:
+                        if isinstance(msg, dict) and 'content' in msg:
+                            conversation_text += msg.get('content', '') + "\n\n"
+
+                    # Basic tokenization
+                    inputs = self.tokenizer(
+                        conversation_text,
+                        add_special_tokens=True,
+                        return_tensors=None
+                    )
+
+                # Apply length cap if needed (shouldn't be necessary for pre-audited data)
+                if self.max_seq_length > 0 and len(inputs) > self.max_seq_length:
+                    logger.warning(f"Example {paper_id} exceeds max_seq_length ({len(inputs)} > {self.max_seq_length})")
+                    inputs = inputs[:self.max_seq_length]
+
+                # Create attention mask (1 for all tokens)
+                attention_mask = [1] * len(inputs)
+
+                if len(inputs) > 0:
+                    # For causal language modeling, labels are the same as inputs
+                    labels = inputs.copy()
+
+                    batch["input_ids"].append(inputs)
+                    batch["attention_mask"].append(attention_mask)
+                    batch["labels"].append(labels)
+
+                    self.stats["processed"] += 1
+                    self.stats["total_tokens"] += len(inputs)
+
+                    # Debug logging for first few examples
+                    log_samples = self.dataset_config.get("validation", {}).get("log_samples", 3)
+                    if self.stats["processed"] <= log_samples:
+                        logger.info(f"Example {self.stats['processed']}:")
+                        logger.info(f"Paper ID: {paper_id}")
+                        logger.info(f"Token count: {len(inputs)}")
+                        logger.info(f"Conversation entries: {len(conversations)}")
+                else:
+                    self.stats["skipped"] += 1
+            except Exception as e:
+                logger.warning(f"Error processing example: {str(e)[:100]}...")
+                logger.warning(f"Problematic example ID: {example.get('id', 'unknown')}")
+                self.stats["skipped"] += 1
+                continue
+
+        if not batch["input_ids"]:
+            logger.warning("Empty batch, returning dummy tensors")
+            return {
+                "input_ids": torch.zeros((1, 1), dtype=torch.long),
+                "attention_mask": torch.zeros((1, 1), dtype=torch.long),
+                "labels": torch.zeros((1, 1), dtype=torch.long)
+            }
+
+        # Pad the batch
+        max_length = max(len(ids) for ids in batch["input_ids"])
+
+        for i in range(len(batch["input_ids"])):
+            padding_length = max_length - len(batch["input_ids"][i])
+            if padding_length > 0:
+                batch["input_ids"][i].extend([self.pad_token_id] * padding_length)
+                batch["attention_mask"][i].extend([0] * padding_length)
+                batch["labels"][i].extend([-100] * padding_length)
+
+        # Convert to tensors
+        batch = {k: torch.tensor(v, dtype=torch.long) for k, v in batch.items()}
+
+        # Log stats periodically
+        log_interval = self.dataset_config.get("validation", {}).get("log_interval", 100)
+        if self.stats["processed"] % log_interval == 0 and self.stats["processed"] > 0:
+            logger.info(f"Data collator stats: processed={self.stats['processed']}, "
+                        f"skipped={self.stats['skipped']}, "
+                        f"avg_tokens={self.stats['total_tokens']/self.stats['processed']:.1f}")
+
+        return batch
+
+class LoggingCallback(TrainerCallback):
+    def __init__(self):
+        self.last_log_time = time.time()
+        self.last_memory_log_time = time.time()
+
+    def on_step_end(self, args, state, control, **kwargs):
+        # Log every 50 steps or every 5 minutes, whichever comes first
+        current_time = time.time()
+
+        # Log loss every 50 steps or 5 minutes
+        if (state.global_step % 50 == 0) or (current_time - self.last_log_time > 300):
+            if state.log_history:
+                loss = state.log_history[-1].get('loss', 'N/A')
+                # Use simple formatting for better HF Space log compatibility
+                log_info(f"Step {state.global_step}: Loss {loss}")
+            else:
+                log_info(f"Step {state.global_step}: No loss data available")
+            self.last_log_time = current_time
+
+        # Log memory usage every 15 minutes
+        if current_time - self.last_memory_log_time > 900:  # 15 minutes
+            if torch.cuda.is_available():
+                memory_info = []
+                for i in range(torch.cuda.device_count()):
+                    allocated = torch.cuda.memory_allocated(i) / 1024**2
+                    reserved = torch.cuda.memory_reserved(i) / 1024**2
+                    memory_info.append(f"GPU {i}: {allocated:.1f}MB/{reserved:.1f}MB")
+
+                # Log in compact format for better visibility
+                log_info(f"Memory usage - {', '.join(memory_info)}")
+            self.last_memory_log_time = current_time
+
+    def on_train_begin(self, args, state, control, **kwargs):
+        log_info("=== Training is starting ===")
+
+        # Log important training parameters for visibility
+        log_info(f"Batch size: {args.per_device_train_batch_size} × {args.gradient_accumulation_steps} steps × {max(1, torch.cuda.device_count())} GPUs")
+        log_info(f"Learning rate: {args.learning_rate}")
+        log_info(f"Epochs: {args.num_train_epochs}")
+
+        # Log memory information in compact format
+        if torch.cuda.is_available():
+            memory_info = []
+            for i in range(torch.cuda.device_count()):
+                allocated = torch.cuda.memory_allocated(i) / 1024**2
+                max_mem = torch.cuda.max_memory_allocated(i) / 1024**2
+                memory_info.append(f"GPU {i}: {allocated:.1f}MB (max: {max_mem:.1f}MB)")
+
+            log_info(f"Initial memory usage - {', '.join(memory_info)}")
+
+    def on_train_end(self, args, state, control, **kwargs):
+        log_info("=== Training completed ===")
+        if torch.cuda.is_available():
+            memory_info = []
+            for i in range(torch.cuda.device_count()):
+                allocated = torch.cuda.memory_allocated(i) / 1024**2
+                max_mem = torch.cuda.max_memory_allocated(i) / 1024**2
+                memory_info.append(f"GPU {i}: {allocated:.1f}MB (max: {max_mem:.1f}MB)")
+
+            log_info(f"Final memory usage - {', '.join(memory_info)}")
+
+        log_info(f"Total steps: {state.global_step}")
+        log_info(f"Final loss: {state.log_history[-1].get('loss', 'N/A') if state.log_history else 'N/A'}")
transformers_config.json CHANGED
@@ -13,8 +13,8 @@
     },
 
     "training": {
-        "per_device_train_batch_size": 24,
-        "gradient_accumulation_steps": 2,
+        "per_device_train_batch_size": 16,
+        "gradient_accumulation_steps": 3,
         "learning_rate": 2e-5,
         "num_train_epochs": 3,
         "max_steps": -1,
@@ -67,7 +67,7 @@
             "offload_params": false
         },
         "ddp_find_unused_parameters": false,
-        "dataloader_num_workers": 4
+        "dataloader_num_workers": 2
     },
 
     "logging": {