George-API committed on
Commit
9f8478c
·
verified ·
1 Parent(s): 356ee13

Upload folder using huggingface_hub

Browse files
run_transformers_training.py CHANGED
@@ -7,6 +7,7 @@ import json
7
  import argparse
8
  import logging
9
  from datetime import datetime
 
10
 
11
  # Import Unsloth first, before other ML imports
12
  try:
@@ -618,61 +619,50 @@ def main():
618
  # Simple logging callback
619
  class LoggingCallback(TrainerCallback):
620
  def __init__(self):
621
- self.last_log_time = datetime.now()
622
- self.training_start_time = datetime.now()
623
 
624
  def on_step_end(self, args, state, control, **kwargs):
625
  # Log every 50 steps or every 5 minutes, whichever comes first
626
- current_time = datetime.now()
627
- time_diff = (current_time - self.last_log_time).total_seconds()
628
- elapsed_time = (current_time - self.training_start_time).total_seconds() / 60 # in minutes
629
-
630
- if state.global_step % 50 == 0 or time_diff > 300: # 300 seconds = 5 minutes
631
- loss = state.log_history[-1]['loss'] if state.log_history else 'N/A'
632
- lr = state.log_history[-1]['learning_rate'] if state.log_history else 'N/A'
633
-
634
- if isinstance(loss, float):
635
- loss_str = f"{loss:.4f}"
636
- else:
637
- loss_str = str(loss)
638
-
639
- if isinstance(lr, float):
640
- lr_str = f"{lr:.8f}"
641
- else:
642
- lr_str = str(lr)
643
-
644
- logger.info(f"Step: {state.global_step} | Loss: {loss_str} | LR: {lr_str} | Elapsed: {elapsed_time:.2f} min")
645
  self.last_log_time = current_time
646
 
 
 
 
 
 
 
647
  # Set up training arguments
648
  logger.info("Setting up training arguments")
649
  training_args = TrainingArguments(
650
- output_dir=model_config.get("output_dir", "./results"),
651
- num_train_epochs=model_config.get("num_train_epochs", 3),
652
- per_device_train_batch_size=model_config.get("per_device_train_batch_size", 4), # Use config value, can be > 1
653
- gradient_accumulation_steps=model_config.get("gradient_accumulation_steps", 8),
654
- learning_rate=model_config.get("learning_rate", 5e-5),
655
- weight_decay=model_config.get("weight_decay", 0.01),
656
- warmup_ratio=model_config.get("warmup_ratio", 0.1),
657
- lr_scheduler_type=model_config.get("lr_scheduler_type", "cosine"),
658
- logging_steps=model_config.get("logging_steps", 10),
659
- save_strategy=model_config.get("save_strategy", "steps"), # Updated to use steps by default
660
- save_steps=model_config.get("save_steps", 100), # Save every 100 steps by default
661
- save_total_limit=model_config.get("save_total_limit", 3), # Keep last 3 checkpoints
662
- fp16=model_config.get("fp16", True),
663
- bf16=model_config.get("bf16", False),
664
- max_grad_norm=model_config.get("max_grad_norm", 1.0),
665
- push_to_hub=model_config.get("push_to_hub", False),
666
- hub_model_id=model_config.get("hub_model_id", None),
667
  hub_token=os.environ.get("HF_TOKEN", None),
668
  report_to="tensorboard",
669
- remove_unused_columns=False, # Keep the conversations column
670
- gradient_checkpointing=model_config.get("gradient_checkpointing", True), # Enable gradient checkpointing
671
  dataloader_pin_memory=False, # Reduce memory usage
672
- optim=model_config.get("optim", "adamw_torch"),
673
  ddp_find_unused_parameters=False, # Improve distributed training efficiency
674
  dataloader_drop_last=False, # Process all examples
675
- dataloader_num_workers=0, # Sequential data loading
676
  )
677
 
678
  # Create a sequential sampler to ensure dataset is processed in order
 
7
  import argparse
8
  import logging
9
  from datetime import datetime
10
+ import time
11
 
12
  # Import Unsloth first, before other ML imports
13
  try:
 
619
  # Simple logging callback
620
  class LoggingCallback(TrainerCallback):
621
  def __init__(self):
622
+ self.last_log_time = time.time()
 
623
 
624
  def on_step_end(self, args, state, control, **kwargs):
625
  # Log every 50 steps or every 5 minutes, whichever comes first
626
+ current_time = time.time()
627
+ if (state.global_step % 50 == 0) or (current_time - self.last_log_time > 300):
628
+ logger.info(f"Step {state.global_step}: Loss {state.log_history[-1]['loss'] if state.log_history else 'N/A'}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
629
  self.last_log_time = current_time
630
 
631
+ # Verify precision settings - ensure only one of bf16/fp16 is set, with bf16 taking precedence
632
+ use_bf16 = model_config.get("bf16", False) or model_config.get("torch_dtype", "") == "bfloat16"
633
+ use_fp16 = model_config.get("fp16", False) and not use_bf16 # Only use fp16 if bf16 is not set
634
+
635
+ logger.info(f"Using precision: {'bf16' if use_bf16 else 'fp16' if use_fp16 else 'full precision'}")
636
+
637
  # Set up training arguments
638
  logger.info("Setting up training arguments")
639
  training_args = TrainingArguments(
640
+ output_dir=model_config.get("output_dir", "./results") or model_config.get("checkpointing", {}).get("output_dir", "./results"),
641
+ num_train_epochs=model_config.get("training", {}).get("num_train_epochs", 3),
642
+ per_device_train_batch_size=model_config.get("training", {}).get("per_device_train_batch_size", 24),
643
+ gradient_accumulation_steps=model_config.get("training", {}).get("gradient_accumulation_steps", 2),
644
+ learning_rate=model_config.get("training", {}).get("learning_rate", 2e-5),
645
+ weight_decay=model_config.get("training", {}).get("weight_decay", 0.01),
646
+ warmup_ratio=model_config.get("training", {}).get("warmup_ratio", 0.05),
647
+ lr_scheduler_type=model_config.get("training", {}).get("lr_scheduler_type", "cosine"),
648
+ logging_steps=model_config.get("training", {}).get("logging_steps", 10),
649
+ save_strategy=model_config.get("checkpointing", {}).get("save_strategy", "steps"),
650
+ save_steps=model_config.get("checkpointing", {}).get("save_steps", 100),
651
+ save_total_limit=model_config.get("checkpointing", {}).get("save_total_limit", 3),
652
+ fp16=use_fp16,
653
+ bf16=use_bf16,
654
+ max_grad_norm=model_config.get("training", {}).get("max_grad_norm", 1.0),
655
+ push_to_hub=model_config.get("huggingface_hub", {}).get("push_to_hub", False),
656
+ hub_model_id=model_config.get("huggingface_hub", {}).get("hub_model_id", None),
657
  hub_token=os.environ.get("HF_TOKEN", None),
658
  report_to="tensorboard",
659
+ remove_unused_columns=False, # Keep all columns
660
+ gradient_checkpointing=model_config.get("training", {}).get("gradient_checkpointing", True),
661
  dataloader_pin_memory=False, # Reduce memory usage
662
+ optim=model_config.get("training", {}).get("optim", "adamw_torch"),
663
  ddp_find_unused_parameters=False, # Improve distributed training efficiency
664
  dataloader_drop_last=False, # Process all examples
665
+ dataloader_num_workers=4, # Use 4 worker processes for parallel data loading
666
  )
667
 
668
  # Create a sequential sampler to ensure dataset is processed in order
transformers_config.json CHANGED
@@ -29,7 +29,9 @@
29
  "warmup_ratio": 0.05,
30
  "weight_decay": 0.01,
31
  "max_grad_norm": 1.0,
32
- "neftune_noise_alpha": 5
 
 
33
  },
34
 
35
  "checkpointing": {
@@ -83,5 +85,6 @@
83
  "model_revision": "main",
84
  "use_flash_attention": true,
85
  "torch_dtype": "bfloat16",
86
- "bf16": true
 
87
  }
 
29
  "warmup_ratio": 0.05,
30
  "weight_decay": 0.01,
31
  "max_grad_norm": 1.0,
32
+ "neftune_noise_alpha": 5,
33
+ "fp16": false,
34
+ "bf16": true
35
  },
36
 
37
  "checkpointing": {
 
85
  "model_revision": "main",
86
  "use_flash_attention": true,
87
  "torch_dtype": "bfloat16",
88
+ "bf16": true,
89
+ "fp16": false
90
  }