Upload folder using huggingface_hub

Files changed:
- run_transformers_training.py +32 -42
- transformers_config.json +5 -2
run_transformers_training.py
CHANGED
@@ -7,6 +7,7 @@ import json
 import argparse
 import logging
 from datetime import datetime
+import time
 
 # Import Unsloth first, before other ML imports
 try:

@@ -618,61 +619,50 @@ def main():
     # Simple logging callback
     class LoggingCallback(TrainerCallback):
         def __init__(self):
-            self.last_log_time = datetime.now()
-            self.training_start_time = datetime.now()
+            self.last_log_time = time.time()
 
         def on_step_end(self, args, state, control, **kwargs):
             # Log every 50 steps or every 5 minutes, whichever comes first
-            current_time = datetime.now()
-            time_diff = (current_time - self.last_log_time).total_seconds()
-            elapsed_time = (current_time - self.training_start_time).total_seconds() / 60
-
-            if state.global_step % 50 == 0 or time_diff > 300:  # 300 seconds = 5 minutes
-                loss = state.log_history[-1]['loss'] if state.log_history else 'N/A'
-                lr = state.log_history[-1]['learning_rate'] if state.log_history else 'N/A'
-
-                if isinstance(loss, float):
-                    loss_str = f"{loss:.4f}"
-                else:
-                    loss_str = str(loss)
-
-                if isinstance(lr, float):
-                    lr_str = f"{lr:.8f}"
-                else:
-                    lr_str = str(lr)
-
-                logger.info(f"Step: {state.global_step} | Loss: {loss_str} | LR: {lr_str} | Elapsed: {elapsed_time:.2f} min")
+            current_time = time.time()
+            if (state.global_step % 50 == 0) or (current_time - self.last_log_time > 300):
+                logger.info(f"Step {state.global_step}: Loss {state.log_history[-1]['loss'] if state.log_history else 'N/A'}")
                 self.last_log_time = current_time
 
+    # Verify precision settings - ensure only one of bf16/fp16 is set, with bf16 taking precedence
+    use_bf16 = model_config.get("bf16", False) or model_config.get("torch_dtype", "") == "bfloat16"
+    use_fp16 = model_config.get("fp16", False) and not use_bf16  # Only use fp16 if bf16 is not set
+
+    logger.info(f"Using precision: {'bf16' if use_bf16 else 'fp16' if use_fp16 else 'full precision'}")
+
     # Set up training arguments
     logger.info("Setting up training arguments")
     training_args = TrainingArguments(
-        output_dir=model_config.get("output_dir", "./results"),
-        num_train_epochs=model_config.get("num_train_epochs", 3),
-        per_device_train_batch_size=model_config.get("per_device_train_batch_size", 24),
-        gradient_accumulation_steps=model_config.get("gradient_accumulation_steps", 2),
-        learning_rate=model_config.get("learning_rate", 2e-5),
-        weight_decay=model_config.get("weight_decay", 0.01),
-        warmup_ratio=model_config.get("warmup_ratio", 0.05),
-        lr_scheduler_type=model_config.get("lr_scheduler_type", "cosine"),
-        logging_steps=model_config.get("logging_steps", 10),
-        save_strategy=model_config.get("save_strategy", "steps"),
-        save_steps=model_config.get("save_steps", 100),
-        save_total_limit=model_config.get("save_total_limit", 3),
-        fp16=model_config.get("fp16", False),
-        bf16=model_config.get("bf16", False),
-        max_grad_norm=model_config.get("max_grad_norm", 1.0),
-        push_to_hub=model_config.get("push_to_hub", False),
-        hub_model_id=model_config.get("hub_model_id", None),
+        output_dir=model_config.get("output_dir", "./results") or model_config.get("checkpointing", {}).get("output_dir", "./results"),
+        num_train_epochs=model_config.get("training", {}).get("num_train_epochs", 3),
+        per_device_train_batch_size=model_config.get("training", {}).get("per_device_train_batch_size", 24),
+        gradient_accumulation_steps=model_config.get("training", {}).get("gradient_accumulation_steps", 2),
+        learning_rate=model_config.get("training", {}).get("learning_rate", 2e-5),
+        weight_decay=model_config.get("training", {}).get("weight_decay", 0.01),
+        warmup_ratio=model_config.get("training", {}).get("warmup_ratio", 0.05),
+        lr_scheduler_type=model_config.get("training", {}).get("lr_scheduler_type", "cosine"),
+        logging_steps=model_config.get("training", {}).get("logging_steps", 10),
+        save_strategy=model_config.get("checkpointing", {}).get("save_strategy", "steps"),
+        save_steps=model_config.get("checkpointing", {}).get("save_steps", 100),
+        save_total_limit=model_config.get("checkpointing", {}).get("save_total_limit", 3),
+        fp16=use_fp16,
+        bf16=use_bf16,
+        max_grad_norm=model_config.get("training", {}).get("max_grad_norm", 1.0),
+        push_to_hub=model_config.get("huggingface_hub", {}).get("push_to_hub", False),
+        hub_model_id=model_config.get("huggingface_hub", {}).get("hub_model_id", None),
         hub_token=os.environ.get("HF_TOKEN", None),
         report_to="tensorboard",
-        remove_unused_columns=False,  # Keep
-        gradient_checkpointing=model_config.get("gradient_checkpointing", True),
+        remove_unused_columns=False,  # Keep all columns
+        gradient_checkpointing=model_config.get("training", {}).get("gradient_checkpointing", True),
         dataloader_pin_memory=False,  # Reduce memory usage
-        optim=model_config.get("optim", "adamw_torch"),
+        optim=model_config.get("training", {}).get("optim", "adamw_torch"),
         ddp_find_unused_parameters=False,  # Improve distributed training efficiency
         dataloader_drop_last=False,  # Process all examples
-        dataloader_num_workers=4,
+        dataloader_num_workers=4,  # Sequential data loading
     )
 
     # Create a sequential sampler to ensure dataset is processed in order
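The switch to chained lookups like model_config.get("training", {}).get(...) assumes transformers_config.json nests its keys under section objects ("training", "checkpointing", "huggingface_hub"). Because every .get() carries a default, a flat or incomplete config does not raise an error; it silently falls back to the hard-coded values. A minimal sketch of the nesting this commit implies (the load_config helper and file path are illustrative assumptions, not part of the commit):

import json

def load_config(path="transformers_config.json"):
    """Illustrative helper: load the JSON config from disk."""
    with open(path) as f:
        return json.load(f)

# Nested layout implied by the get("training", {}) / get("checkpointing", {})
# calls in the training script above.
model_config = {
    "training": {
        "num_train_epochs": 3,
        "per_device_train_batch_size": 24,
        "gradient_accumulation_steps": 2,
        "learning_rate": 2e-5,
    },
    "checkpointing": {"save_strategy": "steps", "save_steps": 100},
    "huggingface_hub": {"push_to_hub": False, "hub_model_id": None},
}

# A chained .get() with a {} fallback never raises KeyError: if the
# "training" section is missing, the default is used instead.
lr = model_config.get("training", {}).get("learning_rate", 2e-5)
assert lr == 2e-5

flat_config = {"learning_rate": 5e-5}  # old flat layout
# Note the silent fallback: the configured 5e-5 is ignored here.
assert flat_config.get("training", {}).get("learning_rate", 2e-5) == 2e-5

This is why the script and transformers_config.json move in the same commit: the nested keys must land together, or every hyperparameter quietly reverts to its default.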
transformers_config.json
CHANGED
@@ -29,7 +29,9 @@
     "warmup_ratio": 0.05,
     "weight_decay": 0.01,
     "max_grad_norm": 1.0,
-    "neftune_noise_alpha": 5
+    "neftune_noise_alpha": 5,
+    "fp16": false,
+    "bf16": true
   },
 
   "checkpointing": {

@@ -83,5 +85,6 @@
     "model_revision": "main",
     "use_flash_attention": true,
     "torch_dtype": "bfloat16",
-    "bf16": true
+    "bf16": true,
+    "fp16": false
   }
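TrainingArguments raises an error when fp16 and bf16 are both enabled, which is why the script resolves the flags before constructing it, with bf16 taking precedence (a torch_dtype of "bfloat16" also implies bf16). A small sketch of that precedence rule as a standalone check, assuming the same flat bf16/fp16/torch_dtype keys the script reads:

def resolve_precision(cfg: dict) -> tuple[bool, bool]:
    """Mirror the script's rule: bf16 wins; fp16 applies only without bf16."""
    use_bf16 = cfg.get("bf16", False) or cfg.get("torch_dtype", "") == "bfloat16"
    use_fp16 = cfg.get("fp16", False) and not use_bf16
    return use_bf16, use_fp16

# The values this commit writes into transformers_config.json:
assert resolve_precision({"bf16": True, "fp16": False, "torch_dtype": "bfloat16"}) == (True, False)
# Even a config that enables both flags resolves to a valid combination:
assert resolve_precision({"bf16": True, "fp16": True}) == (True, False)
# With neither flag (and no bfloat16 dtype), training runs in full precision:
assert resolve_precision({}) == (False, False)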