Upload folder using huggingface_hub

Files changed:
- run_transformers_training.py +32 -42
- transformers_config.json +5 -2
run_transformers_training.py
CHANGED
@@ -7,6 +7,7 @@ import json
 import argparse
 import logging
 from datetime import datetime
+import time
 
 # Import Unsloth first, before other ML imports
 try:

@@ -618,61 +619,50 @@ def main():
     # Simple logging callback
     class LoggingCallback(TrainerCallback):
         def __init__(self):
-            self.last_log_time = datetime.now()
-            self.training_start_time = datetime.now()
+            self.last_log_time = time.time()
 
         def on_step_end(self, args, state, control, **kwargs):
             # Log every 50 steps or every 5 minutes, whichever comes first
-            current_time = datetime.now()
-            time_diff = (current_time - self.last_log_time).total_seconds()
-            elapsed_time = (current_time - self.training_start_time).total_seconds() / 60
-
-            if state.global_step % 50 == 0 or time_diff > 300:  # 300 seconds = 5 minutes
-                loss = state.log_history[-1]['loss'] if state.log_history else 'N/A'
-                lr = state.log_history[-1]['learning_rate'] if state.log_history else 'N/A'
-
-                if isinstance(loss, float):
-                    loss_str = f"{loss:.4f}"
-                else:
-                    loss_str = str(loss)
-
-                if isinstance(lr, float):
-                    lr_str = f"{lr:.8f}"
-                else:
-                    lr_str = str(lr)
-
-                logger.info(f"Step: {state.global_step} | Loss: {loss_str} | LR: {lr_str} | Elapsed: {elapsed_time:.2f} min")
+            current_time = time.time()
+            if (state.global_step % 50 == 0) or (current_time - self.last_log_time > 300):
+                logger.info(f"Step {state.global_step}: Loss {state.log_history[-1]['loss'] if state.log_history else 'N/A'}")
                 self.last_log_time = current_time
 
+    # Verify precision settings - ensure only one of bf16/fp16 is set, with bf16 taking precedence
+    use_bf16 = model_config.get("bf16", False) or model_config.get("torch_dtype", "") == "bfloat16"
+    use_fp16 = model_config.get("fp16", False) and not use_bf16  # Only use fp16 if bf16 is not set
+
+    logger.info(f"Using precision: {'bf16' if use_bf16 else 'fp16' if use_fp16 else 'full precision'}")
+
     # Set up training arguments
     logger.info("Setting up training arguments")
     training_args = TrainingArguments(
-        output_dir=model_config.get("output_dir", "./results"),
-        num_train_epochs=model_config.get("num_train_epochs", 3),
-        per_device_train_batch_size=model_config.get("per_device_train_batch_size", 24),
-        gradient_accumulation_steps=model_config.get("gradient_accumulation_steps", 2),
-        learning_rate=model_config.get("learning_rate", 2e-5),
-        weight_decay=model_config.get("weight_decay", 0.01),
-        warmup_ratio=model_config.get("warmup_ratio", 0.05),
-        lr_scheduler_type=model_config.get("lr_scheduler_type", "cosine"),
-        logging_steps=model_config.get("logging_steps", 10),
-        save_strategy=model_config.get("save_strategy", "steps"),
-        save_steps=model_config.get("save_steps", 100),
-        save_total_limit=model_config.get("save_total_limit", 3),
-        fp16=model_config.get("fp16", False),
-        bf16=model_config.get("bf16", False),
-        max_grad_norm=model_config.get("max_grad_norm", 1.0),
-        push_to_hub=model_config.get("push_to_hub", False),
-        hub_model_id=model_config.get("hub_model_id", None),
+        output_dir=model_config.get("output_dir", "./results") or model_config.get("checkpointing", {}).get("output_dir", "./results"),
+        num_train_epochs=model_config.get("training", {}).get("num_train_epochs", 3),
+        per_device_train_batch_size=model_config.get("training", {}).get("per_device_train_batch_size", 24),
+        gradient_accumulation_steps=model_config.get("training", {}).get("gradient_accumulation_steps", 2),
+        learning_rate=model_config.get("training", {}).get("learning_rate", 2e-5),
+        weight_decay=model_config.get("training", {}).get("weight_decay", 0.01),
+        warmup_ratio=model_config.get("training", {}).get("warmup_ratio", 0.05),
+        lr_scheduler_type=model_config.get("training", {}).get("lr_scheduler_type", "cosine"),
+        logging_steps=model_config.get("training", {}).get("logging_steps", 10),
+        save_strategy=model_config.get("checkpointing", {}).get("save_strategy", "steps"),
+        save_steps=model_config.get("checkpointing", {}).get("save_steps", 100),
+        save_total_limit=model_config.get("checkpointing", {}).get("save_total_limit", 3),
+        fp16=use_fp16,
+        bf16=use_bf16,
+        max_grad_norm=model_config.get("training", {}).get("max_grad_norm", 1.0),
+        push_to_hub=model_config.get("huggingface_hub", {}).get("push_to_hub", False),
+        hub_model_id=model_config.get("huggingface_hub", {}).get("hub_model_id", None),
         hub_token=os.environ.get("HF_TOKEN", None),
         report_to="tensorboard",
-        remove_unused_columns=False,  # Keep
-        gradient_checkpointing=model_config.get("gradient_checkpointing", True),
+        remove_unused_columns=False,  # Keep all columns
+        gradient_checkpointing=model_config.get("training", {}).get("gradient_checkpointing", True),
         dataloader_pin_memory=False,  # Reduce memory usage
-        optim=model_config.get("optim", "adamw_torch"),
+        optim=model_config.get("training", {}).get("optim", "adamw_torch"),
         ddp_find_unused_parameters=False,  # Improve distributed training efficiency
         dataloader_drop_last=False,  # Process all examples
-        dataloader_num_workers=4,
+        dataloader_num_workers=4,  # Sequential data loading
     )
 
     # Create a sequential sampler to ensure dataset is processed in order
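The switch to chained lookups like model_config.get("training", {}).get(...) assumes transformers_config.json nests its keys under section objects ("training", "checkpointing", "huggingface_hub"). Because every .get() carries a default, a flat or incomplete config does not raise an error; it silently falls back to the hard-coded values. A minimal sketch of the nesting this commit implies (the load_config helper and file path are illustrative assumptions, not part of the commit):

import json

def load_config(path="transformers_config.json"):
    """Illustrative helper: load the JSON config from disk."""
    with open(path) as f:
        return json.load(f)

# Nested layout implied by the get("training", {}) / get("checkpointing", {})
# calls in the training script above.
model_config = {
    "training": {
        "num_train_epochs": 3,
        "per_device_train_batch_size": 24,
        "gradient_accumulation_steps": 2,
        "learning_rate": 2e-5,
    },
    "checkpointing": {"save_strategy": "steps", "save_steps": 100},
    "huggingface_hub": {"push_to_hub": False, "hub_model_id": None},
}

# A chained .get() with a {} fallback never raises KeyError: if the
# "training" section is missing, the default is used instead.
lr = model_config.get("training", {}).get("learning_rate", 2e-5)
assert lr == 2e-5

flat_config = {"learning_rate": 5e-5}  # old flat layout
# Note the silent fallback: the configured 5e-5 is ignored here.
assert flat_config.get("training", {}).get("learning_rate", 2e-5) == 2e-5

This is why the script and transformers_config.json move in the same commit: the nested keys must land together, or every hyperparameter quietly reverts to its default.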
transformers_config.json
CHANGED
@@ -29,7 +29,9 @@
     "warmup_ratio": 0.05,
     "weight_decay": 0.01,
     "max_grad_norm": 1.0,
-    "neftune_noise_alpha": 5
+    "neftune_noise_alpha": 5,
+    "fp16": false,
+    "bf16": true
   },
 
   "checkpointing": {

@@ -83,5 +85,6 @@
     "model_revision": "main",
     "use_flash_attention": true,
     "torch_dtype": "bfloat16",
-    "bf16": true
+    "bf16": true,
+    "fp16": false
   }
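TrainingArguments raises an error when fp16 and bf16 are both enabled, which is why the script resolves the flags before constructing it, with bf16 taking precedence (a torch_dtype of "bfloat16" also implies bf16). A small sketch of that precedence rule as a standalone check, assuming the same flat bf16/fp16/torch_dtype keys the script reads:

def resolve_precision(cfg: dict) -> tuple[bool, bool]:
    """Mirror the script's rule: bf16 wins; fp16 applies only without bf16."""
    use_bf16 = cfg.get("bf16", False) or cfg.get("torch_dtype", "") == "bfloat16"
    use_fp16 = cfg.get("fp16", False) and not use_bf16
    return use_bf16, use_fp16

# The values this commit writes into transformers_config.json:
assert resolve_precision({"bf16": True, "fp16": False, "torch_dtype": "bfloat16"}) == (True, False)
# Even a config that enables both flags resolves to a valid combination:
assert resolve_precision({"bf16": True, "fp16": True}) == (True, False)
# With neither flag (and no bfloat16 dtype), training runs in full precision:
assert resolve_precision({}) == (False, False)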