Spaces: Runtime error

Upload folder using huggingface_hub

Files changed:
- hardware_config.json (+55 -55)
- run_transformers_training.py (+210 -53)
- temp_function_fixes.py (+230 -0)
- transformers_config.json (+3 -3)
hardware_config.json
CHANGED
@@ -1,56 +1,56 @@
All 55 content lines are removed and re-added with identical text, so the change is whitespace-only. The file reads:

{
    "hardware_name": "4xL4",
    "specs": {
        "gpu_count": 4,
        "gpu_type": "L4",
        "vram_per_gpu": 24,
        "total_vram": 96,
        "vcpu_count": 48,
        "ram": 186
    },
    "training_optimizations": {
        "per_device_batch_size": 24,
        "gradient_accumulation_steps": 2,
        "effective_batch_size": 192,
        "memory_optimizations": {
            "use_gradient_checkpointing": true,
            "pin_memory": true,
            "num_workers": 4,
            "use_flash_attention": true
        },
        "distributed_settings": {
            "device_map": "auto",
            "ddp_find_unused_parameters": false,
            "use_fsdp": true,
            "fsdp_config": {
                "sharding_strategy": "FULL_SHARD",
                "mixed_precision": "BF16",
                "activation_checkpointing": true
            }
        }
    },
    "memory_breakdown": {
        "model_size": "~3.5GB (pre-quantized 4-bit)",
        "optimizer_states": "~1GB",
        "batch_memory_per_gpu": "~3GB",
        "peak_memory_estimate": "~18GB",
        "safe_headroom": "~6GB"
    },
    "compute_environment": "L4_CLOUD",
    "distributed_type": "FSDP",
    "mixed_precision": "bf16",
    "num_gpus": 4,
    "training_parameters": {
        "per_device_train_batch_size": 24,
        "gradient_accumulation_steps": 2,
        "dataloader_num_workers": 4,
        "dataloader_pin_memory": true,
        "gradient_checkpointing": true,
        "max_grad_norm": 1.0
    },
    "memory_optimization": {
        "offload_to_cpu": false,
        "use_flash_attention": true,
        "use_gradient_checkpointing": true
    }
}
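For orientation, the invariant tying these fields together is: effective batch size = per-device batch × gradient accumulation × GPU count. A minimal sketch of loading the file and checking that invariant (the training script consumes this config elsewhere; this snippet is only illustrative):

import json

with open("hardware_config.json") as f:
    hw = json.load(f)

opt = hw["training_optimizations"]
effective = (opt["per_device_batch_size"]
             * opt["gradient_accumulation_steps"]
             * hw["specs"]["gpu_count"])

# 24 * 2 * 4 = 192, matching the effective_batch_size field above.
assert effective == opt["effective_batch_size"]
print(f"Effective batch size: {effective}")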
run_transformers_training.py
CHANGED
@@ -39,6 +39,21 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
 
+# Set other loggers to WARNING to reduce noise and ensure our logs are visible
+logging.getLogger("transformers").setLevel(logging.WARNING)
+logging.getLogger("datasets").setLevel(logging.WARNING)
+logging.getLogger("accelerate").setLevel(logging.WARNING)
+logging.getLogger("torch").setLevel(logging.WARNING)
+logging.getLogger("bitsandbytes").setLevel(logging.WARNING)
+
+# Define a clean logging function for HF Space compatibility
+def log_info(message):
+    """Log information in a format compatible with Hugging Face Spaces"""
+    # Just use the logger, but ensure consistent formatting
+    logger.info(message)
+    # Also ensure output is flushed immediately for streaming
+    sys.stdout.flush()
+
 # Check for BitsAndBytes
 try:
     from transformers import BitsAndBytesConfig
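A side note on the flush helper: Python's logging writes to stderr by default, while a Space's log viewer tails the container's standard streams; if records ever fail to appear, pointing the root logger at stdout is the usual companion fix. A minimal sketch, not part of this commit:

import logging
import sys

# Route log records to stdout so a hosted log viewer that tails stdout sees them.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    stream=sys.stdout,
)
logging.getLogger(__name__).info("visible in the Space's container logs")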
@@ -142,6 +157,7 @@ def load_model_and_tokenizer(config):
         raise ValueError("Model name not found in configuration. Please check your transformers_config.json file.")
 
     logger.info("Using Unsloth optimizations with pre-quantized model")
+
     # Check for flash attention without importing it directly
     use_flash_attention = config.get("use_flash_attention", True)
     try:
@@ -153,11 +169,29 @@ def load_model_and_tokenizer(config):
 
     # First detect if we have a GPU
     if torch.cuda.is_available():
-
-
+        gpu_count = torch.cuda.device_count()
+        logger.info(f"CUDA available, found {gpu_count} GPU(s)")
+
+        # Log GPU info
+        for i in range(gpu_count):
+            logger.info(f"GPU {i}: {torch.cuda.get_device_name(i)}")
+            logger.info(f"Memory: {torch.cuda.get_device_properties(i).total_memory / 1024**3:.2f} GB")
+
+        # Create an optimized device map for better balance
+        if gpu_count > 1:
+            logger.info(f"Creating balanced device map for {gpu_count} GPUs")
+            # Use auto mapping but with memory tracking
+            device_map = "auto"
+            # Set max memory for better balancing
+            max_memory = {i: f"{int(torch.cuda.get_device_properties(i).total_memory * 0.85 / 1024**3)}GiB" for i in range(gpu_count)}
+            logger.info(f"Max memory settings: {max_memory}")
+        else:
+            device_map = "auto"
+            max_memory = None
     else:
         logger.warning("No CUDA available, falling back to CPU")
         device_map = {"": "cpu"}  # Force CPU placement
+        max_memory = None
 
     # Set default dtype for better numerics
     if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
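The max_memory expression above reserves roughly 85% of each card's VRAM. On the 4×L4 profile committed alongside this script (24 GB per GPU) it works out as follows; this standalone snippet just mirrors the arithmetic without needing a GPU:

# Mirror of the max_memory computation for a hypothetical 4x 24 GB setup.
gpu_count = 4
total_memory_bytes = 24 * 1024**3  # what get_device_properties(i).total_memory would report

max_memory = {i: f"{int(total_memory_bytes * 0.85 / 1024**3)}GiB" for i in range(gpu_count)}
print(max_memory)  # {0: '20GiB', 1: '20GiB', 2: '20GiB', 3: '20GiB'}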
@@ -175,11 +209,15 @@ def load_model_and_tokenizer(config):
 
     # Load model with proper error handling for out-of-memory
     try:
+        # Improved memory settings for multi-GPU setup
+        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+
         model, tokenizer = FastLanguageModel.from_pretrained(
             model_name=model_name,
             max_seq_length=config.get("max_seq_length", 2048) or config.get("tokenizer", {}).get("max_seq_length", 2048),
             dtype=dtype,
             device_map=device_map,
+            max_memory=max_memory,
             # Don't explicitly use flash attention config here, let Unsloth handle it
         )
     except RuntimeError as e:
@@ -355,7 +393,7 @@ def format_phi_chat(messages, dataset_config):
 
         role = message.get("role", "").lower()
         content = message.get("content", "")
-
+
        # Format based on role
        if role == "human" or role == "user":
            template = roles.get("user", roles.get("human", "Human: {content}\n\n"))
@@ -413,9 +451,9 @@ class SimpleDataCollator:
                     return_tensors=None,
                     add_generation_prompt=False
                 )
-            except:
+            except Exception as chat_error:
                 # Fallback if apply_chat_template fails
-                logger.warning(f"Chat template application failed for example {paper_id}")
+                logger.warning(f"Chat template application failed for example {paper_id}: {str(chat_error)[:100]}")
 
                 # Create a basic representation of the conversation
                 conversation_text = ""
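The substantive fix in this hunk is narrowing the bare except: (which would also swallow KeyboardInterrupt) to except Exception as chat_error and completing the warning message. In isolation, the try/fallback pattern looks like the sketch below; the checkpoint name is illustrative, and the fallback takes over for any tokenizer without a chat template:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")  # illustrative

conversation = [
    {"role": "user", "content": "What is the paper about?"},
    {"role": "assistant", "content": "It studies long-context fine-tuning."},
]

try:
    # Preferred path: the model's own chat template lays out the turns.
    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=False)
except Exception as chat_error:
    # Fallback: concatenate raw contents and tokenize plainly.
    text = "\n\n".join(msg["content"] for msg in conversation)
    input_ids = tokenizer(text, add_special_tokens=True)["input_ids"]

print(f"{len(input_ids)} tokens")

Note the explicit ["input_ids"] index in the fallback: calling a tokenizer directly returns a BatchEncoding mapping, so length checks and slicing should be done on the id list itself, not the mapping.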
@@ -494,6 +532,70 @@ class SimpleDataCollator:
 
         return batch
 
+class LoggingCallback(TrainerCallback):
+    def __init__(self):
+        self.last_log_time = time.time()
+        self.last_memory_log_time = time.time()
+
+    def on_step_end(self, args, state, control, **kwargs):
+        # Log every 50 steps or every 5 minutes, whichever comes first
+        current_time = time.time()
+
+        # Log loss every 50 steps or 5 minutes
+        if (state.global_step % 50 == 0) or (current_time - self.last_log_time > 300):
+            if state.log_history:
+                loss = state.log_history[-1].get('loss', 'N/A')
+                # Use simple formatting for better HF Space log compatibility
+                log_info(f"Step {state.global_step}: Loss {loss}")
+            else:
+                log_info(f"Step {state.global_step}: No loss data available")
+            self.last_log_time = current_time
+
+        # Log memory usage every 15 minutes
+        if current_time - self.last_memory_log_time > 900:  # 15 minutes
+            if torch.cuda.is_available():
+                memory_info = []
+                for i in range(torch.cuda.device_count()):
+                    allocated = torch.cuda.memory_allocated(i) / 1024**2
+                    reserved = torch.cuda.memory_reserved(i) / 1024**2
+                    memory_info.append(f"GPU {i}: {allocated:.1f}MB/{reserved:.1f}MB")
+
+                # Log in compact format for better visibility
+                log_info(f"Memory usage - {', '.join(memory_info)}")
+            self.last_memory_log_time = current_time
+
+    def on_train_begin(self, args, state, control, **kwargs):
+        log_info("=== Training is starting ===")
+
+        # Log important training parameters for visibility
+        log_info(f"Batch size: {args.per_device_train_batch_size} × {args.gradient_accumulation_steps} steps × {max(1, torch.cuda.device_count())} GPUs")
+        log_info(f"Learning rate: {args.learning_rate}")
+        log_info(f"Epochs: {args.num_train_epochs}")
+
+        # Log memory information in compact format
+        if torch.cuda.is_available():
+            memory_info = []
+            for i in range(torch.cuda.device_count()):
+                allocated = torch.cuda.memory_allocated(i) / 1024**2
+                max_mem = torch.cuda.max_memory_allocated(i) / 1024**2
+                memory_info.append(f"GPU {i}: {allocated:.1f}MB (max: {max_mem:.1f}MB)")
+
+            log_info(f"Initial memory usage - {', '.join(memory_info)}")
+
+    def on_train_end(self, args, state, control, **kwargs):
+        log_info("=== Training completed ===")
+        if torch.cuda.is_available():
+            memory_info = []
+            for i in range(torch.cuda.device_count()):
+                allocated = torch.cuda.memory_allocated(i) / 1024**2
+                max_mem = torch.cuda.max_memory_allocated(i) / 1024**2
+                memory_info.append(f"GPU {i}: {allocated:.1f}MB (max: {max_mem:.1f}MB)")
+
+            log_info(f"Final memory usage - {', '.join(memory_info)}")
+
+        log_info(f"Total steps: {state.global_step}")
+        log_info(f"Final loss: {state.log_history[-1].get('loss', 'N/A') if state.log_history else 'N/A'}")
+
 def check_dependencies():
     """Check if all required dependencies are installed."""
     missing_packages = []
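Hoisting LoggingCallback to module level (it previously lived inside main() and is removed there in a later hunk) means it must now be handed to the Trainer explicitly; the hunks below don't show a callbacks= argument, so this is a hedged sketch of the standard registration, with model, training_args, dataset, and data_collator assumed from the surrounding script:

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
    callbacks=[LoggingCallback()],  # on_train_begin / on_step_end / on_train_end then fire automatically
)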
@@ -525,7 +627,7 @@ def check_dependencies():
 
 def main():
     # Set up logging
-
+    log_info("Starting Phi-4 fine-tuning process")
 
     # Parse arguments
     args = parse_args()
@@ -566,8 +668,9 @@ def main():
         logger.error("Please ensure 'name' is specified under 'model' in transformers_config.json")
         return 1
 
-
-
+    model_name = model_config.get("model", {}).get("name") or model_config.get("model_name_or_path") or model_config.get("model_name")
+    log_info(f"Using model: {model_name}")
+    log_info("All configurations loaded successfully")
 
     # Extract specific configs
     model_config = configs["transformers"]
@@ -582,11 +685,11 @@ def main():
 
     if per_device_batch_size and model_config.get("training"):
         model_config["training"]["per_device_train_batch_size"] = per_device_batch_size
-
+        log_info(f"Applied hardware-specific batch size: {per_device_batch_size}")
 
     if gradient_accumulation and model_config.get("training"):
         model_config["training"]["gradient_accumulation_steps"] = gradient_accumulation
-
+        log_info(f"Applied hardware-specific gradient accumulation: {gradient_accumulation}")
 
     # Apply memory optimizations
     memory_opts = training_opts.get("memory_optimizations", {})
@@ -600,28 +703,39 @@ def main():
     # Set random seed for reproducibility
     seed = model_config.get("seed", 42)
     set_seed(seed)
-
+    log_info(f"Set random seed to {seed} for reproducibility")
 
     # Check CUDA and set environment variables for better memory management
     if torch.cuda.is_available():
         # Empty CUDA cache
         torch.cuda.empty_cache()
-
-
-
+
+        # Set memory management env vars for better fragmentation handling
+        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128,expandable_segments:True"
+
+        # Log initial memory information in a compact form
+        gpu_info = []
         for i in range(torch.cuda.device_count()):
-
-
-
+            name = torch.cuda.get_device_name(i)
+            allocated = torch.cuda.memory_allocated(i) / 1024**3
+            total = torch.cuda.get_device_properties(i).total_memory / 1024**3
+            gpu_info.append(f"GPU {i}: {name} ({allocated:.1f}GB/{total:.1f}GB)")
+
+        log_info(f"Hardware: {torch.cuda.device_count()} GPUs detected")
+        log_info(f"GPU details: {', '.join(gpu_info)}")
+    else:
+        log_info("No GPU detected, using CPU (training will be very slow)")
 
     try:
+        log_info("Loading model and tokenizer...")
         model, tokenizer = load_model_and_tokenizer(model_config)
-
+        log_info("Model and tokenizer loaded successfully")
 
         # Load dataset with proper mapping
         try:
+            log_info(f"Loading dataset from {dataset_config.get('dataset', {}).get('name', '')}")
             dataset = load_dataset_with_mapping(dataset_config)
-
+            log_info(f"Dataset loaded with {len(dataset)} examples")
         except Exception as e:
             logger.error(f"Error loading dataset: {e}")
             return 1
@@ -629,38 +743,64 @@ def main():
         # Create data collator
         data_collator = SimpleDataCollator(tokenizer, dataset_config)
 
-        # Simple logging callback
-        class LoggingCallback(TrainerCallback):
-            def __init__(self):
-                self.last_log_time = time.time()
-
-            def on_step_end(self, args, state, control, **kwargs):
-                # Log every 50 steps or every 5 minutes, whichever comes first
-                current_time = time.time()
-                if (state.global_step % 50 == 0) or (current_time - self.last_log_time > 300):
-                    logger.info(f"Step {state.global_step}: Loss {state.log_history[-1]['loss'] if state.log_history else 'N/A'}")
-                    self.last_log_time = current_time
-
-            def on_train_begin(self, args, state, control, **kwargs):
-                logger.info("Training is starting...")
-                # Log memory information
-                if torch.cuda.is_available():
-                    for i in range(torch.cuda.device_count()):
-                        logger.info(f"GPU {i} Memory Allocated: {torch.cuda.memory_allocated(i) / 1024**2:.2f} MB")
-
         # Verify precision settings - ensure only one of bf16/fp16 is set, with bf16 taking precedence
         use_bf16 = model_config.get("bf16", False) or model_config.get("torch_dtype", "") == "bfloat16"
         use_fp16 = model_config.get("fp16", False) and not use_bf16  # Only use fp16 if bf16 is not set
 
-
+        log_info(f"Using precision: {'bf16' if use_bf16 else 'fp16' if use_fp16 else 'full precision'}")
+
+        # Get per device batch size - temporarily reduce if necessary for multi-GPU setup
+        per_device_batch_size = model_config.get("training", {}).get("per_device_train_batch_size", 24)
+        gradient_accumulation_steps = model_config.get("training", {}).get("gradient_accumulation_steps", 2)
+
+        # For multi-GPU setup, adjust for better balance
+        if torch.cuda.device_count() > 1:
+            log_info(f"Multi-GPU setup with {torch.cuda.device_count()} GPUs")
+            log_info(f"Training config: {per_device_batch_size} samples/GPU × {gradient_accumulation_steps} accumulation steps")
+
+        # Set up FSDP for multi-GPU training if available
+        fsdp_config = None
+        if torch.cuda.device_count() > 1:
+            try:
+                from torch.distributed.fsdp import (
+                    FullyShardedDataParallel as FSDP,
+                    MixedPrecision,
+                    BackwardPrefetch,
+                    ShardingStrategy,
+                    CPUOffload,
+                )
+                from torch.distributed.fsdp.wrap import (
+                    transformer_auto_wrap_policy,
+                    enable_wrap,
+                    wrap,
+                )
+
+                log_info("Using FSDP for distributed training")
+
+                # Configure FSDP
+                fsdp_config = {
+                    "fsdp_transformer_layer_cls_to_wrap": ["LlamaDecoderLayer"],
+                    "fsdp_offload_params": False,
+                    "fsdp_backward_prefetch": "BACKWARD_PRE",
+                    "fsdp_min_num_params": 1e6,
+                    "fsdp_sharding_strategy": 1,  # FULL_SHARD
+                }
+
+                if use_bf16 or use_fp16:
+                    precision_type = "bf16" if use_bf16 else "fp16"
+                    fsdp_config["fsdp_state_dict_type"] = "FULL_STATE_DICT"
+                    log_info(f"FSDP using mixed precision: {precision_type}")
+            except ImportError:
+                log_info("FSDP imports failed, falling back to standard DDP")
+                fsdp_config = None
 
         # Set up training arguments
-
+        log_info("Setting up training arguments")
         training_args = TrainingArguments(
             output_dir=model_config.get("output_dir", "./results") or model_config.get("checkpointing", {}).get("output_dir", "./results"),
             num_train_epochs=model_config.get("training", {}).get("num_train_epochs", 3),
-            per_device_train_batch_size=
-            gradient_accumulation_steps=
+            per_device_train_batch_size=per_device_batch_size,
+            gradient_accumulation_steps=gradient_accumulation_steps,
             learning_rate=model_config.get("training", {}).get("learning_rate", 2e-5),
             weight_decay=model_config.get("training", {}).get("weight_decay", 0.01),
             warmup_ratio=model_config.get("training", {}).get("warmup_ratio", 0.05),
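One caveat on the fsdp_config dict assembled here: in recent transformers releases, TrainingArguments.fsdp takes a bool, a space-separated string such as "full_shard auto_wrap", or a list of options, while the fine-grained dictionary goes to the separate fsdp_config parameter (key names have shifted across versions). Passing the dict straight to fsdp=, as the next hunk does, may not parse. A hedged sketch of the documented split, assuming a Llama-style decoder block:

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    fsdp="full_shard auto_wrap",  # strategy flags go here as a string or list
    fsdp_config={
        "transformer_layer_cls_to_wrap": ["LlamaDecoderLayer"],
        "backward_prefetch": "backward_pre",
    },
)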
@@ -682,15 +822,16 @@ def main():
             optim=model_config.get("training", {}).get("optim", "adamw_torch"),
             ddp_find_unused_parameters=False,  # Improve distributed training efficiency
             dataloader_drop_last=False,  # Process all examples
-            dataloader_num_workers=
+            dataloader_num_workers=2,  # Reduced worker count
             no_cuda=False if torch.cuda.is_available() else True,  # Use CUDA if available
+            fsdp=fsdp_config,  # Add FSDP configuration if available
         )
 
         # Create sequential sampler to maintain original dataset order
         sequential_sampler = torch.utils.data.SequentialSampler(dataset)
 
         # Initialize trainer first
-
+        log_info("Initializing Trainer")
         trainer = Trainer(
             model=model,
             args=training_args,
@@ -702,7 +843,7 @@ def main():
         # Then override the get_train_dataloader method
         def custom_get_train_dataloader():
             """Custom dataloader that preserves original dataset order"""
-
+            log_info("Creating sequential dataloader to maintain original dataset order")
 
             # Calculate batch size based on device availability
             if getattr(training_args, "no_cuda", False):
@@ -710,7 +851,7 @@ def main():
             else:
                 batch_size = max(training_args.per_device_train_batch_size * max(1, torch.cuda.device_count()), 1)
 
-
+            log_info(f"Using sequential sampler with batch size {batch_size}")
 
             # Return DataLoader with sequential sampler
             return torch.utils.data.DataLoader(
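The override exists because Trainer defaults to a random (or distributed) sampler for training; a SequentialSampler preserves on-disk order, which matters for a pre-ordered dataset. In isolation the pattern is just:

import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset

# Toy stand-in dataset: 10 single-feature examples in a meaningful order.
dataset = TensorDataset(torch.arange(10).unsqueeze(1))

loader = DataLoader(
    dataset,
    batch_size=4,
    sampler=SequentialSampler(dataset),  # yields indices 0..9 in order
    drop_last=False,
)

for (batch,) in loader:
    print(batch.squeeze(1).tolist())  # [0, 1, 2, 3], [4, 5, 6, 7], [8, 9]

Worth noting: under multi-process training a plain SequentialSampler feeds every rank the same examples unless sharding is layered on top, which is part of what this override opts out of.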
@@ -727,28 +868,44 @@ def main():
         trainer.get_train_dataloader = custom_get_train_dataloader
 
         # Start training
-
+        log_info("=== Starting Training ===")
         try:
+            # Empty cache again right before training
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+                log_info("Cleared CUDA cache before training")
+
+            # Display compact training info
+            total_steps = int(len(dataset) / (per_device_batch_size * torch.cuda.device_count() * gradient_accumulation_steps) * training_args.num_train_epochs)
+            log_info(f"Training plan: {len(dataset)} examples over {training_args.num_train_epochs} epochs ≈ {total_steps} steps")
+
             trainer.train()
-
+            log_info("Training completed successfully!")
 
             # Save the final model
-
+            log_info("Saving final model...")
             trainer.save_model()
+            log_info(f"Model saved to {training_args.output_dir}")
 
             # Push to hub if enabled
             if model_config.get("huggingface_hub", {}).get("push_to_hub", False):
-
+                hub_id = model_config.get("huggingface_hub", {}).get("hub_model_id", "model")
+                log_info(f"Pushing model to Hugging Face Hub as {hub_id}...")
                 trainer.push_to_hub()
+                log_info("Model successfully pushed to Hub")
 
             return 0
         except Exception as e:
             logger.error(f"Training failed with error: {str(e)}")
-            # Log CUDA memory info if available
+            # Log CUDA memory info if available in compact format
             if torch.cuda.is_available():
+                memory_info = []
                 for i in range(torch.cuda.device_count()):
-
-
+                    allocated = torch.cuda.memory_allocated(i) / 1024**2
+                    reserved = torch.cuda.memory_reserved(i) / 1024**2
+                    max_mem = torch.cuda.max_memory_allocated(i) / 1024**2
+                    memory_info.append(f"GPU {i}: {allocated:.1f}MB/{reserved:.1f}MB (max: {max_mem:.1f}MB)")
+                logger.error(f"GPU memory at failure: {', '.join(memory_info)}")
             raise
 
     except Exception as e:
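The total_steps shown in the training plan is an estimate: int(...) truncates toward zero, and it assumes the batch spreads across torch.cuda.device_count() GPUs. With the values committed in transformers_config.json below (16 per device, accumulation 3) on the 4-GPU profile, the arithmetic looks like this; the dataset size is a made-up placeholder:

import math

num_examples = 10_000          # placeholder; the script uses len(dataset)
per_device, gpus, accum, epochs = 16, 4, 3, 3

effective_batch = per_device * gpus * accum        # 192
steps = math.ceil(num_examples / effective_batch) * epochs
print(effective_batch, steps)                      # 192 159

The script's int(...) rounds down, so with dataloader_drop_last=False the logged plan can slightly undercount the steps that actually run.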
temp_function_fixes.py
ADDED
@@ -0,0 +1,230 @@
+def format_phi_chat(messages, dataset_config):
+    """Format messages according to phi-4's chat template and dataset config."""
+    formatted_chat = ""
+
+    # Get role templates from config
+    roles = dataset_config.get("data_formatting", {}).get("roles", {
+        "system": "System: {content}\n\n",
+        "human": "Human: {content}\n\n",
+        "user": "Human: {content}\n\n",
+        "assistant": "Assistant: {content}\n\n"
+    })
+
+    # Handle research introduction metadata first
+    metadata = next((msg for msg in messages if isinstance(msg, dict) and
+                     "[RESEARCH INTRODUCTION]" in msg.get("content", "")), None)
+    if metadata:
+        system_template = roles.get("system", "System: {content}\n\n")
+        formatted_chat = system_template.format(content=metadata['content'])
+        messages = [msg for msg in messages if msg != metadata]
+
+    # Process remaining messages
+    for message in messages:
+        if not isinstance(message, dict) or "content" not in message:
+            logger.warning(f"Skipping invalid message format: {message}")
+            continue
+
+        role = message.get("role", "").lower()
+        content = message.get("content", "")
+
+        # Format based on role
+        if role == "human" or role == "user":
+            template = roles.get("user", roles.get("human", "Human: {content}\n\n"))
+            formatted_chat += template.format(content=content)
+        elif role == "assistant" or role == "bot":
+            template = roles.get("assistant", "Assistant: {content}\n\n")
+            formatted_chat += template.format(content=content)
+        elif role == "system":
+            # For system messages, prepend them
+            template = roles.get("system", "System: {content}\n\n")
+            formatted_chat = template.format(content=content) + formatted_chat
+        else:
+            # Default to system for unknown roles
+            logger.warning(f"Unknown role '{role}' - treating as system message")
+            template = roles.get("system", "System: {content}\n\n")
+            formatted_chat += template.format(content=content)
+
+    return formatted_chat.strip()
+
+class SimpleDataCollator:
+    def __init__(self, tokenizer, dataset_config):
+        self.tokenizer = tokenizer
+        self.dataset_config = dataset_config
+        self.stats = {"processed": 0, "skipped": 0, "total_tokens": 0}
+        self.pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
+        self.max_seq_length = dataset_config.get("dataset", {}).get("processing", {}).get("max_seq_length", 2048)
+        logger.info(f"SimpleDataCollator initialized - using pre-audited dataset with max_seq_length={self.max_seq_length}")
+        logger.info("Using exact dataset structure without reformatting")
+
+        # Check if we're on GPU
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        logger.info(f"SimpleDataCollator using device: {self.device}")
+
+    def __call__(self, features):
+        """Process examples preserving exact JSONL structure"""
+        batch = {"input_ids": [], "attention_mask": [], "labels": []}
+
+        for example in features:
+            try:
+                # Get ID
+                paper_id = example.get("id", "")
+
+                # Get conversations - these should already contain role and content
+                conversations = example.get("conversations", [])
+                if not conversations:
+                    self.stats["skipped"] += 1
+                    continue
+
+                # Directly use the conversations array as input to the model's chat template
+                # This preserves the exact structure with roles and content as they are
+                try:
+                    # Let tokenizer handle the content with the model's chat template
+                    inputs = self.tokenizer.apply_chat_template(
+                        conversations,
+                        return_tensors=None,
+                        add_generation_prompt=False
+                    )
+                except Exception as chat_error:
+                    # Fallback if apply_chat_template fails
+                    logger.warning(f"Chat template application failed for example {paper_id}: {str(chat_error)[:100]}")
+
+                    # Create a basic representation of the conversation
+                    conversation_text = ""
+                    for msg in conversations:
+                        if isinstance(msg, dict) and 'content' in msg:
+                            conversation_text += msg.get('content', '') + "\n\n"
+
+                    # Basic tokenization
+                    inputs = self.tokenizer(
+                        conversation_text,
+                        add_special_tokens=True,
+                        return_tensors=None
+                    )
+
+                # Apply length cap if needed (shouldn't be necessary for pre-audited data)
+                if self.max_seq_length > 0 and len(inputs) > self.max_seq_length:
+                    logger.warning(f"Example {paper_id} exceeds max_seq_length ({len(inputs)} > {self.max_seq_length})")
+                    inputs = inputs[:self.max_seq_length]
+
+                # Create attention mask (1 for all tokens)
+                attention_mask = [1] * len(inputs)
+
+                if len(inputs) > 0:
+                    # For causal language modeling, labels are the same as inputs
+                    labels = inputs.copy()
+
+                    batch["input_ids"].append(inputs)
+                    batch["attention_mask"].append(attention_mask)
+                    batch["labels"].append(labels)
+
+                    self.stats["processed"] += 1
+                    self.stats["total_tokens"] += len(inputs)
+
+                    # Debug logging for first few examples
+                    log_samples = self.dataset_config.get("validation", {}).get("log_samples", 3)
+                    if self.stats["processed"] <= log_samples:
+                        logger.info(f"Example {self.stats['processed']}:")
+                        logger.info(f"Paper ID: {paper_id}")
+                        logger.info(f"Token count: {len(inputs)}")
+                        logger.info(f"Conversation entries: {len(conversations)}")
+                else:
+                    self.stats["skipped"] += 1
+            except Exception as e:
+                logger.warning(f"Error processing example: {str(e)[:100]}...")
+                logger.warning(f"Problematic example ID: {example.get('id', 'unknown')}")
+                self.stats["skipped"] += 1
+                continue
+
+        if not batch["input_ids"]:
+            logger.warning("Empty batch, returning dummy tensors")
+            return {
+                "input_ids": torch.zeros((1, 1), dtype=torch.long),
+                "attention_mask": torch.zeros((1, 1), dtype=torch.long),
+                "labels": torch.zeros((1, 1), dtype=torch.long)
+            }
+
+        # Pad the batch
+        max_length = max(len(ids) for ids in batch["input_ids"])
+
+        for i in range(len(batch["input_ids"])):
+            padding_length = max_length - len(batch["input_ids"][i])
+            if padding_length > 0:
+                batch["input_ids"][i].extend([self.pad_token_id] * padding_length)
+                batch["attention_mask"][i].extend([0] * padding_length)
+                batch["labels"][i].extend([-100] * padding_length)
+
+        # Convert to tensors
+        batch = {k: torch.tensor(v, dtype=torch.long) for k, v in batch.items()}
+
+        # Log stats periodically
+        log_interval = self.dataset_config.get("validation", {}).get("log_interval", 100)
+        if self.stats["processed"] % log_interval == 0 and self.stats["processed"] > 0:
+            logger.info(f"Data collator stats: processed={self.stats['processed']}, "
+                        f"skipped={self.stats['skipped']}, "
+                        f"avg_tokens={self.stats['total_tokens']/self.stats['processed']:.1f}")
+
+        return batch
+
+class LoggingCallback(TrainerCallback):
+    def __init__(self):
+        self.last_log_time = time.time()
+        self.last_memory_log_time = time.time()
+
+    def on_step_end(self, args, state, control, **kwargs):
+        # Log every 50 steps or every 5 minutes, whichever comes first
+        current_time = time.time()
+
+        # Log loss every 50 steps or 5 minutes
+        if (state.global_step % 50 == 0) or (current_time - self.last_log_time > 300):
+            if state.log_history:
+                loss = state.log_history[-1].get('loss', 'N/A')
+                # Use simple formatting for better HF Space log compatibility
+                log_info(f"Step {state.global_step}: Loss {loss}")
+            else:
+                log_info(f"Step {state.global_step}: No loss data available")
+            self.last_log_time = current_time
+
+        # Log memory usage every 15 minutes
+        if current_time - self.last_memory_log_time > 900:  # 15 minutes
+            if torch.cuda.is_available():
+                memory_info = []
+                for i in range(torch.cuda.device_count()):
+                    allocated = torch.cuda.memory_allocated(i) / 1024**2
+                    reserved = torch.cuda.memory_reserved(i) / 1024**2
+                    memory_info.append(f"GPU {i}: {allocated:.1f}MB/{reserved:.1f}MB")
+
+                # Log in compact format for better visibility
+                log_info(f"Memory usage - {', '.join(memory_info)}")
+            self.last_memory_log_time = current_time
+
+    def on_train_begin(self, args, state, control, **kwargs):
+        log_info("=== Training is starting ===")
+
+        # Log important training parameters for visibility
+        log_info(f"Batch size: {args.per_device_train_batch_size} × {args.gradient_accumulation_steps} steps × {max(1, torch.cuda.device_count())} GPUs")
+        log_info(f"Learning rate: {args.learning_rate}")
+        log_info(f"Epochs: {args.num_train_epochs}")
+
+        # Log memory information in compact format
+        if torch.cuda.is_available():
+            memory_info = []
+            for i in range(torch.cuda.device_count()):
+                allocated = torch.cuda.memory_allocated(i) / 1024**2
+                max_mem = torch.cuda.max_memory_allocated(i) / 1024**2
+                memory_info.append(f"GPU {i}: {allocated:.1f}MB (max: {max_mem:.1f}MB)")
+
+            log_info(f"Initial memory usage - {', '.join(memory_info)}")
+
+    def on_train_end(self, args, state, control, **kwargs):
+        log_info("=== Training completed ===")
+        if torch.cuda.is_available():
+            memory_info = []
+            for i in range(torch.cuda.device_count()):
+                allocated = torch.cuda.memory_allocated(i) / 1024**2
+                max_mem = torch.cuda.max_memory_allocated(i) / 1024**2
+                memory_info.append(f"GPU {i}: {allocated:.1f}MB (max: {max_mem:.1f}MB)")
+
+            log_info(f"Final memory usage - {', '.join(memory_info)}")
+
+        log_info(f"Total steps: {state.global_step}")
+        log_info(f"Final loss: {state.log_history[-1].get('loss', 'N/A') if state.log_history else 'N/A'}")
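As committed, temp_function_fixes.py reads like a scratch pad of replacement function bodies rather than an importable module: it references logger, log_info, time, torch, and TrainerCallback without defining or importing any of them. If it were ever run standalone, it would need a preamble along these lines (a sketch; log_info mirrors the helper added to run_transformers_training.py):

import logging
import sys
import time

import torch
from transformers import TrainerCallback

logger = logging.getLogger(__name__)

def log_info(message):
    """Mirror of the log_info helper in run_transformers_training.py."""
    logger.info(message)
    sys.stdout.flush()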
transformers_config.json
CHANGED
@@ -13,8 +13,8 @@
     },
 
     "training": {
-        "per_device_train_batch_size":
-        "gradient_accumulation_steps":
+        "per_device_train_batch_size": 16,
+        "gradient_accumulation_steps": 3,
         "learning_rate": 2e-5,
         "num_train_epochs": 3,
         "max_steps": -1,
@@ -67,7 +67,7 @@
         "offload_params": false
     },
     "ddp_find_unused_parameters": false,
-    "dataloader_num_workers":
+    "dataloader_num_workers": 2
     },
 
     "logging": {
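Assuming the truncated old values matched the 24/2 pair in hardware_config.json, the two training edits offset each other on the 4-GPU profile: the effective batch size stays at 192, trading per-step activation memory for an extra accumulation step. As a quick check:

# Effective batch size before and after, assuming 4 GPUs and the old 24/2 pair.
old = 24 * 2 * 4   # per-device batch x accumulation x GPUs
new = 16 * 3 * 4
assert old == new == 192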