George-API committed (verified)
Commit adb15f9 · 1 Parent(s): 7e5a6ad

Upload folder using huggingface_hub

hardware_config.json CHANGED
@@ -1,56 +1,56 @@
(The removed and re-added contents of this file are identical in the diff; they are shown once below.)
{
    "hardware_name": "4xL4",
    "specs": {
        "gpu_count": 4,
        "gpu_type": "L4",
        "vram_per_gpu": 24,
        "total_vram": 96,
        "vcpu_count": 48,
        "ram": 186
    },
    "training_optimizations": {
        "per_device_batch_size": 24,
        "gradient_accumulation_steps": 2,
        "effective_batch_size": 192,
        "memory_optimizations": {
            "use_gradient_checkpointing": true,
            "pin_memory": true,
            "num_workers": 4,
            "use_flash_attention": true
        },
        "distributed_settings": {
            "device_map": "auto",
            "ddp_find_unused_parameters": false,
            "use_fsdp": true,
            "fsdp_config": {
                "sharding_strategy": "FULL_SHARD",
                "mixed_precision": "BF16",
                "activation_checkpointing": true
            }
        }
    },
    "memory_breakdown": {
        "model_size": "~3.5GB (pre-quantized 4-bit)",
        "optimizer_states": "~1GB",
        "batch_memory_per_gpu": "~3GB",
        "peak_memory_estimate": "~18GB",
        "safe_headroom": "~6GB"
    },
    "compute_environment": "L4_CLOUD",
    "distributed_type": "FSDP",
    "mixed_precision": "bf16",
    "num_gpus": 4,
    "training_parameters": {
        "per_device_train_batch_size": 24,
        "gradient_accumulation_steps": 2,
        "dataloader_num_workers": 4,
        "dataloader_pin_memory": true,
        "gradient_checkpointing": true,
        "max_grad_norm": 1.0
    },
    "memory_optimization": {
        "offload_to_cpu": false,
        "use_flash_attention": true,
        "use_gradient_checkpointing": true
    }
}
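For reference, run_transformers_training.py (below) consumes this file by overlaying the training_optimizations values onto the transformers training config. A minimal standalone sketch of that merge step, assuming both JSON files sit in the working directory (file and key names come from this commit; the helper function itself is illustrative, not part of the repo):

import json

def apply_hardware_overrides(transformers_path="transformers_config.json",
                             hardware_path="hardware_config.json"):
    """Overlay hardware-specific batch settings onto the training config (sketch)."""
    with open(transformers_path) as f:
        model_config = json.load(f)
    with open(hardware_path) as f:
        hw_config = json.load(f)

    training_opts = hw_config.get("training_optimizations", {})
    per_device = training_opts.get("per_device_batch_size")
    grad_accum = training_opts.get("gradient_accumulation_steps")

    # Mirrors the override logic shown in run_transformers_training.py's main()
    if per_device and model_config.get("training"):
        model_config["training"]["per_device_train_batch_size"] = per_device
    if grad_accum and model_config.get("training"):
        model_config["training"]["gradient_accumulation_steps"] = grad_accum
    return model_config

With the values above this yields per_device_train_batch_size=24 and gradient_accumulation_steps=2 before any later adjustments in the script.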
run_transformers_training.py CHANGED
@@ -39,6 +39,21 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
 
+# Set other loggers to WARNING to reduce noise and ensure our logs are visible
+logging.getLogger("transformers").setLevel(logging.WARNING)
+logging.getLogger("datasets").setLevel(logging.WARNING)
+logging.getLogger("accelerate").setLevel(logging.WARNING)
+logging.getLogger("torch").setLevel(logging.WARNING)
+logging.getLogger("bitsandbytes").setLevel(logging.WARNING)
+
+# Define a clean logging function for HF Space compatibility
+def log_info(message):
+    """Log information in a format compatible with Hugging Face Spaces"""
+    # Just use the logger, but ensure consistent formatting
+    logger.info(message)
+    # Also ensure output is flushed immediately for streaming
+    sys.stdout.flush()
+
 # Check for BitsAndBytes
 try:
     from transformers import BitsAndBytesConfig
@@ -142,6 +157,7 @@ def load_model_and_tokenizer(config):
         raise ValueError("Model name not found in configuration. Please check your transformers_config.json file.")
 
     logger.info("Using Unsloth optimizations with pre-quantized model")
+
     # Check for flash attention without importing it directly
     use_flash_attention = config.get("use_flash_attention", True)
     try:
@@ -153,11 +169,29 @@ def load_model_and_tokenizer(config):
 
     # First detect if we have a GPU
     if torch.cuda.is_available():
-        logger.info(f"CUDA available, found {torch.cuda.device_count()} GPU(s)")
-        device_map = "auto"
+        gpu_count = torch.cuda.device_count()
+        logger.info(f"CUDA available, found {gpu_count} GPU(s)")
+
+        # Log GPU info
+        for i in range(gpu_count):
+            logger.info(f"GPU {i}: {torch.cuda.get_device_name(i)}")
+            logger.info(f"Memory: {torch.cuda.get_device_properties(i).total_memory / 1024**3:.2f} GB")
+
+        # Create an optimized device map for better balance
+        if gpu_count > 1:
+            logger.info(f"Creating balanced device map for {gpu_count} GPUs")
+            # Use auto mapping but with memory tracking
+            device_map = "auto"
+            # Set max memory for better balancing
+            max_memory = {i: f"{int(torch.cuda.get_device_properties(i).total_memory * 0.85 / 1024**3)}GiB" for i in range(gpu_count)}
+            logger.info(f"Max memory settings: {max_memory}")
+        else:
+            device_map = "auto"
+            max_memory = None
     else:
         logger.warning("No CUDA available, falling back to CPU")
         device_map = {"": "cpu"}  # Force CPU placement
+        max_memory = None
 
     # Set default dtype for better numerics
     if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
@@ -175,11 +209,15 @@ def load_model_and_tokenizer(config):
 
     # Load model with proper error handling for out-of-memory
     try:
+        # Improved memory settings for multi-GPU setup
+        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
+
         model, tokenizer = FastLanguageModel.from_pretrained(
             model_name=model_name,
             max_seq_length=config.get("max_seq_length", 2048) or config.get("tokenizer", {}).get("max_seq_length", 2048),
             dtype=dtype,
             device_map=device_map,
+            max_memory=max_memory,
             # Don't explicitly use flash attention config here, let Unsloth handle it
         )
     except RuntimeError as e:
@@ -355,7 +393,7 @@ def format_phi_chat(messages, dataset_config):
 
         role = message.get("role", "").lower()
         content = message.get("content", "")
-        
+
         # Format based on role
         if role == "human" or role == "user":
             template = roles.get("user", roles.get("human", "Human: {content}\n\n"))
@@ -413,9 +451,9 @@ class SimpleDataCollator:
                         return_tensors=None,
                         add_generation_prompt=False
                     )
-                except:
+                except Exception as chat_error:
                     # Fallback if apply_chat_template fails
-                    logger.warning(f"Chat template application failed for example {paper_id}, using basic tokenization")
+                    logger.warning(f"Chat template application failed for example {paper_id}: {str(chat_error)[:100]}")
 
                     # Create a basic representation of the conversation
                     conversation_text = ""
@@ -494,6 +532,70 @@ class SimpleDataCollator:
 
         return batch
 
+class LoggingCallback(TrainerCallback):
+    def __init__(self):
+        self.last_log_time = time.time()
+        self.last_memory_log_time = time.time()
+
+    def on_step_end(self, args, state, control, **kwargs):
+        # Log every 50 steps or every 5 minutes, whichever comes first
+        current_time = time.time()
+
+        # Log loss every 50 steps or 5 minutes
+        if (state.global_step % 50 == 0) or (current_time - self.last_log_time > 300):
+            if state.log_history:
+                loss = state.log_history[-1].get('loss', 'N/A')
+                # Use simple formatting for better HF Space log compatibility
+                log_info(f"Step {state.global_step}: Loss {loss}")
+            else:
+                log_info(f"Step {state.global_step}: No loss data available")
+            self.last_log_time = current_time
+
+        # Log memory usage every 15 minutes
+        if current_time - self.last_memory_log_time > 900:  # 15 minutes
+            if torch.cuda.is_available():
+                memory_info = []
+                for i in range(torch.cuda.device_count()):
+                    allocated = torch.cuda.memory_allocated(i) / 1024**2
+                    reserved = torch.cuda.memory_reserved(i) / 1024**2
+                    memory_info.append(f"GPU {i}: {allocated:.1f}MB/{reserved:.1f}MB")
+
+                # Log in compact format for better visibility
+                log_info(f"Memory usage - {', '.join(memory_info)}")
+            self.last_memory_log_time = current_time
+
+    def on_train_begin(self, args, state, control, **kwargs):
+        log_info("=== Training is starting ===")
+
+        # Log important training parameters for visibility
+        log_info(f"Batch size: {args.per_device_train_batch_size} × {args.gradient_accumulation_steps} steps × {max(1, torch.cuda.device_count())} GPUs")
+        log_info(f"Learning rate: {args.learning_rate}")
+        log_info(f"Epochs: {args.num_train_epochs}")
+
+        # Log memory information in compact format
+        if torch.cuda.is_available():
+            memory_info = []
+            for i in range(torch.cuda.device_count()):
+                allocated = torch.cuda.memory_allocated(i) / 1024**2
+                max_mem = torch.cuda.max_memory_allocated(i) / 1024**2
+                memory_info.append(f"GPU {i}: {allocated:.1f}MB (max: {max_mem:.1f}MB)")
+
+            log_info(f"Initial memory usage - {', '.join(memory_info)}")
+
+    def on_train_end(self, args, state, control, **kwargs):
+        log_info("=== Training completed ===")
+        if torch.cuda.is_available():
+            memory_info = []
+            for i in range(torch.cuda.device_count()):
+                allocated = torch.cuda.memory_allocated(i) / 1024**2
+                max_mem = torch.cuda.max_memory_allocated(i) / 1024**2
+                memory_info.append(f"GPU {i}: {allocated:.1f}MB (max: {max_mem:.1f}MB)")
+
+            log_info(f"Final memory usage - {', '.join(memory_info)}")
+
+        log_info(f"Total steps: {state.global_step}")
+        log_info(f"Final loss: {state.log_history[-1].get('loss', 'N/A') if state.log_history else 'N/A'}")
+
 def check_dependencies():
     """Check if all required dependencies are installed."""
     missing_packages = []
@@ -525,7 +627,7 @@ def check_dependencies():
 
 def main():
     # Set up logging
-    logger.info("Starting training process")
+    log_info("Starting Phi-4 fine-tuning process")
 
     # Parse arguments
     args = parse_args()
@@ -566,8 +668,9 @@ def main():
         logger.error("Please ensure 'name' is specified under 'model' in transformers_config.json")
         return 1
 
-    logger.info(f"Model name: {model_config.get('model', {}).get('name') or model_config.get('model_name_or_path') or model_config.get('model_name')}")
-    logger.info("All configurations loaded successfully")
+    model_name = model_config.get("model", {}).get("name") or model_config.get("model_name_or_path") or model_config.get("model_name")
+    log_info(f"Using model: {model_name}")
+    log_info("All configurations loaded successfully")
 
     # Extract specific configs
     model_config = configs["transformers"]
@@ -582,11 +685,11 @@ def main():
 
     if per_device_batch_size and model_config.get("training"):
         model_config["training"]["per_device_train_batch_size"] = per_device_batch_size
-        logger.info(f"Applied hardware-specific batch size: {per_device_batch_size}")
+        log_info(f"Applied hardware-specific batch size: {per_device_batch_size}")
 
     if gradient_accumulation and model_config.get("training"):
         model_config["training"]["gradient_accumulation_steps"] = gradient_accumulation
-        logger.info(f"Applied hardware-specific gradient accumulation: {gradient_accumulation}")
+        log_info(f"Applied hardware-specific gradient accumulation: {gradient_accumulation}")
 
     # Apply memory optimizations
     memory_opts = training_opts.get("memory_optimizations", {})
@@ -600,28 +703,39 @@ def main():
     # Set random seed for reproducibility
     seed = model_config.get("seed", 42)
     set_seed(seed)
-    logger.info(f"Set random seed to {seed}")
+    log_info(f"Set random seed to {seed} for reproducibility")
 
     # Check CUDA and set environment variables for better memory management
     if torch.cuda.is_available():
         # Empty CUDA cache
         torch.cuda.empty_cache()
-        # Set memory management env vars (optional)
-        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
-        # Log memory information
+
+        # Set memory management env vars for better fragmentation handling
+        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128,expandable_segments:True"
+
+        # Log initial memory information in a compact form
+        gpu_info = []
         for i in range(torch.cuda.device_count()):
-            logger.info(f"GPU {i}: {torch.cuda.get_device_name(i)}")
-            logger.info(f"Memory Allocated: {torch.cuda.memory_allocated(i) / 1024**2:.2f} MB")
-            logger.info(f"Memory Reserved: {torch.cuda.memory_reserved(i) / 1024**2:.2f} MB")
+            name = torch.cuda.get_device_name(i)
+            allocated = torch.cuda.memory_allocated(i) / 1024**3
+            total = torch.cuda.get_device_properties(i).total_memory / 1024**3
+            gpu_info.append(f"GPU {i}: {name} ({allocated:.1f}GB/{total:.1f}GB)")
+
+        log_info(f"Hardware: {torch.cuda.device_count()} GPUs detected")
+        log_info(f"GPU details: {', '.join(gpu_info)}")
+    else:
+        log_info("No GPU detected, using CPU (training will be very slow)")
 
     try:
+        log_info("Loading model and tokenizer...")
         model, tokenizer = load_model_and_tokenizer(model_config)
-        logger.info("Model and tokenizer loaded successfully")
+        log_info("Model and tokenizer loaded successfully")
 
         # Load dataset with proper mapping
         try:
+            log_info(f"Loading dataset from {dataset_config.get('dataset', {}).get('name', '')}")
             dataset = load_dataset_with_mapping(dataset_config)
-            logger.info("Dataset loaded and prepared successfully")
+            log_info(f"Dataset loaded with {len(dataset)} examples")
         except Exception as e:
             logger.error(f"Error loading dataset: {e}")
             return 1
@@ -629,38 +743,64 @@ def main():
         # Create data collator
        data_collator = SimpleDataCollator(tokenizer, dataset_config)
 
-        # Simple logging callback
-        class LoggingCallback(TrainerCallback):
-            def __init__(self):
-                self.last_log_time = time.time()
-
-            def on_step_end(self, args, state, control, **kwargs):
-                # Log every 50 steps or every 5 minutes, whichever comes first
-                current_time = time.time()
-                if (state.global_step % 50 == 0) or (current_time - self.last_log_time > 300):
-                    logger.info(f"Step {state.global_step}: Loss {state.log_history[-1]['loss'] if state.log_history else 'N/A'}")
-                    self.last_log_time = current_time
-
-            def on_train_begin(self, args, state, control, **kwargs):
-                logger.info("Training is starting...")
-                # Log memory information
-                if torch.cuda.is_available():
-                    for i in range(torch.cuda.device_count()):
-                        logger.info(f"GPU {i} Memory Allocated: {torch.cuda.memory_allocated(i) / 1024**2:.2f} MB")
-
         # Verify precision settings - ensure only one of bf16/fp16 is set, with bf16 taking precedence
         use_bf16 = model_config.get("bf16", False) or model_config.get("torch_dtype", "") == "bfloat16"
         use_fp16 = model_config.get("fp16", False) and not use_bf16  # Only use fp16 if bf16 is not set
 
-        logger.info(f"Using precision: {'bf16' if use_bf16 else 'fp16' if use_fp16 else 'full precision'}")
+        log_info(f"Using precision: {'bf16' if use_bf16 else 'fp16' if use_fp16 else 'full precision'}")
+
+        # Get per device batch size - temporarily reduce if necessary for multi-GPU setup
+        per_device_batch_size = model_config.get("training", {}).get("per_device_train_batch_size", 24)
+        gradient_accumulation_steps = model_config.get("training", {}).get("gradient_accumulation_steps", 2)
+
+        # For multi-GPU setup, adjust for better balance
+        if torch.cuda.device_count() > 1:
+            log_info(f"Multi-GPU setup with {torch.cuda.device_count()} GPUs")
+            log_info(f"Training config: {per_device_batch_size} samples/GPU × {gradient_accumulation_steps} accumulation steps")
+
+        # Set up FSDP for multi-GPU training if available
+        fsdp_config = None
+        if torch.cuda.device_count() > 1:
+            try:
+                from torch.distributed.fsdp import (
+                    FullyShardedDataParallel as FSDP,
+                    MixedPrecision,
+                    BackwardPrefetch,
+                    ShardingStrategy,
+                    CPUOffload,
+                )
+                from torch.distributed.fsdp.wrap import (
+                    transformer_auto_wrap_policy,
+                    enable_wrap,
+                    wrap,
+                )
+
+                log_info("Using FSDP for distributed training")
+
+                # Configure FSDP
+                fsdp_config = {
+                    "fsdp_transformer_layer_cls_to_wrap": ["LlamaDecoderLayer"],
+                    "fsdp_offload_params": False,
+                    "fsdp_backward_prefetch": "BACKWARD_PRE",
+                    "fsdp_min_num_params": 1e6,
+                    "fsdp_sharding_strategy": 1,  # FULL_SHARD
+                }
+
+                if use_bf16 or use_fp16:
+                    precision_type = "bf16" if use_bf16 else "fp16"
+                    fsdp_config["fsdp_state_dict_type"] = "FULL_STATE_DICT"
+                    log_info(f"FSDP using mixed precision: {precision_type}")
+            except ImportError:
+                log_info("FSDP imports failed, falling back to standard DDP")
+                fsdp_config = None
 
         # Set up training arguments
-        logger.info("Setting up training arguments")
+        log_info("Setting up training arguments")
         training_args = TrainingArguments(
             output_dir=model_config.get("output_dir", "./results") or model_config.get("checkpointing", {}).get("output_dir", "./results"),
             num_train_epochs=model_config.get("training", {}).get("num_train_epochs", 3),
-            per_device_train_batch_size=model_config.get("training", {}).get("per_device_train_batch_size", 24),
-            gradient_accumulation_steps=model_config.get("training", {}).get("gradient_accumulation_steps", 2),
+            per_device_train_batch_size=per_device_batch_size,
+            gradient_accumulation_steps=gradient_accumulation_steps,
             learning_rate=model_config.get("training", {}).get("learning_rate", 2e-5),
             weight_decay=model_config.get("training", {}).get("weight_decay", 0.01),
             warmup_ratio=model_config.get("training", {}).get("warmup_ratio", 0.05),
@@ -682,15 +822,16 @@ def main():
             optim=model_config.get("training", {}).get("optim", "adamw_torch"),
             ddp_find_unused_parameters=False,  # Improve distributed training efficiency
             dataloader_drop_last=False,  # Process all examples
-            dataloader_num_workers=4,  # Sequential data loading
+            dataloader_num_workers=2,  # Reduced worker count
             no_cuda=False if torch.cuda.is_available() else True,  # Use CUDA if available
+            fsdp=fsdp_config,  # Add FSDP configuration if available
         )
 
         # Create sequential sampler to maintain original dataset order
         sequential_sampler = torch.utils.data.SequentialSampler(dataset)
 
         # Initialize trainer first
-        logger.info("Initializing Trainer")
+        log_info("Initializing Trainer")
         trainer = Trainer(
             model=model,
             args=training_args,
@@ -702,7 +843,7 @@ def main():
         # Then override the get_train_dataloader method
         def custom_get_train_dataloader():
             """Custom dataloader that preserves original dataset order"""
-            logger.info("Creating sequential dataloader to maintain original dataset order")
+            log_info("Creating sequential dataloader to maintain original dataset order")
 
             # Calculate batch size based on device availability
             if getattr(training_args, "no_cuda", False):
@@ -710,7 +851,7 @@ def main():
             else:
                 batch_size = max(training_args.per_device_train_batch_size * max(1, torch.cuda.device_count()), 1)
 
-            logger.info(f"Using sequential sampler with batch size {batch_size}")
+            log_info(f"Using sequential sampler with batch size {batch_size}")
 
             # Return DataLoader with sequential sampler
             return torch.utils.data.DataLoader(
@@ -727,28 +868,44 @@ def main():
         trainer.get_train_dataloader = custom_get_train_dataloader
 
         # Start training
-        logger.info("Starting training process")
+        log_info("=== Starting Training ===")
        try:
+            # Empty cache again right before training
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+                log_info("Cleared CUDA cache before training")
+
+            # Display compact training info
+            total_steps = int(len(dataset) / (per_device_batch_size * torch.cuda.device_count() * gradient_accumulation_steps) * training_args.num_train_epochs)
+            log_info(f"Training plan: {len(dataset)} examples over {training_args.num_train_epochs} epochs ≈ {total_steps} steps")
+
             trainer.train()
-            logger.info("Training completed successfully")
+            log_info("Training completed successfully!")
 
             # Save the final model
-            logger.info("Saving final model")
+            log_info("Saving final model...")
             trainer.save_model()
+            log_info(f"Model saved to {training_args.output_dir}")
 
             # Push to hub if enabled
             if model_config.get("huggingface_hub", {}).get("push_to_hub", False):
-                logger.info("Pushing model to Hugging Face Hub")
+                hub_id = model_config.get("huggingface_hub", {}).get("hub_model_id", "model")
+                log_info(f"Pushing model to Hugging Face Hub as {hub_id}...")
                 trainer.push_to_hub()
+                log_info("Model successfully pushed to Hub")
 
             return 0
        except Exception as e:
            logger.error(f"Training failed with error: {str(e)}")
-            # Log CUDA memory info if available
+            # Log CUDA memory info if available in compact format
            if torch.cuda.is_available():
+                memory_info = []
                for i in range(torch.cuda.device_count()):
-                    logger.info(f"GPU {i} Memory Allocated: {torch.cuda.memory_allocated(i) / 1024**2:.2f} MB")
-                    logger.info(f"GPU {i} Memory Reserved: {torch.cuda.memory_reserved(i) / 1024**2:.2f} MB")
+                    allocated = torch.cuda.memory_allocated(i) / 1024**2
+                    reserved = torch.cuda.memory_reserved(i) / 1024**2
+                    max_mem = torch.cuda.max_memory_allocated(i) / 1024**2
+                    memory_info.append(f"GPU {i}: {allocated:.1f}MB/{reserved:.1f}MB (max: {max_mem:.1f}MB)")
+                logger.error(f"GPU memory at failure: {', '.join(memory_info)}")
            raise
 
    except Exception as e:
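The new log_info helper plus the WARNING-level caps are the core of the logging change above. A self-contained sketch of the same pattern, runnable on its own, showing that library INFO messages are suppressed while the script's own messages still stream (the logger names are the ones set in the diff; the demo messages at the end are illustrative only):

import logging
import sys

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger("run_transformers_training")

# Cap noisy third-party loggers, as the diff does
for noisy in ("transformers", "datasets", "accelerate", "torch", "bitsandbytes"):
    logging.getLogger(noisy).setLevel(logging.WARNING)

def log_info(message):
    """Log and flush immediately so streamed HF Space logs appear without buffering delay."""
    logger.info(message)
    sys.stdout.flush()

log_info("visible: script-level info message")
logging.getLogger("transformers").info("suppressed: library info message")
logging.getLogger("transformers").warning("visible: library warning")

The diff does not show where the module-level LoggingCallback is attached to the Trainer; with the standard transformers API that would normally happen via the Trainer's callbacks argument.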
temp_function_fixes.py ADDED
@@ -0,0 +1,230 @@
+def format_phi_chat(messages, dataset_config):
+    """Format messages according to phi-4's chat template and dataset config."""
+    formatted_chat = ""
+
+    # Get role templates from config
+    roles = dataset_config.get("data_formatting", {}).get("roles", {
+        "system": "System: {content}\n\n",
+        "human": "Human: {content}\n\n",
+        "user": "Human: {content}\n\n",
+        "assistant": "Assistant: {content}\n\n"
+    })
+
+    # Handle research introduction metadata first
+    metadata = next((msg for msg in messages if isinstance(msg, dict) and
+                     "[RESEARCH INTRODUCTION]" in msg.get("content", "")), None)
+    if metadata:
+        system_template = roles.get("system", "System: {content}\n\n")
+        formatted_chat = system_template.format(content=metadata['content'])
+        messages = [msg for msg in messages if msg != metadata]
+
+    # Process remaining messages
+    for message in messages:
+        if not isinstance(message, dict) or "content" not in message:
+            logger.warning(f"Skipping invalid message format: {message}")
+            continue
+
+        role = message.get("role", "").lower()
+        content = message.get("content", "")
+
+        # Format based on role
+        if role == "human" or role == "user":
+            template = roles.get("user", roles.get("human", "Human: {content}\n\n"))
+            formatted_chat += template.format(content=content)
+        elif role == "assistant" or role == "bot":
+            template = roles.get("assistant", "Assistant: {content}\n\n")
+            formatted_chat += template.format(content=content)
+        elif role == "system":
+            # For system messages, prepend them
+            template = roles.get("system", "System: {content}\n\n")
+            formatted_chat = template.format(content=content) + formatted_chat
+        else:
+            # Default to system for unknown roles
+            logger.warning(f"Unknown role '{role}' - treating as system message")
+            template = roles.get("system", "System: {content}\n\n")
+            formatted_chat += template.format(content=content)
+
+    return formatted_chat.strip()
+
+class SimpleDataCollator:
+    def __init__(self, tokenizer, dataset_config):
+        self.tokenizer = tokenizer
+        self.dataset_config = dataset_config
+        self.stats = {"processed": 0, "skipped": 0, "total_tokens": 0}
+        self.pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
+        self.max_seq_length = dataset_config.get("dataset", {}).get("processing", {}).get("max_seq_length", 2048)
+        logger.info(f"SimpleDataCollator initialized - using pre-audited dataset with max_seq_length={self.max_seq_length}")
+        logger.info("Using exact dataset structure without reformatting")
+
+        # Check if we're on GPU
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        logger.info(f"SimpleDataCollator using device: {self.device}")
+
+    def __call__(self, features):
+        """Process examples preserving exact JSONL structure"""
+        batch = {"input_ids": [], "attention_mask": [], "labels": []}
+
+        for example in features:
+            try:
+                # Get ID
+                paper_id = example.get("id", "")
+
+                # Get conversations - these should already contain role and content
+                conversations = example.get("conversations", [])
+                if not conversations:
+                    self.stats["skipped"] += 1
+                    continue
+
+                # Directly use the conversations array as input to the model's chat template
+                # This preserves the exact structure with roles and content as they are
+                try:
+                    # Let tokenizer handle the content with the model's chat template
+                    inputs = self.tokenizer.apply_chat_template(
+                        conversations,
+                        return_tensors=None,
+                        add_generation_prompt=False
+                    )
+                except Exception as chat_error:
+                    # Fallback if apply_chat_template fails
+                    logger.warning(f"Chat template application failed for example {paper_id}: {str(chat_error)[:100]}")
+
+                    # Create a basic representation of the conversation
+                    conversation_text = ""
+                    for msg in conversations:
+                        if isinstance(msg, dict) and 'content' in msg:
+                            conversation_text += msg.get('content', '') + "\n\n"
+
+                    # Basic tokenization
+                    inputs = self.tokenizer(
+                        conversation_text,
+                        add_special_tokens=True,
+                        return_tensors=None
+                    )
+
+                # Apply length cap if needed (shouldn't be necessary for pre-audited data)
+                if self.max_seq_length > 0 and len(inputs) > self.max_seq_length:
+                    logger.warning(f"Example {paper_id} exceeds max_seq_length ({len(inputs)} > {self.max_seq_length})")
+                    inputs = inputs[:self.max_seq_length]
+
+                # Create attention mask (1 for all tokens)
+                attention_mask = [1] * len(inputs)
+
+                if len(inputs) > 0:
+                    # For causal language modeling, labels are the same as inputs
+                    labels = inputs.copy()
+
+                    batch["input_ids"].append(inputs)
+                    batch["attention_mask"].append(attention_mask)
+                    batch["labels"].append(labels)
+
+                    self.stats["processed"] += 1
+                    self.stats["total_tokens"] += len(inputs)
+
+                    # Debug logging for first few examples
+                    log_samples = self.dataset_config.get("validation", {}).get("log_samples", 3)
+                    if self.stats["processed"] <= log_samples:
+                        logger.info(f"Example {self.stats['processed']}:")
+                        logger.info(f"Paper ID: {paper_id}")
+                        logger.info(f"Token count: {len(inputs)}")
+                        logger.info(f"Conversation entries: {len(conversations)}")
+                else:
+                    self.stats["skipped"] += 1
+            except Exception as e:
+                logger.warning(f"Error processing example: {str(e)[:100]}...")
+                logger.warning(f"Problematic example ID: {example.get('id', 'unknown')}")
+                self.stats["skipped"] += 1
+                continue
+
+        if not batch["input_ids"]:
+            logger.warning("Empty batch, returning dummy tensors")
+            return {
+                "input_ids": torch.zeros((1, 1), dtype=torch.long),
+                "attention_mask": torch.zeros((1, 1), dtype=torch.long),
+                "labels": torch.zeros((1, 1), dtype=torch.long)
+            }
+
+        # Pad the batch
+        max_length = max(len(ids) for ids in batch["input_ids"])
+
+        for i in range(len(batch["input_ids"])):
+            padding_length = max_length - len(batch["input_ids"][i])
+            if padding_length > 0:
+                batch["input_ids"][i].extend([self.pad_token_id] * padding_length)
+                batch["attention_mask"][i].extend([0] * padding_length)
+                batch["labels"][i].extend([-100] * padding_length)
+
+        # Convert to tensors
+        batch = {k: torch.tensor(v, dtype=torch.long) for k, v in batch.items()}
+
+        # Log stats periodically
+        log_interval = self.dataset_config.get("validation", {}).get("log_interval", 100)
+        if self.stats["processed"] % log_interval == 0 and self.stats["processed"] > 0:
+            logger.info(f"Data collator stats: processed={self.stats['processed']}, "
+                        f"skipped={self.stats['skipped']}, "
+                        f"avg_tokens={self.stats['total_tokens']/self.stats['processed']:.1f}")
+
+        return batch
+
+class LoggingCallback(TrainerCallback):
+    def __init__(self):
+        self.last_log_time = time.time()
+        self.last_memory_log_time = time.time()
+
+    def on_step_end(self, args, state, control, **kwargs):
+        # Log every 50 steps or every 5 minutes, whichever comes first
+        current_time = time.time()
+
+        # Log loss every 50 steps or 5 minutes
+        if (state.global_step % 50 == 0) or (current_time - self.last_log_time > 300):
+            if state.log_history:
+                loss = state.log_history[-1].get('loss', 'N/A')
+                # Use simple formatting for better HF Space log compatibility
+                log_info(f"Step {state.global_step}: Loss {loss}")
+            else:
+                log_info(f"Step {state.global_step}: No loss data available")
+            self.last_log_time = current_time
+
+        # Log memory usage every 15 minutes
+        if current_time - self.last_memory_log_time > 900:  # 15 minutes
+            if torch.cuda.is_available():
+                memory_info = []
+                for i in range(torch.cuda.device_count()):
+                    allocated = torch.cuda.memory_allocated(i) / 1024**2
+                    reserved = torch.cuda.memory_reserved(i) / 1024**2
+                    memory_info.append(f"GPU {i}: {allocated:.1f}MB/{reserved:.1f}MB")
+
+                # Log in compact format for better visibility
+                log_info(f"Memory usage - {', '.join(memory_info)}")
+            self.last_memory_log_time = current_time
+
+    def on_train_begin(self, args, state, control, **kwargs):
+        log_info("=== Training is starting ===")
+
+        # Log important training parameters for visibility
+        log_info(f"Batch size: {args.per_device_train_batch_size} × {args.gradient_accumulation_steps} steps × {max(1, torch.cuda.device_count())} GPUs")
+        log_info(f"Learning rate: {args.learning_rate}")
+        log_info(f"Epochs: {args.num_train_epochs}")
+
+        # Log memory information in compact format
+        if torch.cuda.is_available():
+            memory_info = []
+            for i in range(torch.cuda.device_count()):
+                allocated = torch.cuda.memory_allocated(i) / 1024**2
+                max_mem = torch.cuda.max_memory_allocated(i) / 1024**2
+                memory_info.append(f"GPU {i}: {allocated:.1f}MB (max: {max_mem:.1f}MB)")
+
+            log_info(f"Initial memory usage - {', '.join(memory_info)}")
+
+    def on_train_end(self, args, state, control, **kwargs):
+        log_info("=== Training completed ===")
+        if torch.cuda.is_available():
+            memory_info = []
+            for i in range(torch.cuda.device_count()):
+                allocated = torch.cuda.memory_allocated(i) / 1024**2
+                max_mem = torch.cuda.max_memory_allocated(i) / 1024**2
+                memory_info.append(f"GPU {i}: {allocated:.1f}MB (max: {max_mem:.1f}MB)")
+
+            log_info(f"Final memory usage - {', '.join(memory_info)}")
+
+        log_info(f"Total steps: {state.global_step}")
+        log_info(f"Final loss: {state.log_history[-1].get('loss', 'N/A') if state.log_history else 'N/A'}")
transformers_config.json CHANGED
@@ -13,8 +13,8 @@
     },
 
     "training": {
-        "per_device_train_batch_size": 24,
-        "gradient_accumulation_steps": 2,
+        "per_device_train_batch_size": 16,
+        "gradient_accumulation_steps": 3,
         "learning_rate": 2e-5,
         "num_train_epochs": 3,
         "max_steps": -1,
@@ -67,7 +67,7 @@
             "offload_params": false
         },
         "ddp_find_unused_parameters": false,
-        "dataloader_num_workers": 4
+        "dataloader_num_workers": 2
     },
 
     "logging": {