{
  "model": {
    "name": "unsloth/phi-4-unsloth-bnb-4bit",
    "trust_remote_code": true,
    "use_fast_tokenizer": true
  },
  "tokenizer": {
    "chat_template": "phi",
    "max_seq_length": 2048,
    "padding_side": "right",
    "add_eos_token": true
  },
  "training": {
    "per_device_train_batch_size": 16,
    "gradient_accumulation_steps": 3,
    "learning_rate": 2e-5,
    "num_train_epochs": 3,
    "max_steps": -1,
    "logging_steps": 10,
    "save_steps": 200,
    "save_total_limit": 5,
    "push_to_hub": true,
    "hub_strategy": "every_save",
    "gradient_checkpointing": true,
    "optim": "adamw_torch",
    "lr_scheduler_type": "cosine",
    "warmup_ratio": 0.05,
    "weight_decay": 0.01,
    "max_grad_norm": 1.0,
    "neftune_noise_alpha": 5,
    "fp16": false,
    "bf16": true
  },
  "checkpointing": {
    "output_dir": "./results",
    "save_strategy": "steps",
    "save_steps": 100,
    "save_total_limit": 3,
    "hub_strategy": "every_save"
  },
  "unsloth": {
    "enabled": true,
    "r": 32,
    "alpha": 16,
    "dropout": 0,
    "target_modules": [
      "q_proj",
      "k_proj",
      "v_proj",
      "o_proj",
      "gate_proj",
      "up_proj",
      "down_proj"
    ]
  },
  "distributed_training": {
    "fsdp_config": {
      "enabled": false,
      "sharding_strategy": "FULL_SHARD",
      "mixed_precision": "BF16",
      "activation_checkpointing": true,
      "offload_params": false
    },
    "ddp_find_unused_parameters": false,
    "dataloader_num_workers": 2,
    "ddp_config": {
      "enabled": true,
      "backend": "nccl",
      "find_unused_parameters": false,
      "broadcast_buffers": false,
      "gradient_as_bucket_view": true
    }
  },
  "logging": {
    "logging_steps": 50,
    "log_level": "info"
  },
  "huggingface_hub": {
    "push_to_hub": true,
    "hub_model_id": "phi-4-cognitive-assistant",
    "hub_private_repo": true
  },
  "model_name_or_path": "unsloth/phi-4-unsloth-bnb-4bit",
  "model_revision": "main",
  "use_flash_attention": true,
  "torch_dtype": "bfloat16",
  "bf16": true,
  "fp16": false,
  "hardware": {
    "hardware_name": "4xL4",
    "specs": {
      "gpu_count": 4,
      "gpu_type": "L4",
      "vram_per_gpu": 24,
      "total_vram": 96,
      "vcpu_count": 48,
      "ram": 186
    },
    "hardware_setup": {
      "use_cpu": false,
      "num_gpus": 4,
      "device_map": "auto"
    },
    "training_optimizations": {
      "per_device_batch_size": 16,
      "gradient_accumulation_steps": 3,
      "mixed_precision": "bf16",
      "torch_compile": false,
      "memory_optimizations": {
        "use_gradient_checkpointing": true,
        "use_flash_attention": true
      },
      "multi_gpu_strategy": "ddp"
    },
    "system_settings": {
      "cuda_memory_fraction": 0.75,
      "dataloader_num_workers": 4,
      "dataloader_pin_memory": true
    },
    "memory_breakdown": {
      "model_size": "~3.5GB (pre-quantized 4-bit)",
      "optimizer_states": "~1GB",
      "batch_memory_per_gpu": "~3GB",
      "peak_memory_estimate": "~18GB",
      "safe_headroom": "~6GB"
    },
    "compute_environment": "L4_CLOUD"
  },
  "dataset": {
    "dataset": {
      "name": "George-API/phi4-cognitive-dataset",
      "split": "train"
    },
    "data_formatting": {
      "chat_template": "phi",
      "conversation_structure": {
        "system_identifier": "[RESEARCH INTRODUCTION]",
        "turn_order": [
          "human",
          "assistant"
        ]
      },
      "roles": {
        "system": "System: {content}\n\n",
        "human": "Human: {content}\n\n",
        "assistant": "Assistant: {content}\n\n"
      }
    },
    "data_loading": {
      "batch_size": 24,
      "shuffle": false,
      "sequential_processing": true,
      "drop_last": false,
      "num_workers": 4,
      "pin_memory": true,
      "prefetch_factor": 4
    },
    "validation": {
      "log_samples": 3,
      "log_interval": 50,
      "verify_sequence_integrity": true,
      "metrics": [
        "processed",
        "skipped",
        "avg_tokens",
        "unique_articles"
      ]
    }
  }
}