import os
import sys
import json
import logging
import subprocess
from datetime import datetime

import gradio as gr

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger(__name__)

# Configuration paths
CONFIG_DIR = "."
TRANSFORMERS_CONFIG = os.path.join(CONFIG_DIR, "transformers_config.json")
HARDWARE_CONFIG = os.path.join(CONFIG_DIR, "hardware_config.json")
DATASET_CONFIG = os.path.join(CONFIG_DIR, "dataset_config.json")
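
# The loaders below expect JSON shaped like the keys read in display_config().
# A minimal sketch of transformers_config.json; the values shown are
# illustrative placeholders, not the project's real settings:
#
#   {
#     "model":    {"name": "..."},
#     "training": {
#       "per_device_train_batch_size": 2,
#       "gradient_accumulation_steps": 4,
#       "learning_rate": 2e-5,
#       "num_train_epochs": 3
#     }
#   }
#
# hardware_config.json is read via its "specs" (gpu_count, gpu_type) and
# "memory_optimization" sections, and dataset_config.json via "dataset.name".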

def load_config(config_path):
    """Load configuration from a JSON file."""
    try:
        if os.path.exists(config_path):
            with open(config_path, "r") as f:
                return json.load(f)
        else:
            logger.error(f"Config file not found: {config_path}")
            return None
    except Exception as e:
        logger.error(f"Error loading config: {str(e)}")
        return None

def display_config():
    """Display the current training configuration."""
    transformers_config = load_config(TRANSFORMERS_CONFIG)
    hardware_config = load_config(HARDWARE_CONFIG)
    dataset_config = load_config(DATASET_CONFIG)

    if not all([transformers_config, hardware_config, dataset_config]):
        return "Error loading configuration files."

    # Extract key parameters
    model_name = transformers_config.get("model", {}).get("name", "")
    dataset_name = dataset_config.get("dataset", {}).get("name", "")
    batch_size = transformers_config.get("training", {}).get("per_device_train_batch_size", 0)
    gradient_accum = transformers_config.get("training", {}).get("gradient_accumulation_steps", 0)
    lr = transformers_config.get("training", {}).get("learning_rate", 0)
    epochs = transformers_config.get("training", {}).get("num_train_epochs", 0)
    gpu_count = hardware_config.get("specs", {}).get("gpu_count", 0)
    gpu_type = hardware_config.get("specs", {}).get("gpu_type", "")
    flash_attention = hardware_config.get("memory_optimization", {}).get("use_flash_attention", False)
    gradient_checkpointing = hardware_config.get("memory_optimization", {}).get("use_gradient_checkpointing", False)

    config_info = f"""
## Current Training Configuration

**Model**: {model_name}

**Dataset**: {dataset_name}

**Training Parameters**:
- Learning Rate: {lr}
- Epochs: {epochs}
- Batch Size/GPU: {batch_size}
- Gradient Accumulation: {gradient_accum}
- Effective Batch Size: {batch_size * gradient_accum * gpu_count}

**Hardware**:
- GPUs: {gpu_count}x {gpu_type}
- Flash Attention: {flash_attention}
- Gradient Checkpointing: {gradient_checkpointing}

**Pre-quantized 4-bit Training**: Enabled
"""
    return config_info

def start_training():
    """Start the training process."""
    try:
        # Check if already running
        if os.path.exists("training.pid"):
            with open("training.pid", "r") as f:
                pid = f.read().strip()
            try:
                # Signal 0 probes the process without actually signalling it
                os.kill(int(pid), 0)
                return f"Training is already running with PID {pid}"
            except OSError:
                # Process not running, remove stale PID file
                os.remove("training.pid")

        # Start training in the background. An argv list (no shell) makes
        # process.pid the trainer's own PID rather than a wrapper shell's,
        # so the liveness checks above and below probe the right process.
        # The child inherits its own copy of the log file descriptor, so the
        # parent's handle going out of scope is harmless.
        log_file = open("training.log", "a")
        process = subprocess.Popen(
            [sys.executable, "run_transformers_training.py"],
            stdout=log_file,
            stderr=subprocess.STDOUT
        )

        # Save PID
        with open("training.pid", "w") as f:
            f.write(str(process.pid))

        # Log start time
        with open("training_history.log", "a") as f:
            f.write(f"{datetime.now().isoformat()}: Training started (PID: {process.pid})\n")

        return f"Training started with PID {process.pid}. Check status for updates."
    except Exception as e:
        return f"Error starting training: {str(e)}"

def check_training_status():
    """Check the status of training."""
    try:
        # Check if training is running
        if os.path.exists("training.pid"):
            with open("training.pid", "r") as f:
                pid = f.read().strip()
            try:
                # Signal 0 probes the process without actually signalling it
                os.kill(int(pid), 0)
                status = f"Training is running with PID {pid}"
            except OSError:
                status = "Training process has stopped"
                os.remove("training.pid")
        else:
            status = "No training process is currently running"

        # Get the last lines from the training log
        log_content = "No training log available"
        if os.path.exists("training.log"):
            with open("training.log", "r") as f:
                lines = f.readlines()
                log_content = "".join(lines[-20:]) if lines else "Log file is empty"

        return f"{status}\n\n**Recent Log:**\n```\n{log_content}\n```"
    except Exception as e:
        return f"Error checking status: {str(e)}"

# Create the Gradio interface
with gr.Blocks(title="Phi-4 Unsloth Training", theme=gr.themes.Soft(primary_hue="blue")) as app:
    gr.Markdown("# Phi-4 Unsloth 4-bit Training Interface")

    with gr.Tabs():
        with gr.TabItem("Configuration"):
            config_output = gr.Markdown(display_config())
            refresh_btn = gr.Button("Refresh Configuration")
            refresh_btn.click(fn=display_config, outputs=config_output)

        with gr.TabItem("Training Control"):
            gr.Markdown("## Training Management")

            with gr.Row():
                start_btn = gr.Button("Start Training", variant="primary")
                check_btn = gr.Button("Check Status")

            status_output = gr.Markdown("Click 'Check Status' to see training progress")
            start_btn.click(fn=start_training, outputs=status_output)
            check_btn.click(fn=check_training_status, outputs=status_output)

            # Auto-refresh the status every 30 seconds. A <script> tag placed
            # in gr.HTML is not executed when Gradio renders the component, so
            # a JavaScript click-loop never fires; polling through the load
            # event's `every` parameter is the working alternative (assumes a
            # Gradio version that accepts `every` on load events).
            app.load(fn=check_training_status, outputs=status_output, every=30)
with gr.TabItem("Help"): | |
gr.Markdown(""" | |
## Phi-4 Unsloth Training Help | |
This interface allows you to manage training of the Phi-4 model with Unsloth 4-bit optimizations. | |
### Installation | |
Before starting training, ensure all dependencies are installed: | |
```bash | |
pip install -r requirements.txt | |
``` | |
Critical packages: | |
- unsloth (>=2024.3) | |
- peft (>=0.9.0) | |
- transformers (>=4.36.0) | |
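
            If requirements.txt is unavailable, the same pins can be installed directly;
            the version specifiers below simply mirror the list above (adjust as needed):

            ```bash
            pip install "unsloth>=2024.3" "peft>=0.9.0" "transformers>=4.36.0"
            ```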

            ### Quick Start

            1. Review the configuration in the Configuration tab
            2. Click "Start Training" to begin the process
            3. Use "Check Status" to monitor progress

            ### Notes

            - Training uses the pre-quantized model `unsloth/phi-4-unsloth-bnb-4bit`
            - The process maintains paper order and handles metadata appropriately
            - Training progress is saved to the Hugging Face Hub at regular intervals

            ### Troubleshooting

            If training stops unexpectedly:
            - Check the logs for out-of-memory errors
            - Verify the VRAM usage on each GPU (see the snippet below)
            - Check for CUDA version compatibility
            - If you see an "Unsloth not available" error, run: `pip install unsloth>=2024.3 peft>=0.9.0`
""") | |

# Launch the app (queue enabled so the periodic status refresh can run)
if __name__ == "__main__":
    app.queue().launch()