import os
import sys
import json
import logging
import subprocess
from datetime import datetime

import gradio as gr

# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
handlers=[logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger(__name__)
# Configuration paths
CONFIG_DIR = "."
TRANSFORMERS_CONFIG = os.path.join(CONFIG_DIR, "transformers_config.json")
HARDWARE_CONFIG = os.path.join(CONFIG_DIR, "hardware_config.json")
DATASET_CONFIG = os.path.join(CONFIG_DIR, "dataset_config.json")
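
# Illustrative layout of the three config files, inferred from the keys read
# below (the values shown are placeholders, not the authoritative schema):
#   transformers_config.json: {"model": {"name": "unsloth/phi-4-unsloth-bnb-4bit"},
#                              "training": {"per_device_train_batch_size": 2,
#                                           "gradient_accumulation_steps": 4,
#                                           "learning_rate": 2e-5,
#                                           "num_train_epochs": 3}}
#   hardware_config.json:     {"specs": {"gpu_count": 1, "gpu_type": "A100"},
#                              "memory_optimization": {"use_flash_attention": true,
#                                                      "use_gradient_checkpointing": true}}
#   dataset_config.json:      {"dataset": {"name": "..."}}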
def load_config(config_path):
"""Load configuration from JSON file."""
try:
if os.path.exists(config_path):
with open(config_path, 'r') as f:
return json.load(f)
else:
logger.error(f"Config file not found: {config_path}")
return None
except Exception as e:
logger.error(f"Error loading config: {str(e)}")
return None
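
# Note: callers must handle the None return (missing or unreadable file);
# display_config() below guards against it with all([...]) for exactly that reason.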
def display_config():
"""Display current training configuration."""
transformers_config = load_config(TRANSFORMERS_CONFIG)
hardware_config = load_config(HARDWARE_CONFIG)
dataset_config = load_config(DATASET_CONFIG)
if not all([transformers_config, hardware_config, dataset_config]):
return "Error loading configuration files."
    # Extract key parameters
    training = transformers_config.get("training", {})
    specs = hardware_config.get("specs", {})
    mem_opt = hardware_config.get("memory_optimization", {})
    model_name = transformers_config.get("model", {}).get("name", "")
    dataset_name = dataset_config.get("dataset", {}).get("name", "")
    batch_size = training.get("per_device_train_batch_size", 0)
    gradient_accum = training.get("gradient_accumulation_steps", 0)
    lr = training.get("learning_rate", 0)
    epochs = training.get("num_train_epochs", 0)
    gpu_count = specs.get("gpu_count", 0)
    gpu_type = specs.get("gpu_type", "")
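    # Effective batch size = per-device batch × accumulation steps × GPU count:
    # the number of samples that contribute to each optimizer update overall.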
    config_info = f"""
## Current Training Configuration

**Model**: {model_name}

**Dataset**: {dataset_name}

**Training Parameters**:
- Learning Rate: {lr}
- Epochs: {epochs}
- Batch Size/GPU: {batch_size}
- Gradient Accumulation: {gradient_accum}
- Effective Batch Size: {batch_size * gradient_accum * gpu_count}

**Hardware**:
- GPUs: {gpu_count}x {gpu_type}
- Flash Attention: {mem_opt.get("use_flash_attention", False)}
- Gradient Checkpointing: {mem_opt.get("use_gradient_checkpointing", False)}

**Pre-quantized 4-bit Training**: Enabled
"""
return config_info
def start_training():
"""Start the training process."""
try:
# Check if already running
if os.path.exists("training.pid"):
with open("training.pid", "r") as f:
pid = f.read().strip()
try:
# Check if process is still running
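                # (signal 0 delivers no signal; os.kill raises OSError if the PID is gone)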
os.kill(int(pid), 0)
return f"Training is already running with PID {pid}"
except OSError:
# Process not running, remove stale PID file
os.remove("training.pid")
        # Start training in the background; Popen duplicates the log-file
        # descriptor, so the handle can be closed as soon as the child starts.
        with open("training.log", "a") as log_file:
            process = subprocess.Popen(
                [sys.executable, "run_transformers_training.py"],
                stdout=log_file,
                stderr=subprocess.STDOUT,
            )
# Save PID
with open("training.pid", "w") as f:
f.write(str(process.pid))
# Log start time
with open("training_history.log", "a") as f:
f.write(f"{datetime.now().isoformat()}: Training started (PID: {process.pid})\n")
return f"Training started with PID {process.pid}. Check status for updates."
except Exception as e:
return f"Error starting training: {str(e)}"
def check_training_status():
"""Check the status of training."""
try:
# Check if training is running
if os.path.exists("training.pid"):
with open("training.pid", "r") as f:
pid = f.read().strip()
try:
# Check if process is still running
os.kill(int(pid), 0)
status = f"Training is running with PID {pid}"
except OSError:
status = "Training process has stopped"
os.remove("training.pid")
else:
status = "No training process is currently running"
# Get last lines from training log
log_content = "No training log available"
if os.path.exists("training.log"):
with open("training.log", "r") as f:
lines = f.readlines()
log_content = "".join(lines[-20:]) if lines else "Log file is empty"
return f"{status}\n\n**Recent Log:**\n```\n{log_content}\n```"
except Exception as e:
return f"Error checking status: {str(e)}"
# Create the Gradio interface
with gr.Blocks(title="Phi-4 Unsloth Training", theme=gr.themes.Soft(primary_hue="blue")) as app:
gr.Markdown("# Phi-4 Unsloth 4-bit Training Interface")
with gr.Tabs():
with gr.TabItem("Configuration"):
config_output = gr.Markdown(display_config())
refresh_btn = gr.Button("Refresh Configuration")
refresh_btn.click(fn=display_config, outputs=config_output)
with gr.TabItem("Training Control"):
gr.Markdown("## Training Management")
with gr.Row():
start_btn = gr.Button("Start Training", variant="primary")
check_btn = gr.Button("Check Status")
status_output = gr.Markdown("Click 'Check Status' to see training progress")
start_btn.click(fn=start_training, outputs=status_output)
check_btn.click(fn=check_training_status, outputs=status_output)
# Auto-refresh status
gr.HTML('''
<script>
let intervalId;
document.addEventListener('DOMContentLoaded', function() {
// Find the "Check Status" button
const buttons = Array.from(document.querySelectorAll('button'));
const checkBtn = buttons.find(btn => btn.textContent.includes('Check Status'));
// Set up interval to click the button every 30 seconds
if (checkBtn) {
intervalId = setInterval(() => {
checkBtn.click();
}, 30000);
}
});
// Clean up on tab/window close
window.addEventListener('beforeunload', function() {
clearInterval(intervalId);
});
</script>
''')
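            # Note: gr.HTML injects content via innerHTML in many Gradio
            # versions, and innerHTML does not execute <script> tags, so the
            # auto-refresh above may be inert. A hedged alternative (assumes a
            # Gradio release that supports `every=` on load events):
            #     app.load(fn=check_training_status, outputs=status_output, every=30)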
with gr.TabItem("Help"):
gr.Markdown("""
## Phi-4 Unsloth Training Help
This interface allows you to manage training of the Phi-4 model with Unsloth 4-bit optimizations.
### Installation
Before starting training, ensure all dependencies are installed:
```bash
pip install -r requirements.txt
```
Critical packages (a version check snippet follows the list):
- unsloth (>=2024.3)
- peft (>=0.9.0)
- transformers (>=4.36.0)
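
A quick, optional sanity check of the installed versions (standard library only):

```python
from importlib.metadata import PackageNotFoundError, version

for pkg in ("unsloth", "peft", "transformers"):
    try:
        print(pkg, version(pkg))
    except PackageNotFoundError:
        print(pkg, "NOT INSTALLED")
```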
### Quick Start
1. Review the configuration in the Configuration tab
2. Click "Start Training" to begin the process
3. Use "Check Status" to monitor progress
### Notes
- Training uses the pre-quantized model `unsloth/phi-4-unsloth-bnb-4bit`
- The process maintains paper order and handles metadata appropriately
- Training progress is saved to the Hugging Face Hub at regular intervals
### Troubleshooting
If training stops unexpectedly:
- Check the logs for out-of-memory errors
- Verify the VRAM usage on each GPU (see the snippet below)
- Check for CUDA version compatibility
- If you see "Unsloth not available" error, run: `pip install unsloth>=2024.3 peft>=0.9.0`
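
One way to inspect free VRAM per GPU (a minimal sketch; assumes PyTorch with CUDA):

```python
import torch

for i in range(torch.cuda.device_count()):
    free, total = torch.cuda.mem_get_info(i)
    print(f"GPU {i}: {free / 1e9:.1f} GB free of {total / 1e9:.1f} GB")
```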
""")
# Launch the app
if __name__ == "__main__":
app.launch()