Spaces:

Proximile
/

Gemma3-Chat

Running

App Files Files Community

ProximileAdmin committed 27 days ago

Commit

3626d35

verified ·

1 Parent(s): 9aea266

Update app.py

Browse files

Files changed (1) hide show

app.py +323 -201

app.py CHANGED Viewed

@@ -1,10 +1,10 @@
 #!/usr/bin/env python3
 """
-Gradio Interface for Multimodal Chat with SSH Tunnel Keepalive and API Fallback
 This application provides a Gradio web interface for multimodal chat with a
-local vLLM model. It establishes an SSH tunnel to a local vLLM server and
-provides fallback to Hyperbolic API if that server is unavailable.
 """
 import os
@@ -13,6 +13,7 @@ import threading
 import logging
 import base64
 import json
 from io import BytesIO
 import gradio as gr
 from openai import OpenAI
@@ -31,7 +32,9 @@ SSH_PORT = int(os.environ.get('SSH_PORT', 22))
 SSH_USERNAME = os.environ.get('SSH_USERNAME')
 SSH_PASSWORD = os.environ.get('SSH_PASSWORD')
 REMOTE_PORT = int(os.environ.get('REMOTE_PORT', 8000))  # vLLM API port on remote machine
-LOCAL_PORT = int(os.environ.get('LOCAL_PORT', 8020))  # Local forwarded port
 VLLM_MODEL = os.environ.get('MODEL_NAME', 'google/gemma-3-27b-it')
 HYPERBOLIC_KEY = os.environ.get('HYPERBOLIC_XYZ_KEY')
 FALLBACK_MODEL = 'Qwen/Qwen2.5-VL-72B-Instruct'  # Fallback model at Hyperbolic
@@ -42,27 +45,36 @@ MAX_CONCURRENT = int(os.environ.get('MAX_CONCURRENT', 3))  # Default to 3 concur
 # API endpoints
 VLLM_ENDPOINT = "http://localhost:" + str(LOCAL_PORT) + "/v1"
 HYPERBOLIC_ENDPOINT = "https://api.hyperbolic.xyz/v1"
 # Global variables
-tunnel = None
 use_fallback = False  # Whether to use fallback API instead of local vLLM
-tunnel_status = {"is_running": False, "message": "Initializing tunnel..."}
-def start_ssh_tunnel():
     """
-    Start the SSH tunnel and monitor its status.
     """
-    global tunnel, use_fallback, tunnel_status
     if not all([SSH_HOST, SSH_USERNAME, SSH_PASSWORD]):
         logger.error("Missing SSH connection details. Falling back to Hyperbolic API.")
         use_fallback = True
-        tunnel_status = {"is_running": False, "message": "Missing SSH credentials"}
         return
     try:
-        logger.info("Starting SSH tunnel...")
-        tunnel = SSHTunnel(
             ssh_host=SSH_HOST,
             ssh_port=SSH_PORT,
             username=SSH_USERNAME,
@@ -73,19 +85,41 @@ def start_ssh_tunnel():
             keep_alive_interval=15
         )
-        if tunnel.start():
-            logger.info("SSH tunnel started successfully")
-            use_fallback = False
-            tunnel_status = {"is_running": True, "message": "Connected"}
         else:
-            logger.warning("Failed to start SSH tunnel. Falling back to Hyperbolic API.")
             use_fallback = True
-            tunnel_status = {"is_running": False, "message": "Connection failed"}
     except Exception as e:
-        logger.error(f"Error starting SSH tunnel: {str(e)}")
         use_fallback = True
-        tunnel_status = {"is_running": False, "message": "Connection error"}
 def check_vllm_api_health():
     """
@@ -95,7 +129,6 @@ def check_vllm_api_health():
         tuple: (is_healthy, message)
     """
     try:
-        import requests
         response = requests.get(f"{VLLM_ENDPOINT}/models", timeout=5)
         if response.status_code == 200:
             try:
@@ -112,119 +145,84 @@ def check_vllm_api_health():
     except Exception as e:
         return False, f"API request failed: {str(e)}"
-def monitor_tunnel():
     """
-    Monitor the SSH tunnel status and update the global variables.
-    """
-    global tunnel, use_fallback, tunnel_status
-    logger.info("Starting tunnel monitoring thread")
-    while True:
-        try:
-            if tunnel is not None:
-                ssh_status = tunnel.check_status()
-                # Check if the tunnel is running
-                if ssh_status["is_running"]:
-                    # Check if vLLM API is actually responding
-                    is_healthy, message = check_vllm_api_health()
-                    if is_healthy:
-                        use_fallback = False
-                        tunnel_status = {
-                            "is_running": True,
-                            "message": f"Connected and healthy. {message}"
-                        }
-                    else:
-                        use_fallback = True
-                        tunnel_status = {
-                            "is_running": False,
-                            "message": "Tunnel connected but vLLM API unhealthy"
-                        }
-                else:
-                    # Log the actual error for troubleshooting but don't expose it in the UI
-                    logger.error(f"SSH tunnel disconnected: {ssh_status['error'] or 'Unknown error'}")
-                    use_fallback = True
-                    tunnel_status = {
-                        "is_running": False,
-                        "message": "Disconnected - Check server status"
-                    }
-            else:
-                use_fallback = True
-                tunnel_status = {"is_running": False, "message": "Tunnel not initialized"}
-        except Exception as e:
-            logger.error(f"Error monitoring tunnel: {str(e)}")
-            use_fallback = True
-            tunnel_status = {"is_running": False, "message": "Monitoring error"}
-        time.sleep(5)  # Check every 5 seconds
-def get_openai_client(use_fallback_api=None):
-    """
-    Create and return an OpenAI client configured for the appropriate endpoint.
-    Args:
-        use_fallback_api (bool): If True, use Hyperbolic API. If False, use local vLLM.
-                                 If None, use the global use_fallback setting.
     Returns:
-        OpenAI: Configured OpenAI client
     """
-    global use_fallback
-    # Determine which API to use
-    if use_fallback_api is None:
-        use_fallback_api = use_fallback
-    if use_fallback_api:
-        logger.info("Using Hyperbolic API")
-        return OpenAI(
-            api_key=HYPERBOLIC_KEY,
-            base_url=HYPERBOLIC_ENDPOINT
-        )
-    else:
-        logger.info("Using local vLLM API")
-        return OpenAI(
-            api_key="EMPTY",  # vLLM doesn't require an actual API key
-            base_url=VLLM_ENDPOINT
-        )
-def get_model_name(use_fallback_api=None):
     """
-    Return the appropriate model name based on the API being used.
-    Args:
-        use_fallback_api (bool): If True, use fallback model. If None, use the global setting.
     Returns:
-        str: Model name
     """
-    global use_fallback
-    if use_fallback_api is None:
-        use_fallback_api = use_fallback
-    return FALLBACK_MODEL if use_fallback_api else VLLM_MODEL
-def convert_files_to_base64(files):
     """
-    Convert uploaded files to base64 strings.
-    Args:
-        files (list): List of file paths
-    Returns:
-        list: List of base64-encoded strings
-    """
-    base64_images = []
-    for file in files:
-        with open(file, "rb") as image_file:
-            # Read image data and encode to base64
-            base64_data = base64.b64encode(image_file.read()).decode("utf-8")
-            base64_images.append(base64_data)
-    return base64_images
 def process_chat(message_dict, history):
     """
@@ -242,39 +240,27 @@ def process_chat(message_dict, history):
     text = message_dict.get("text", "")
     files = message_dict.get("files", [])
-    # Add user message to history first
     if not history:
         history = []
-    # Add user message to chat history
     if files:
-        # For each file, add a separate user message
         for file in files:
             history.append({"role": "user", "content": (file,)})
-    # Add text message if not empty
     if text.strip():
         history.append({"role": "user", "content": text})
     else:
-        # If no text but files exist, don't add an empty message
         if not files:
             history.append({"role": "user", "content": ""})
-    # Convert all files to base64
     base64_images = convert_files_to_base64(files)
-    # Prepare conversation history in OpenAI format
     openai_messages = []
-    # Convert history to OpenAI format
     for h in history:
         if h["role"] == "user":
-            # Handle user messages
             if isinstance(h["content"], tuple):
-                # This is a file-only message, skip for now
                 continue
             else:
-                # Text message
                 openai_messages.append({
                     "role": "user",
                     "content": h["content"]
@@ -285,21 +271,12 @@ def process_chat(message_dict, history):
                 "content": h["content"]
             })
-    # Handle images for the last user message if needed
     if base64_images:
-        # Update the last user message to include image content
         if openai_messages and openai_messages[-1]["role"] == "user":
-            # Get the last message
             last_msg = openai_messages[-1]
-            # Format for OpenAI multimodal content structure
             content_list = []
-            # Add text if there is any
             if last_msg["content"]:
                 content_list.append({"type": "text", "text": last_msg["content"]})
-            # Add images
             for img_b64 in base64_images:
                 content_list.append({
                     "type": "image_url",
@@ -307,37 +284,29 @@ def process_chat(message_dict, history):
                         "url": f"data:image/jpeg;base64,{img_b64}"
                     }
                 })
-            # Replace the content with the multimodal content list
             last_msg["content"] = content_list
-    # Try primary API first, fall back if needed
     try:
-        # First try with the currently selected API (vLLM or fallback)
         client = get_openai_client()
         model = get_model_name()
         response = client.chat.completions.create(
             model=model,
             messages=openai_messages,
-            stream=True  # Use streaming for better UX
         )
-        # Stream the response
         assistant_message = ""
         for chunk in response:
             if hasattr(chunk.choices[0].delta, 'content') and chunk.choices[0].delta.content is not None:
                 assistant_message += chunk.choices[0].delta.content
-                # Update in real-time
                 history_with_stream = history.copy()
                 history_with_stream.append({"role": "assistant", "content": assistant_message})
                 yield history_with_stream
-        # Ensure we have the final message added
         if not assistant_message:
             assistant_message = "No response received from the model."
-        # Add assistant response to history if not already added
         if not history or history[-1]["role"] != "assistant":
             history.append({"role": "assistant", "content": assistant_message})
@@ -345,8 +314,6 @@ def process_chat(message_dict, history):
     except Exception as primary_error:
         logger.error(f"Primary API error: {str(primary_error)}")
-        # If we're not already using fallback, try that
         if not use_fallback:
             try:
                 logger.info("Falling back to Hyperbolic API")
@@ -359,27 +326,21 @@ def process_chat(message_dict, history):
                     stream=True
                 )
-                # Stream the response
                 assistant_message = ""
                 for chunk in response:
                     if hasattr(chunk.choices[0].delta, 'content') and chunk.choices[0].delta.content is not None:
                         assistant_message += chunk.choices[0].delta.content
-                        # Update in real-time
                         history_with_stream = history.copy()
                         history_with_stream.append({"role": "assistant", "content": assistant_message})
                         yield history_with_stream
-                # Ensure we have the final message added
                 if not assistant_message:
                     assistant_message = "No response received from the fallback model."
-                # Add assistant response to history if not already added
                 if not history or history[-1]["role"] != "assistant":
                     history.append({"role": "assistant", "content": assistant_message})
-                # Update fallback status (global already declared at function start)
                 use_fallback = True
                 return history
             except Exception as fallback_error:
@@ -388,24 +349,195 @@ def process_chat(message_dict, history):
                 history.append({"role": "assistant", "content": error_msg})
                 return history
         else:
-            # Already using fallback, just report the error
             error_msg = "An error occurred with the model service."
             history.append({"role": "assistant", "content": error_msg})
             return history
 def get_tunnel_status_message():
     """
     Return a formatted status message for display in the UI.
     """
-    global tunnel_status, use_fallback, MAX_CONCURRENT
     api_mode = "Hyperbolic API" if use_fallback else "Local vLLM API"
     model = get_model_name()
-    status_color = "🟢" if (tunnel_status["is_running"] and not use_fallback) else "🔴"
-    status_text = tunnel_status["message"]
-    return f"{status_color} Tunnel Status: {status_text}\nCurrent API: {api_mode}\nCurrent Model: {model}\nConcurrent Requests: {MAX_CONCURRENT}"
 def toggle_api():
     """
@@ -413,10 +545,8 @@ def toggle_api():
     """
     global use_fallback
     use_fallback = not use_fallback
     api_mode = "Hyperbolic API" if use_fallback else "Local vLLM API"
     model = get_model_name()
     return f"Switched to {api_mode} using {model}"
 def update_concurrency(new_value):
@@ -434,29 +564,20 @@ def update_concurrency(new_value):
         value = int(new_value)
         if value < 1:
             return f"Error: Concurrency must be at least 1. Keeping current value: {MAX_CONCURRENT}"
         MAX_CONCURRENT = value
-        # Note: This only updates the value for future event handlers
-        # Existing event handlers keep their original concurrency_limit
-        # A page refresh is needed for this to fully take effect
         return f"Concurrency updated to {MAX_CONCURRENT}. You may need to refresh the page for all changes to take effect."
     except ValueError:
         return f"Error: Invalid number. Keeping current value: {MAX_CONCURRENT}"
-# Start the SSH tunnel in a background thread
 if __name__ == "__main__":
-    # Start the SSH tunnel
-    start_ssh_tunnel()
-    # Start the monitoring thread
-    monitor_thread = threading.Thread(target=monitor_tunnel, daemon=True)
     monitor_thread.start()
-    # Create Gradio application with Blocks for more control
     with gr.Blocks(theme="soft") as demo:
         gr.Markdown("# Multimodal Chat Interface")
-        # Create chatbot component with message type
         chatbot = gr.Chatbot(
             label="Conversation",
             type="messages",
@@ -465,7 +586,6 @@ if __name__ == "__main__":
             height=400
         )
-        # Create multimodal textbox for input
         with gr.Row():
             textbox = gr.MultimodalTextbox(
                 file_types=["image", "video"],
@@ -477,84 +597,86 @@ if __name__ == "__main__":
             )
             submit_btn = gr.Button("Send", size="sm", scale=1)
-        # Clear button
         clear_btn = gr.Button("Clear Chat")
-        # Set up submit event chain with concurrency limit
         submit_event = textbox.submit(
             fn=process_chat,
             inputs=[textbox, chatbot],
             outputs=chatbot,
-            concurrency_limit=MAX_CONCURRENT  # Set concurrency limit for this event
         ).then(
             fn=lambda: {"text": "", "files": []},
             inputs=None,
             outputs=textbox
         )
-        # Connect the submit button to the same functions with same concurrency limit
         submit_btn.click(
             fn=process_chat,
             inputs=[textbox, chatbot],
             outputs=chatbot,
-            concurrency_limit=MAX_CONCURRENT  # Set concurrency limit for this event
         ).then(
             fn=lambda: {"text": "", "files": []},
             inputs=None,
             outputs=textbox
         )
-        # Set up clear button
         clear_btn.click(lambda: [], None, chatbot)
-        # Load example images if they exist
-        examples = []
-        # Define example images with paths
         example_images = {
             "dog_pic.jpg": "What breed is this?",
             "ghostimg.png": "What's in this image?",
             "newspaper.png": "Provide a python list of dicts about everything on this page."
         }
-        # Check each image and add to examples if it exists
         for img_name, prompt_text in example_images.items():
             img_path = os.path.join(os.path.dirname(__file__), img_name)
             if os.path.exists(img_path):
                 examples.append([{"text": prompt_text, "files": [img_path]}])
-        # Add examples if we have any
         if examples:
             gr.Examples(
                 examples=examples,
                 inputs=textbox
             )
-        # Add status display
         status_text = gr.Textbox(
             label="Tunnel and API Status",
             value=get_tunnel_status_message(),
             interactive=False
         )
-        # Refresh status button and toggle API button
         with gr.Row():
             refresh_btn = gr.Button("Refresh Status")
-        # Set up refresh status button
         refresh_btn.click(
             fn=get_tunnel_status_message,
             inputs=None,
             outputs=status_text
         )
-        # Just load the initial status without auto-refresh
         demo.load(
             fn=get_tunnel_status_message,
             inputs=None,
             outputs=status_text
         )
-    # Launch the interface with the specified concurrency setting
     demo.queue(default_concurrency_limit=MAX_CONCURRENT)
     demo.launch()

 #!/usr/bin/env python3
 """
+Gradio Interface for Multimodal Chat with SSH Tunnel Keepalive, GPU Monitoring, and API Fallback
 This application provides a Gradio web interface for multimodal chat with a
+local vLLM model. It establishes SSH tunnels to a local vLLM server and
+the nvidia-smi monitoring endpoint, with fallback to Hyperbolic API if needed.
 """
 import os
 import logging
 import base64
 import json
+import requests
 from io import BytesIO
 import gradio as gr
 from openai import OpenAI
 SSH_USERNAME = os.environ.get('SSH_USERNAME')
 SSH_PASSWORD = os.environ.get('SSH_PASSWORD')
 REMOTE_PORT = int(os.environ.get('REMOTE_PORT', 8000))  # vLLM API port on remote machine
+LOCAL_PORT = int(os.environ.get('LOCAL_PORT', 8020))      # Local forwarded port
+GPU_REMOTE_PORT = 5000   # GPU monitoring endpoint on remote machine
+GPU_LOCAL_PORT = 5020    # Local forwarded port for GPU monitoring
 VLLM_MODEL = os.environ.get('MODEL_NAME', 'google/gemma-3-27b-it')
 HYPERBOLIC_KEY = os.environ.get('HYPERBOLIC_XYZ_KEY')
 FALLBACK_MODEL = 'Qwen/Qwen2.5-VL-72B-Instruct'  # Fallback model at Hyperbolic
 # API endpoints
 VLLM_ENDPOINT = "http://localhost:" + str(LOCAL_PORT) + "/v1"
 HYPERBOLIC_ENDPOINT = "https://api.hyperbolic.xyz/v1"
+GPU_JSON_ENDPOINT = "http://localhost:" + str(GPU_LOCAL_PORT) + "/gpu/json"
+GPU_TXT_ENDPOINT = "http://localhost:" + str(GPU_LOCAL_PORT) + "/gpu/txt"  # For backward compatibility
 # Global variables
+api_tunnel = None
+gpu_tunnel = None
 use_fallback = False  # Whether to use fallback API instead of local vLLM
+api_tunnel_status = {"is_running": False, "message": "Initializing API tunnel..."}
+gpu_tunnel_status = {"is_running": False, "message": "Initializing GPU monitoring tunnel..."}
+gpu_data = {"timestamp": "", "gpus": [], "processes": [], "success": False}
+gpu_monitor_thread = None
+gpu_monitor_running = False
+def start_ssh_tunnels():
     """
+    Start the SSH tunnels and monitor their status.
     """
+    global api_tunnel, gpu_tunnel, use_fallback, api_tunnel_status, gpu_tunnel_status
     if not all([SSH_HOST, SSH_USERNAME, SSH_PASSWORD]):
         logger.error("Missing SSH connection details. Falling back to Hyperbolic API.")
         use_fallback = True
+        api_tunnel_status = {"is_running": False, "message": "Missing SSH credentials"}
+        gpu_tunnel_status = {"is_running": False, "message": "Missing SSH credentials"}
         return
     try:
+        # Start API tunnel
+        logger.info("Starting API SSH tunnel...")
+        api_tunnel = SSHTunnel(
             ssh_host=SSH_HOST,
             ssh_port=SSH_PORT,
             username=SSH_USERNAME,
             keep_alive_interval=15
         )
+        if api_tunnel.start():
+            logger.info("API SSH tunnel started successfully")
+            api_tunnel_status = {"is_running": True, "message": "Connected"}
         else:
+            logger.warning("Failed to start API SSH tunnel. Falling back to Hyperbolic API.")
             use_fallback = True
+            api_tunnel_status = {"is_running": False, "message": "Connection failed"}
+        # Start GPU monitoring tunnel
+        logger.info("Starting GPU monitoring SSH tunnel...")
+        gpu_tunnel = SSHTunnel(
+            ssh_host=SSH_HOST,
+            ssh_port=SSH_PORT,
+            username=SSH_USERNAME,
+            password=SSH_PASSWORD,
+            remote_port=GPU_REMOTE_PORT,
+            local_port=GPU_LOCAL_PORT,
+            reconnect_interval=30,
+            keep_alive_interval=15
+        )
+        if gpu_tunnel.start():
+            logger.info("GPU monitoring SSH tunnel started successfully")
+            gpu_tunnel_status = {"is_running": True, "message": "Connected"}
+            # Start GPU monitoring
+            start_gpu_monitoring()
+        else:
+            logger.warning("Failed to start GPU monitoring SSH tunnel.")
+            gpu_tunnel_status = {"is_running": False, "message": "Connection failed"}
     except Exception as e:
+        logger.error(f"Error starting SSH tunnels: {str(e)}")
         use_fallback = True
+        api_tunnel_status = {"is_running": False, "message": "Connection error"}
+        gpu_tunnel_status = {"is_running": False, "message": "Connection error"}
 def check_vllm_api_health():
     """
         tuple: (is_healthy, message)
     """
     try:
         response = requests.get(f"{VLLM_ENDPOINT}/models", timeout=5)
         if response.status_code == 200:
             try:
     except Exception as e:
         return False, f"API request failed: {str(e)}"
+def fetch_gpu_info():
     """
+    Fetch GPU information from the remote server in JSON format.
     Returns:
+        dict: GPU information or error message
     """
+    global gpu_tunnel_status
+    try:
+        response = requests.get(GPU_JSON_ENDPOINT, timeout=5)
+        if response.status_code == 200:
+            return response.json()
+        else:
+            logger.warning(f"Error fetching GPU info: HTTP {response.status_code}")
+            return {
+                "success": False,
+                "error": f"HTTP Error: {response.status_code}",
+                "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
+                "gpus": [],
+                "processes": []
+            }
+    except Exception as e:
+        logger.warning(f"Error fetching GPU info: {str(e)}")
+        return {
+            "success": False,
+            "error": str(e),
+            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
+            "gpus": [],
+            "processes": []
+        }
+def fetch_gpu_text():
     """
+    Fetch raw nvidia-smi output from the remote server for backward compatibility.
     Returns:
+        str: nvidia-smi output or error message
     """
+    try:
+        response = requests.get(GPU_TXT_ENDPOINT, timeout=5)
+        if response.status_code == 200:
+            return response.text
+        else:
+            return f"Error fetching GPU info: HTTP {response.status_code}"
+    except Exception as e:
+        return f"Error fetching GPU info: {str(e)}"
+def start_gpu_monitoring():
     """
+    Start the GPU monitoring thread.
+    """
+    global gpu_monitor_thread, gpu_monitor_running, gpu_data
+    if gpu_monitor_running:
+        return
+    gpu_monitor_running = True
+    def monitor_loop():
+        global gpu_data
+        while gpu_monitor_running:
+            try:
+                gpu_data = fetch_gpu_info()
+            except Exception as e:
+                logger.error(f"Error in GPU monitoring loop: {str(e)}")
+                gpu_data = {
+                    "success": False,
+                    "error": str(e),
+                    "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
+                    "gpus": [],
+                    "processes": []
+                }
+            time.sleep(2)  # Update every 2 seconds
+    gpu_monitor_thread = threading.Thread(target=monitor_loop, daemon=True)
+    gpu_monitor_thread.start()
+    logger.info("GPU monitoring thread started")
 def process_chat(message_dict, history):
     """
     text = message_dict.get("text", "")
     files = message_dict.get("files", [])
     if not history:
         history = []
     if files:
         for file in files:
             history.append({"role": "user", "content": (file,)})
     if text.strip():
         history.append({"role": "user", "content": text})
     else:
         if not files:
             history.append({"role": "user", "content": ""})
     base64_images = convert_files_to_base64(files)
     openai_messages = []
     for h in history:
         if h["role"] == "user":
             if isinstance(h["content"], tuple):
                 continue
             else:
                 openai_messages.append({
                     "role": "user",
                     "content": h["content"]
                 "content": h["content"]
             })
     if base64_images:
         if openai_messages and openai_messages[-1]["role"] == "user":
             last_msg = openai_messages[-1]
             content_list = []
             if last_msg["content"]:
                 content_list.append({"type": "text", "text": last_msg["content"]})
             for img_b64 in base64_images:
                 content_list.append({
                     "type": "image_url",
                         "url": f"data:image/jpeg;base64,{img_b64}"
                     }
                 })
             last_msg["content"] = content_list
     try:
         client = get_openai_client()
         model = get_model_name()
         response = client.chat.completions.create(
             model=model,
             messages=openai_messages,
+            stream=True
         )
         assistant_message = ""
         for chunk in response:
             if hasattr(chunk.choices[0].delta, 'content') and chunk.choices[0].delta.content is not None:
                 assistant_message += chunk.choices[0].delta.content
                 history_with_stream = history.copy()
                 history_with_stream.append({"role": "assistant", "content": assistant_message})
                 yield history_with_stream
         if not assistant_message:
             assistant_message = "No response received from the model."
         if not history or history[-1]["role"] != "assistant":
             history.append({"role": "assistant", "content": assistant_message})
     except Exception as primary_error:
         logger.error(f"Primary API error: {str(primary_error)}")
         if not use_fallback:
             try:
                 logger.info("Falling back to Hyperbolic API")
                     stream=True
                 )
                 assistant_message = ""
                 for chunk in response:
                     if hasattr(chunk.choices[0].delta, 'content') and chunk.choices[0].delta.content is not None:
                         assistant_message += chunk.choices[0].delta.content
                         history_with_stream = history.copy()
                         history_with_stream.append({"role": "assistant", "content": assistant_message})
                         yield history_with_stream
                 if not assistant_message:
                     assistant_message = "No response received from the fallback model."
                 if not history or history[-1]["role"] != "assistant":
                     history.append({"role": "assistant", "content": assistant_message})
                 use_fallback = True
                 return history
             except Exception as fallback_error:
                 history.append({"role": "assistant", "content": error_msg})
                 return history
         else:
             error_msg = "An error occurred with the model service."
             history.append({"role": "assistant", "content": error_msg})
             return history
+def monitor_tunnels():
+    """
+    Monitor the SSH tunnels status and update the global variables.
+    """
+    global api_tunnel, gpu_tunnel, use_fallback, api_tunnel_status, gpu_tunnel_status
+    logger.info("Starting tunnel monitoring thread")
+    while True:
+        try:
+            if api_tunnel is not None:
+                ssh_status = api_tunnel.check_status()
+                if ssh_status["is_running"]:
+                    is_healthy, message = check_vllm_api_health()
+                    if is_healthy:
+                        use_fallback = False
+                        api_tunnel_status = {
+                            "is_running": True,
+                            "message": f"Connected and healthy. {message}"
+                        }
+                    else:
+                        use_fallback = True
+                        api_tunnel_status = {
+                            "is_running": False,
+                            "message": "Tunnel connected but vLLM API unhealthy"
+                        }
+                else:
+                    logger.error(f"API SSH tunnel disconnected: {ssh_status.get('error', 'Unknown error')}")
+                    use_fallback = True
+                    api_tunnel_status = {
+                        "is_running": False,
+                        "message": "Disconnected - Check server status"
+                    }
+            else:
+                use_fallback = True
+                api_tunnel_status = {"is_running": False, "message": "Tunnel not initialized"}
+            if gpu_tunnel is not None:
+                ssh_status = gpu_tunnel.check_status()
+                if ssh_status["is_running"]:
+                    gpu_tunnel_status = {
+                        "is_running": True,
+                        "message": "Connected"
+                    }
+                    if not gpu_monitor_running:
+                        start_gpu_monitoring()
+                else:
+                    logger.error(f"GPU SSH tunnel disconnected: {ssh_status.get('error', 'Unknown error')}")
+                    gpu_tunnel_status = {
+                        "is_running": False,
+                        "message": "Disconnected - Check server status"
+                    }
+            else:
+                gpu_tunnel_status = {"is_running": False, "message": "Tunnel not initialized"}
+        except Exception as e:
+            logger.error(f"Error monitoring tunnels: {str(e)}")
+            use_fallback = True
+            api_tunnel_status = {"is_running": False, "message": "Monitoring error"}
+            gpu_tunnel_status = {"is_running": False, "message": "Monitoring error"}
+        time.sleep(5)  # Check every 5 seconds
+def get_openai_client(use_fallback_api=None):
+    """
+    Build an OpenAI client pointed at the selected backend.
+    Args:
+        use_fallback_api (bool): True selects the Hyperbolic API, False selects
+                                 the local vLLM server. None defers to the
+                                 module-level use_fallback flag.
+    Returns:
+        OpenAI: Client configured with the chosen API key and base URL
+    """
+    # Only consult the module-level flag when the caller expressed no preference
+    wants_fallback = use_fallback if use_fallback_api is None else use_fallback_api
+    if not wants_fallback:
+        logger.info("Using local vLLM API")
+        # vLLM doesn't require an actual API key
+        return OpenAI(api_key="EMPTY", base_url=VLLM_ENDPOINT)
+    logger.info("Using Hyperbolic API")
+    return OpenAI(api_key=HYPERBOLIC_KEY, base_url=HYPERBOLIC_ENDPOINT)
+def get_model_name(use_fallback_api=None):
+    """
+    Pick the model name that matches the API being used.
+    Args:
+        use_fallback_api (bool): True selects the fallback model, False the
+                                 vLLM model. None defers to the module-level
+                                 use_fallback flag.
+    Returns:
+        str: FALLBACK_MODEL or VLLM_MODEL
+    """
+    wants_fallback = use_fallback if use_fallback_api is None else use_fallback_api
+    if wants_fallback:
+        return FALLBACK_MODEL
+    return VLLM_MODEL
+def convert_files_to_base64(files):
+    """
+    Read each file and return its contents base64-encoded.
+    Args:
+        files (list): List of file paths
+    Returns:
+        list: One base64-encoded string per input path, in input order
+    """
+    def _encode(path):
+        # Binary mode: the bytes are encoded as-is, then decoded to text
+        with open(path, "rb") as handle:
+            return base64.b64encode(handle.read()).decode("utf-8")
+    return [_encode(path) for path in files]
+def _gpu_metric(gpu, key):
+    """Return gpu[key] as a float, or 0.0 when it is missing, None, or non-numeric."""
+    try:
+        return float(gpu.get(key, 0))
+    except (TypeError, ValueError):
+        return 0.0
+def format_simplified_gpu_data(gpu_data):
+    """
+    Format GPU data into a simplified, focused display.
+    Args:
+        gpu_data (dict): GPU data in JSON format. The keys read here are
+                         "success", "error", "timestamp" and "gpus"; each GPU
+                         entry may carry "index", "name", "memory_used",
+                         "memory_total", "memory_utilization", "power_draw",
+                         "power_limit", "fan_speed" and "temperature".
+    Returns:
+        str: Formatted GPU data, or an error message when gpu_data is empty,
+             None, or not marked successful
+    """
+    # Treat None/empty input like an unsuccessful fetch instead of raising
+    if not gpu_data or not gpu_data.get("success", False):
+        error = (gpu_data or {}).get('error', 'Unknown error')
+        return f"Error fetching GPU data: {error}"
+    output = [f"Last updated: {gpu_data.get('timestamp', 'Unknown')}"]
+    for i, gpu in enumerate(gpu_data.get("gpus") or []):
+        output.append(f"GPU {gpu.get('index', i)}: {gpu.get('name', 'Unknown')}")
+        # Metrics go through _gpu_metric so a None or non-numeric reading
+        # cannot break the float format specs below
+        output.append(f"  Memory: {_gpu_metric(gpu, 'memory_used'):6.0f} MB / {_gpu_metric(gpu, 'memory_total'):6.0f} MB ({_gpu_metric(gpu, 'memory_utilization'):5.1f}%)")
+        output.append(f"  Power:  {_gpu_metric(gpu, 'power_draw'):5.1f}W / {_gpu_metric(gpu, 'power_limit'):5.1f}W")
+        # The fan line is optional: skip it when the key is absent or None
+        if gpu.get('fan_speed') is not None:
+            output.append(f"  Fan:    {_gpu_metric(gpu, 'fan_speed'):5.1f}%")
+        output.append(f"  Temp:   {_gpu_metric(gpu, 'temperature'):5.1f}°C")
+        # Blank separator line after each GPU
+        output.append("")
+    return "\n".join(output)
+def update_gpu_status():
+    """
+    Produce the GPU status text shown in the UI.
+    Returns:
+        str: The formatted module-level gpu_data while the GPU tunnel is
+             marked running, otherwise a "not connected" notice
+    """
+    if gpu_tunnel_status["is_running"]:
+        return format_simplified_gpu_data(gpu_data)
+    return "GPU monitoring tunnel is not connected."
 def get_tunnel_status_message():
     """
     Return a formatted status message for display in the UI.
     The message has one line each for the API tunnel, the GPU tunnel, the
     API currently in use, the current model, and the concurrency setting.
     Returns:
         str: Multi-line status text
     """
+    global api_tunnel_status, gpu_tunnel_status, use_fallback, MAX_CONCURRENT
     api_mode = "Hyperbolic API" if use_fallback else "Local vLLM API"
     model = get_model_name()
+    # The API indicator is green only while the tunnel is running AND the
+    # fallback is not in use; a healthy tunnel still shows red under fallback
+    api_status_color = "🟢" if (api_tunnel_status["is_running"] and not use_fallback) else "🔴"
+    api_status_text = api_tunnel_status["message"]
+    # The GPU indicator depends only on the GPU tunnel's own state
+    gpu_status_color = "🟢" if gpu_tunnel_status["is_running"] else "🔴"
+    gpu_status_text = gpu_tunnel_status["message"]
+    return (f"{api_status_color} API Tunnel: {api_status_text}\n"
+            f"{gpu_status_color} GPU Tunnel: {gpu_status_text}\n"
+            f"Current API: {api_mode}\n"
+            f"Current Model: {model}\n"
+            f"Concurrent Requests: {MAX_CONCURRENT}")
+def get_gpu_json():
+    """
+    Serialize the module-level gpu_data for debugging.
+    Returns:
+        str: gpu_data as JSON, indented by two spaces
+    """
+    pretty = json.dumps(gpu_data, indent=2)
+    return pretty
 def toggle_api():
     """
     Flip the module-level use_fallback flag between the local vLLM API and
     the Hyperbolic API.
     Returns:
         str: Confirmation naming the API and model now selected
     """
     global use_fallback
     use_fallback = not use_fallback
     if use_fallback:
         api_mode = "Hyperbolic API"
     else:
         api_mode = "Local vLLM API"
     # get_model_name() reads the flag that was just flipped
     return f"Switched to {api_mode} using {get_model_name()}"
 def update_concurrency(new_value):
     """
     Update the module-level MAX_CONCURRENT setting.
     Args:
         new_value: Requested concurrency; any value int() can convert
     Returns:
         str: Confirmation message, or an error message that reports the
              unchanged current value
     """
     # Without this declaration the assignment below would create a local
     global MAX_CONCURRENT
     try:
         value = int(new_value)
     except (TypeError, ValueError):
         # TypeError covers None (e.g. an empty input), ValueError covers bad text
         return f"Error: Invalid number. Keeping current value: {MAX_CONCURRENT}"
     if value < 1:
         return f"Error: Concurrency must be at least 1. Keeping current value: {MAX_CONCURRENT}"
     MAX_CONCURRENT = value
     return f"Concurrency updated to {MAX_CONCURRENT}. You may need to refresh the page for all changes to take effect."
+# Start SSH tunnels and monitoring threads
 # Script entry point: start the tunnels and their monitor, build the Gradio
 # UI, wire its events, then launch the app.
 if __name__ == "__main__":
+    start_ssh_tunnels()
+    # Daemon thread, so the monitor loop does not keep the process alive at exit
+    monitor_thread = threading.Thread(target=monitor_tunnels, daemon=True)
     monitor_thread.start()
     with gr.Blocks(theme="soft") as demo:
         gr.Markdown("# Multimodal Chat Interface")
         chatbot = gr.Chatbot(
             label="Conversation",
             type="messages",
             height=400
         )
         with gr.Row():
             textbox = gr.MultimodalTextbox(
                 file_types=["image", "video"],
             )
             submit_btn = gr.Button("Send", size="sm", scale=1)
         clear_btn = gr.Button("Clear Chat")
         # Submitting the textbox runs process_chat on (textbox, chatbot), then
         # resets the textbox to empty text with no files
         submit_event = textbox.submit(
             fn=process_chat,
             inputs=[textbox, chatbot],
             outputs=chatbot,
+            concurrency_limit=MAX_CONCURRENT
         ).then(
             fn=lambda: {"text": "", "files": []},
             inputs=None,
             outputs=textbox
         )
         # The Send button is wired to the same two-step chain as textbox.submit
         submit_btn.click(
             fn=process_chat,
             inputs=[textbox, chatbot],
             outputs=chatbot,
+            concurrency_limit=MAX_CONCURRENT
         ).then(
             fn=lambda: {"text": "", "files": []},
             inputs=None,
             outputs=textbox
         )
         # Clearing replaces the chatbot value with an empty list
         clear_btn.click(lambda: [], None, chatbot)
+        examples = []
         # Maps example image filename -> prompt text offered with it
         example_images = {
             "dog_pic.jpg": "What breed is this?",
             "ghostimg.png": "What's in this image?",
             "newspaper.png": "Provide a python list of dicts about everything on this page."
         }
         # Only images that exist next to this script become examples
         for img_name, prompt_text in example_images.items():
             img_path = os.path.join(os.path.dirname(__file__), img_name)
             if os.path.exists(img_path):
                 examples.append([{"text": prompt_text, "files": [img_path]}])
         # Skip the Examples widget entirely when no example image was found
         if examples:
             gr.Examples(
                 examples=examples,
                 inputs=textbox
             )
         status_text = gr.Textbox(
             label="Tunnel and API Status",
             value=get_tunnel_status_message(),
             interactive=False
         )
+        with gr.Accordion("GPU Status", open=False):
+            # Changed from Textbox to HTML component
+            # The value is a callable that wraps update_gpu_status() in a <pre> block;
+            # NOTE(review): every=2 presumably re-runs it every 2 seconds - confirm against Gradio docs
+            # NOTE(review): the status text is interpolated into HTML without escaping - confirm it is trusted
+            gpu_status = gr.HTML(
+                value=lambda: f"<pre style='font-family: monospace; white-space: pre; overflow: auto;'>{update_gpu_status()}</pre>",
+                every=2
+            )
         with gr.Row():
             refresh_btn = gr.Button("Refresh Status")
+            toggle_api_btn = gr.Button("Toggle API")
         refresh_btn.click(
             fn=get_tunnel_status_message,
             inputs=None,
             outputs=status_text
         )
+        # toggle_api's confirmation is written to status_text first; the chained
+        # step then overwrites it with the full status message
+        toggle_api_btn.click(
+            fn=toggle_api,
+            inputs=None,
+            outputs=status_text
+        ).then(
+            fn=get_tunnel_status_message,
+            inputs=None,
+            outputs=status_text
+        )
         # Also refresh the status text through the demo.load event
         demo.load(
             fn=get_tunnel_status_message,
             inputs=None,
             outputs=status_text
         )
     # MAX_CONCURRENT is read here at startup, for both the queue default and
     # the per-event concurrency_limit values above
     demo.queue(default_concurrency_limit=MAX_CONCURRENT)
     demo.launch()