# chatcpu / main.py
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import threading
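# llama-cpp-python's Llama class runs quantized GGUF models on the CPU by
# default, so this app works without a GPU.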
# Title and description
TITLE = "AI Copilot for Patients"
DESCRIPTION = "I provide answers to concerns related to Health"
# Globals
llm_llama_cpp = None
model_ready = False
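# model_ready lets talk() turn requests away until the background load finishes.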
# Download and initialize model in background
def load_model():
    global llm_llama_cpp, model_ready
    try:
        print("Downloading model...")
        # hf_hub_download fetches the weights once, caches them locally,
        # and returns the path to the cached file.
        model_file_path = hf_hub_download(
            repo_id="TheBloke/Llama-2-7B-GGUF",
            filename="llama-2-7b.Q4_0.gguf"
        )
        print("Initializing model...")
        llm_llama_cpp = Llama(
            model_path=model_file_path,
            verbose=False,
            n_ctx=4096  # context window size in tokens
        )
        model_ready = True
        print("Model is ready.")
    except Exception as e:
        print(f"Failed to load model: {e}")
# Load the model in a daemon thread so startup isn't blocked and the
# thread can't keep the interpreter alive on shutdown
threading.Thread(target=load_model, daemon=True).start()
# Chatbot logic: a generator, so Gradio streams partial replies into the UI
def talk(prompt, history):
    if not model_ready:
        yield "⏳ Please wait, the model is still loading..."
        return
    try:
        response = ""
        response_stream = llm_llama_cpp.create_completion(
            prompt=prompt,
            max_tokens=200,
            stream=True
        )
        # Each streamed chunk carries an incremental piece of text in
        # chunk['choices'][0]['text']; accumulate it and yield the running
        # reply so the chat window updates as tokens arrive.
        for chunk in response_stream:
            if 'choices' in chunk and 'text' in chunk['choices'][0]:
                response += chunk['choices'][0]['text']
                yield response
    except Exception as e:
        print(f"Error in generating response: {e}")
        yield f"Error during response generation: {e}"
# Gradio interface
demo = gr.ChatInterface(
    fn=talk,
    chatbot=gr.Chatbot(
        show_label=True,
        show_share_button=True,
        show_copy_button=True,
        layout="bubble",
        type="messages",
    ),
    theme="soft",  # built-in theme names are lowercase
    examples=["What is diabetes?"],  # ChatInterface examples are plain strings
    title=TITLE,
    description=DESCRIPTION,
)
# Launch the UI
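# share=True asks Gradio for a temporary public gradio.live link; it is
# ignored on Hugging Face Spaces, which already serve the app publicly.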
demo.launch(share=True)