import threading

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
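
# Dependencies: gradio, llama-cpp-python, and huggingface_hub.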

TITLE = "AI Copilot for Patients"
DESCRIPTION = "I provide answers to health-related concerns."
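
# llm_llama_cpp and model_ready are shared with the background loader thread:
# load_model() sets them once the weights are downloaded and initialized.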
llm_llama_cpp = None
model_ready = False


def load_model():
    """Download the GGUF weights from the Hugging Face Hub and initialize llama.cpp."""
    global llm_llama_cpp, model_ready
    try:
        print("Downloading model...")
        model_file_path = hf_hub_download(
            repo_id="TheBloke/Llama-2-7B-GGUF",
            filename="llama-2-7b.Q4_0.gguf",
        )

        print("Initializing model...")
        llm_llama_cpp = Llama(
            model_path=model_file_path,
            verbose=False,
            n_ctx=4096,
        )
        model_ready = True
        print("Model is ready.")
    except Exception as e:
        print(f"Failed to load model: {e}")


# Start loading in a background daemon thread so the Gradio UI comes up
# immediately; talk() answers with a "still loading" notice until model_ready
# flips to True.
threading.Thread(target=load_model, daemon=True).start()


def talk(prompt, history):
    if not model_ready:
        return "⏳ Please wait, the model is still loading..."

    try:
        response = ""
        response_stream = llm_llama_cpp.create_completion(
            prompt=prompt,
            max_tokens=200,
            stream=True,
        )

        # Collect the streamed chunks into a single reply.
        for chunk in response_stream:
            if "choices" in chunk and "text" in chunk["choices"][0]:
                response += chunk["choices"][0]["text"]
        return response

    except Exception as e:
        print(f"Error in generating response: {e}")
        return f"Error during response generation: {e}"
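

# Note: gr.ChatInterface also accepts generator functions, so yielding the
# growing `response` inside the loop above would stream partial text to the UI
# instead of returning the full reply at once.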


demo = gr.ChatInterface(
    fn=talk,
    chatbot=gr.Chatbot(
        show_label=True,
        show_share_button=True,
        show_copy_button=True,
        layout="bubble",
        type="messages",
    ),
    theme=gr.themes.Soft(),
    examples=[["What is diabetes?"]],
    title=TITLE,
    description=DESCRIPTION,
)
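
# share=True asks Gradio for a temporary public link in addition to the local
# server; drop it to keep the demo local-only.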
demo.launch(share=True)