Spaces:

diabolic6045
/

japanese-stable-vlm-demo

Running on Zero

App Files Files Community

diabolic6045 committed on Jun 7, 2024

Commit

b13a765

verified ·

1 Parent(s): e387e82

Update app.py

Browse files

Files changed (1) hide show

app.py +16 -27

app.py CHANGED Viewed

@@ -1,13 +1,3 @@
-import gradio as gr
-import os
-import torch
-from transformers import AutoTokenizer, AutoModelForVision2Seq, AutoImageProcessor
-from PIL import Image
-import requests
-import spaces
-from huggingface_hub import login
-login(os.environ["HF_KEY"])
 # Load the model and tokenizer
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model = AutoModelForVision2Seq.from_pretrained("stabilityai/japanese-stable-vlm", trust_remote_code=True, device_map='auto')
@@ -42,7 +32,6 @@ def build_prompt(task="caption", input=None, sep="\n\n### "):
     return p
 # Define the function to generate text from the image and prompt
-@spaces.GPU(duration=120)
 def generate_text(image, task, input_text=None):
     prompt = build_prompt(task=task, input=input_text)
     inputs = processor(images=image, return_tensors="pt")
@@ -60,21 +49,21 @@ def generate_text(image, task, input_text=None):
     return generated_text
 # Define the Gradio interface
-image_input = gr.Image(label="Upload an image")
-task_input = gr.Radio(choices=["caption", "tag", "vqa"], value="caption", label="Select a task")
-text_input = gr.Textbox(label="Enter text (for tag or vqa tasks)")
-output = gr.Textbox(label="Generated text")
-interface = gr.Interface(
-    fn=generate_text,
-    inputs=[image_input, task_input, text_input],
-    outputs=output,
-    examples=[
-        ["examples/example_image.jpg", "caption", None],
-        ["examples/example_image.jpg", "tag", "河津桜、青空"],
-        ["examples/example_image.jpg", "vqa", "OCRはできますか？"],
-    ],
-)
-interface.launch()

 # Load the model and tokenizer
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model = AutoModelForVision2Seq.from_pretrained("stabilityai/japanese-stable-vlm", trust_remote_code=True, device_map='auto')
     return p
 # Define the function to generate text from the image and prompt
 def generate_text(image, task, input_text=None):
     prompt = build_prompt(task=task, input=input_text)
     inputs = processor(images=image, return_tensors="pt")
     return generated_text
 # Define the Gradio interface
+with gr.Blocks() as demo:
+    chatbot = gr.Chatbot([], elem_id="chatbot", show_copy_button=True)
+    with gr.Box():
+        with gr.Row():
+            image_input = gr.Image(label="Upload an image")
+            task_input = gr.Radio(choices=["caption", "tag", "vqa"], value="caption", label="Select a task")
+        text_input = gr.Textbox(label="Enter text (for tag or vqa tasks)")
+        submit_btn = gr.Button("Submit")
+    inputs = [image_input, task_input, text_input]
+    outputs = chatbot
+    submit_btn.click(generate_text, inputs, outputs, api_name="generate_text")
+    # Event listeners
+    chatbot.change(lambda x: print(f"Chatbot changed: {x}"), chatbot, chatbot)
+    chatbot.select(lambda x: print(f"Chatbot selected: {x.value}, {x.selected}"), None, chatbot)
+    chatbot.like(lambda x: print(f"Liked/Disliked: {x.index}, {x.value}, {x.liked}"), None, chatbot)
+demo.launch()