vicuna-clip

Sleeping

App Files Files Community

ford442 committed on Feb 11

Commit

9f8fb3c

verified ·

1 Parent(s): cc30e2e

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -39

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-#import spaces
 import torch
 import gradio as gr
 from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoModel, AutoProcessor
@@ -7,60 +7,65 @@ import numpy as np
 import IPython.display as ipd
 import os
# Checkpoints used by the three stages of the demo.
ASR_MODEL_NAME = "openai/whisper-large-v2"
TTS_MODEL_NAME = "suno/bark-small"
VICUNA_MODEL_NAME = "lmsys/vicuna-7b-v1.5"

# Stage 1 - speech recognition pipeline, processing audio in 30-second chunks.
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    chunk_length_s=30,
    device="cuda",
)

# Task token ids are picked by position from the tokenizer's special-id list.
# NOTE(review): presumably these are Whisper's transcribe/translate tokens;
# the positions depend on the tokenizer version - confirm.
all_special_ids = asr_pipe.tokenizer.all_special_ids
translate_token_id = all_special_ids[-6]
transcribe_token_id = all_special_ids[-5]

# Stage 3 - text-to-speech processor and model, with the model moved to the GPU.
tts_processor = AutoProcessor.from_pretrained(TTS_MODEL_NAME)
tts_model = AutoModel.from_pretrained(TTS_MODEL_NAME)
tts_model = tts_model.to("cuda")

# Stage 2 - language model, loaded in half precision; device placement is
# delegated to transformers through device_map="auto".
vicuna_tokenizer = AutoTokenizer.from_pretrained(VICUNA_MODEL_NAME)
vicuna_model = AutoModelForCausalLM.from_pretrained(
    VICUNA_MODEL_NAME,
    torch_dtype=torch.float16,
    device_map="auto",
)
def transcribe_audio(microphone, state, task="transcribe"):
    """Transcribe a recording, ask Vicuna to answer it, and extend the history.

    Args:
        microphone: Audio input accepted by ``asr_pipe``; ``None`` when
            nothing was recorded.
        state: Text accumulated so far.
        task: ``"transcribe"`` selects the transcribe token; any other value
            selects the translate token.

    Returns:
        A ``(text, state)`` pair holding the same updated string twice (one
        for the visible textbox, one for the session state). When
        ``microphone`` is ``None`` the incoming ``state`` is returned
        unchanged.
    """
    if microphone is None:
        return state, state

    # Force the decoder to start with the requested task token.
    asr_pipe.model.config.forced_decoder_ids = [
        [2, transcribe_token_id if task == "transcribe" else translate_token_id]
    ]
    text = asr_pipe(microphone)["text"]

    system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
      You answer questions clearly and simply, using age-appropriate language.
      You are also a little bit silly and like to make jokes."""
    prompt = f"{system_prompt}\nUser: {text}"

    with torch.no_grad():
        vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
        vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=128)

    # The generated sequence starts with the prompt tokens. Drop them by
    # position instead of string-replacing the prompt in the decoded text,
    # which fails whenever detokenisation does not reproduce the prompt
    # character for character.
    prompt_length = vicuna_input["input_ids"].shape[-1]
    vicuna_response = vicuna_tokenizer.decode(
        vicuna_output[0][prompt_length:], skip_special_tokens=True
    ).strip()

    updated_state = state + "\n" + vicuna_response
    return updated_state, updated_state
def synthesize_speech(text):
    """Turn ``text`` into speech with the Bark model.

    Args:
        text: The sentence(s) to speak.

    Returns:
        ``(sample_rate, waveform)`` with the waveform as a NumPy array, or
        ``None`` when synthesis fails. ``None`` is used rather than a
        ``(None, None)`` tuple because a tuple without real audio data is not
        a valid value for the audio output component.
    """
    try:
        with torch.no_grad():
            inputs = tts_processor(text, return_tensors="pt").to('cuda')
            output = tts_model.generate(**inputs, do_sample=True)  # Bark generate
        waveform_np = output[0].cpu().numpy()
        return (tts_model.generation_config.sample_rate, waveform_np)  # Bark sample rate
    except Exception as e:
        # Best effort: a TTS failure must not break the rest of the UI.
        print(e)
        return None
 with gr.Blocks(title="Whisper, Vicuna, & Bark Demo") as demo:
     gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Bark")
@@ -71,13 +76,9 @@ with gr.Blocks(title="Whisper, Vicuna, & Bark Demo") as demo:
         audio_output = gr.Audio(label="Synthesized Speech", type="numpy")
         transcription_state = gr.State(value="")
         mic_input.change(
-            fn=transcribe_audio,
             inputs=[mic_input, transcription_state],
-            outputs=[transcription_output, transcription_state]
-        ).then(
-            fn=synthesize_speech,
-            inputs=transcription_output,
-            outputs=audio_output
         )
 demo.launch(share=False)

+import spaces
 import torch
 import gradio as gr
 from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoModel, AutoProcessor
 import IPython.display as ipd
 import os
# Models loaded so far, keyed by role. Filled on first use so the heavy
# checkpoints are built at most once per process instead of once per request.
# NOTE(review): if the GPU runtime executes each call in a fresh process, the
# cache does not carry over and this degrades to one load per call - confirm.
_MODEL_CACHE = {}


def _load_models():
    """Load the ASR, TTS and LLM models on first use and return them as a dict."""
    if not _MODEL_CACHE:
        asr_pipe = pipeline(
            task="automatic-speech-recognition",
            model="openai/whisper-large-v2",
            chunk_length_s=30,
            device='cuda',
        )
        tts_model_name = "suno/bark-small"
        tts_processor = AutoProcessor.from_pretrained(tts_model_name)
        tts_model = AutoModel.from_pretrained(tts_model_name).to('cuda')
        vicuna_model_name = "lmsys/vicuna-7b-v1.5"
        vicuna_tokenizer = AutoTokenizer.from_pretrained(vicuna_model_name)
        vicuna_model = AutoModelForCausalLM.from_pretrained(
            vicuna_model_name,
            torch_dtype=torch.float16,  # half precision
            device_map="auto",  # let transformers handle device placement
        )
        # Publish only after every load succeeded, so a partial failure is
        # retried from scratch on the next call.
        _MODEL_CACHE.update(
            asr_pipe=asr_pipe,
            tts_processor=tts_processor,
            tts_model=tts_model,
            vicuna_tokenizer=vicuna_tokenizer,
            vicuna_model=vicuna_model,
        )
    return _MODEL_CACHE


# Apply the Spaces GPU decorator to the request handler.
# NOTE(review): confirm that `required` is a keyword `spaces.GPU` accepts.
@spaces.GPU(required=True)
def process_audio(microphone, state, task="transcribe"):
    """Run the full speech -> text -> LLM answer -> speech round trip.

    Args:
        microphone: Audio input accepted by the ASR pipeline; ``None`` when
            nothing was recorded.
        state: Text accumulated so far.
        task: ``"transcribe"`` selects the transcribe token; any other value
            selects the translate token.

    Returns:
        ``(text, state, audio)``. ``text`` and ``state`` are the same updated
        string. ``audio`` is ``(sample_rate, waveform)`` or ``None`` when
        there was no input or speech synthesis failed.
    """
    # Nothing recorded: return before any model is loaded or run.
    if microphone is None:
        return state, state, None

    models = _load_models()
    asr_pipe = models["asr_pipe"]
    tts_processor = models["tts_processor"]
    tts_model = models["tts_model"]
    vicuna_tokenizer = models["vicuna_tokenizer"]
    vicuna_model = models["vicuna_model"]

    # Task token ids are picked by position from the tokenizer's special-id
    # list. NOTE(review): positions depend on the tokenizer version - confirm.
    all_special_ids = asr_pipe.tokenizer.all_special_ids
    transcribe_token_id = all_special_ids[-5]
    translate_token_id = all_special_ids[-6]
    asr_pipe.model.config.forced_decoder_ids = [
        [2, transcribe_token_id if task == "transcribe" else translate_token_id]
    ]
    text = asr_pipe(microphone)["text"]

    system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
        You answer questions clearly and simply, using age-appropriate language.
        You are also a little bit silly and like to make jokes."""
    prompt = f"{system_prompt}\nUser: {text}"

    with torch.no_grad():
        vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
        vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=128)

    # The generated sequence starts with the prompt tokens. Drop them by
    # position instead of string-replacing the prompt in the decoded text,
    # which fails whenever detokenisation does not reproduce the prompt
    # character for character.
    prompt_length = vicuna_input["input_ids"].shape[-1]
    vicuna_response = vicuna_tokenizer.decode(
        vicuna_output[0][prompt_length:], skip_special_tokens=True
    ).strip()
    updated_state = state + "\n" + vicuna_response

    try:
        with torch.no_grad():
            inputs = tts_processor(vicuna_response, return_tensors="pt").to('cuda')
            output = tts_model.generate(**inputs, do_sample=True)  # Bark generate
        waveform_np = output[0].cpu().numpy()
        audio_output = (tts_model.generation_config.sample_rate, waveform_np)  # Bark sample rate
    except Exception as e:
        # Best effort: still return the text answer when speech synthesis fails.
        print(f"Error in speech synthesis: {e}")
        audio_output = None

    return updated_state, updated_state, audio_output
 with gr.Blocks(title="Whisper, Vicuna, & Bark Demo") as demo:
     gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Bark")
         audio_output = gr.Audio(label="Synthesized Speech", type="numpy")
         transcription_state = gr.State(value="")
         mic_input.change(
+            fn=process_audio,  # Call the combined function
             inputs=[mic_input, transcription_state],
+            outputs=[transcription_output, transcription_state, audio_output]
         )
 demo.launch(share=False)