ford442 committed
Commit 3c50a05 · verified · 1 Parent(s): aa64e74

Update app.py

Files changed (1)
  1. app.py +23 -14
app.py CHANGED
@@ -1,12 +1,12 @@
 import spaces
 import torch
 import gradio as gr
-from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoModel, AutoProcessor
-import soundfile as sf
+from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoModel # Removed AutoProcessor
+import soundfile as sf # For saving audio (debugging)
 import numpy as np
-import IPython.display as ipd
-import os
+from bark import SAMPLE_RATE, generate_audio, preload_models # Import Bark functions
 
+# Load Whisper and Vicuna models (as before)
 ASR_MODEL_NAME = "openai/whisper-medium.en"
 asr_pipe = pipeline(
     task="automatic-speech-recognition",
@@ -19,10 +19,6 @@ all_special_ids = asr_pipe.tokenizer.all_special_ids
 transcribe_token_id = all_special_ids[-5]
 translate_token_id = all_special_ids[-6]
 
-TTS_MODEL_NAME = "suno/bark"
-tts_processor = AutoProcessor.from_pretrained(TTS_MODEL_NAME)
-tts_model = AutoModel.from_pretrained(TTS_MODEL_NAME).to('cuda')
-
 VICUNA_MODEL_NAME = "lmsys/vicuna-7b-v1.5"
 vicuna_tokenizer = AutoTokenizer.from_pretrained(VICUNA_MODEL_NAME)
 vicuna_model = AutoModelForCausalLM.from_pretrained(
@@ -31,10 +27,14 @@ vicuna_model = AutoModelForCausalLM.from_pretrained(
     device_map="auto",
 )
 
+# Preload Bark models (crucial for efficiency)
+preload_models() # No need for the DEBUG_MODE check here; preload always
+
 @spaces.GPU(required=True)
 def process_audio(microphone, state, task="transcribe"):
     if microphone is None:
         return state, state, None
+
     asr_pipe.model.config.forced_decoder_ids = [
         [2, transcribe_token_id if task == "transcribe" else translate_token_id]
     ]
@@ -43,21 +43,30 @@ def process_audio(microphone, state, task="transcribe"):
     You answer questions clearly and simply, using age-appropriate language.
     You are also a little bit silly and like to make jokes."""
     prompt = f"{system_prompt}\nUser: {text}"
+
     with torch.no_grad():
         vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
         vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=192)
-    vicuna_response = vicuna_tokenizer.decode(vicuna_output[0], skip_special_tokens=True)
+    vicuna_response = vicuna_tokenizer.decode(vicuna_output, skip_special_tokens=True)
     vicuna_response = vicuna_response.replace(prompt, "").strip()
     updated_state = state + "\n" + vicuna_response
+
     try:
-        with torch.no_grad():
-            inputs = tts_processor(vicuna_response, return_tensors="pt").to('cuda')
-            output = tts_model.generate(**inputs, do_sample=False)
-            waveform_np = output[0].cpu().numpy()
-            audio_output = (tts_model.generation_config.sample_rate, waveform_np)
+        # Use Bark's generate_audio function directly
+        audio_arr = generate_audio(vicuna_response) #, history_prompt=None - if needed
+
+        # Scale and convert audio (as before)
+        audio_arr = (audio_arr * 32767).astype(np.int16)
+
+        # Save audio for debugging
+        sf.write('generated_audio.wav', audio_arr, SAMPLE_RATE)
+
+        audio_output = (SAMPLE_RATE, audio_arr) # Use the correct SAMPLE_RATE
+
     except Exception as e:
         print(f"Error in speech synthesis: {e}")
         audio_output = None
+
     return updated_state, updated_state, audio_output
 
 with gr.Blocks(title="Whisper, Vicuna, & Bark Demo") as demo:
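
Note: the bark package adopted here follows the usage pattern from the suno-ai/bark README. A minimal, self-contained sketch of that flow (the prompt text and file name are illustrative only):

import soundfile as sf
from bark import SAMPLE_RATE, generate_audio, preload_models

# Download and cache the Bark checkpoints up front, so the first
# generate_audio() call is not stalled by model downloads.
preload_models()

# generate_audio() returns a float numpy array sampled at SAMPLE_RATE
# (24 kHz), with values roughly in [-1, 1]. An optional history_prompt
# (e.g. "v2/en_speaker_6") selects a speaker preset.
audio = generate_audio("Hello, this is a test of Bark.")
sf.write("bark_test.wav", audio, SAMPLE_RATE)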
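
One caveat in the new decoding line: model.generate() returns a 2-D tensor of shape (batch, seq_len), while tokenizer.decode() expects a single 1-D sequence of token ids, so the [0] index used by the old line is still needed (or batch_decode()). A sketch, reusing the model name from this file:

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")
model = AutoModelForCausalLM.from_pretrained("lmsys/vicuna-7b-v1.5", device_map="auto")

inputs = tokenizer("Hello", return_tensors="pt").to("cuda")
output = model.generate(**inputs, max_new_tokens=32)  # shape: (1, seq_len)

# decode() takes one sequence of ids, so index the only row ...
response = tokenizer.decode(output[0], skip_special_tokens=True)
# ... or let batch_decode() handle the batch dimension.
response = tokenizer.batch_decode(output, skip_special_tokens=True)[0]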
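
The scaling step exists because gr.Audio accepts a (sample_rate, numpy_array) tuple, and converting Bark's float output to int16 avoids depending on Gradio's own normalization. A sketch of just that conversion; the np.clip() guard is an extra safety step that is not in this commit:

import numpy as np

def float_to_int16(audio: np.ndarray) -> np.ndarray:
    # Clip to [-1, 1] first so scaling cannot overflow int16 (assumption:
    # Bark output is roughly, but not strictly, within [-1, 1]).
    audio = np.clip(audio, -1.0, 1.0)
    return (audio * 32767).astype(np.int16)

# Gradio then plays the tuple directly:
# audio_output = (SAMPLE_RATE, float_to_int16(audio_arr))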