vicuna-clip

Sleeping

App Files Community

ford442 committed on Feb 11

Commit

a736521

verified ·

1 Parent(s): f055d9c

Update app.py

Browse files

Files changed (1) hide show

app.py +4 -10

app.py CHANGED Viewed

@@ -7,7 +7,7 @@ import numpy as np
 import IPython.display as ipd
 import os
-ASR_MODEL_NAME = "openai/whisper-large-v2"
 asr_pipe = pipeline(
     task="automatic-speech-recognition",
     model=ASR_MODEL_NAME,
@@ -19,7 +19,7 @@ all_special_ids = asr_pipe.tokenizer.all_special_ids
 transcribe_token_id = all_special_ids[-5]
 translate_token_id = all_special_ids[-6]
-TTS_MODEL_NAME = "suno/bark-small"
 tts_processor = AutoProcessor.from_pretrained(TTS_MODEL_NAME)
 tts_model = AutoModel.from_pretrained(TTS_MODEL_NAME).to('cuda')
@@ -27,7 +27,7 @@ VICUNA_MODEL_NAME = "lmsys/vicuna-7b-v1.5"
 vicuna_tokenizer = AutoTokenizer.from_pretrained(VICUNA_MODEL_NAME)
 vicuna_model = AutoModelForCausalLM.from_pretrained(
     VICUNA_MODEL_NAME,
-    torch_dtype=torch.float16,
     device_map="auto",
 )
@@ -35,7 +35,6 @@ vicuna_model = AutoModelForCausalLM.from_pretrained(
 def process_audio(microphone, state, task="transcribe"):
     if microphone is None:
         return state, state, None
     asr_pipe.model.config.forced_decoder_ids = [
         [2, transcribe_token_id if task == "transcribe" else translate_token_id]
     ]
@@ -44,14 +43,12 @@ def process_audio(microphone, state, task="transcribe"):
         You answer questions clearly and simply, using age-appropriate language.
         You are also a little bit silly and like to make jokes."""
     prompt = f"{system_prompt}\nUser: {text}"
     with torch.no_grad():
         vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
-        vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=128)
         vicuna_response = vicuna_tokenizer.decode(vicuna_output[0], skip_special_tokens=True)
         vicuna_response = vicuna_response.replace(prompt, "").strip()
     updated_state = state + "\n" + vicuna_response
     try:
         with torch.no_grad():
             inputs = tts_processor(vicuna_response, return_tensors="pt").to('cuda')
@@ -61,11 +58,8 @@ def process_audio(microphone, state, task="transcribe"):
     except Exception as e:
         print(f"Error in speech synthesis: {e}")
         audio_output = None
     return updated_state, updated_state, audio_output
 with gr.Blocks(title="Whisper, Vicuna, & Bark Demo") as demo:
     gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Bark")
     gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")

 import IPython.display as ipd
 import os
+ASR_MODEL_NAME = "openai/whisper-medium.en"
 asr_pipe = pipeline(
     task="automatic-speech-recognition",
     model=ASR_MODEL_NAME,
 transcribe_token_id = all_special_ids[-5]
 translate_token_id = all_special_ids[-6]
+TTS_MODEL_NAME = "suno/bark"
 tts_processor = AutoProcessor.from_pretrained(TTS_MODEL_NAME)
 tts_model = AutoModel.from_pretrained(TTS_MODEL_NAME).to('cuda')
 vicuna_tokenizer = AutoTokenizer.from_pretrained(VICUNA_MODEL_NAME)
 vicuna_model = AutoModelForCausalLM.from_pretrained(
     VICUNA_MODEL_NAME,
+    torch_dtype=torch.bfloat16,
     device_map="auto",
 )
 def process_audio(microphone, state, task="transcribe"):
     if microphone is None:
         return state, state, None
     asr_pipe.model.config.forced_decoder_ids = [
         [2, transcribe_token_id if task == "transcribe" else translate_token_id]
     ]
         You answer questions clearly and simply, using age-appropriate language.
         You are also a little bit silly and like to make jokes."""
     prompt = f"{system_prompt}\nUser: {text}"
     with torch.no_grad():
         vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
+        vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=256)
         vicuna_response = vicuna_tokenizer.decode(vicuna_output[0], skip_special_tokens=True)
         vicuna_response = vicuna_response.replace(prompt, "").strip()
     updated_state = state + "\n" + vicuna_response
     try:
         with torch.no_grad():
             inputs = tts_processor(vicuna_response, return_tensors="pt").to('cuda')
     except Exception as e:
         print(f"Error in speech synthesis: {e}")
         audio_output = None
     return updated_state, updated_state, audio_output
 with gr.Blocks(title="Whisper, Vicuna, & Bark Demo") as demo:
     gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Bark")
     gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")