ford442 committed
Commit 9ddc7f6 · verified · 1 Parent(s): 69cfc54

Update app.py

Files changed (1): app.py (+24 -11)
app.py CHANGED
@@ -47,27 +47,40 @@ _preload_and_load_models()
 tts = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits",device='cuda')
 
 @spaces.GPU(required=True)
-def process_audio(microphone, state, task="transcribe"):
+def process_audio(microphone, state, answer_mode):
     if microphone is None:
         return state, state, None
-    asr_pipe.model.config.forced_decoder_ids = [
-        [2, transcribe_token_id if task == "transcribe" else translate_token_id]
-    ]
+    asr_pipe.model.config.forced_decoder_ids = [[2, transcribe_token_id ]]
     text = asr_pipe(microphone)["text"]
     system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
     You answer questions clearly and simply, using age-appropriate language.
     You are also a little bit silly and like to make jokes."""
     prompt = f"{system_prompt}\nUser: {text}"
-    with torch.no_grad():
-        vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
+    #with torch.no_grad():
+    vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
+    if answer_mode == 'slow':
         vicuna_output = vicuna_model.generate(
             **vicuna_input,
-            max_length = 96,
+            max_length = 512,
+            min_new_tokens = 256,
+            do_sample = True
+        )
+    if answer_mode == 'medium':
+        vicuna_output = vicuna_model.generate(
+            **vicuna_input,
+            max_length = 128,
             min_new_tokens = 64,
             do_sample = True
         )
-    vicuna_response = vicuna_tokenizer.decode(vicuna_output[0], skip_special_tokens=True)
-    vicuna_response = vicuna_response.replace(prompt, "").strip()
+    if answer_mode == 'fast':
+        vicuna_output = vicuna_model.generate(
+            **vicuna_input,
+            max_length = 42,
+            min_new_tokens = 16,
+            do_sample = True
+        )
+    vicuna_response = vicuna_tokenizer.decode(vicuna_output[0], skip_special_tokens=True)
+    vicuna_response = vicuna_response.replace(prompt, "").strip()
     updated_state = state + "\nUser: " + text + "\n" + "Tutor: " + vicuna_response
     try:
         #with torch.no_grad():
@@ -94,11 +107,11 @@ with gr.Blocks(title="Whisper, Vicuna, & TTS Demo") as demo: # Updated title
     mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here")
     transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
     audio_output = gr.Audio(label="Synthesized Speech", type="numpy", autoplay=True)
-    audio_output = gr.Radio(["transcribe", "translate"]
+    answer_mode = gr.Radio(["fast", "medium", "slow"]
     transcription_state = gr.State(value="")
     mic_input.change(
         fn=process_audio,
-        inputs=[mic_input, transcription_state, , label="Task", value="transcribe")],
+        inputs=[mic_input, transcription_state, answer_mode)],
         outputs=[transcription_output, transcription_state, audio_output]
     )
 
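A note on the ASR side of this change: the handler now hard-codes forced_decoder_ids to the transcribe token on every call. That works, but forced_decoder_ids is deprecated in recent transformers releases in favor of per-call generation arguments. A minimal sketch of that alternative, assuming asr_pipe is a transformers automatic-speech-recognition pipeline wrapping a Whisper checkpoint; the checkpoint name below is a placeholder, not necessarily the one this Space loads:

    from transformers import pipeline

    # Placeholder checkpoint; app.py builds its own asr_pipe elsewhere.
    asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-small")

    def transcribe(audio_path):
        # Pin the task per call instead of mutating model.config.forced_decoder_ids.
        result = asr_pipe(audio_path, generate_kwargs={"task": "transcribe", "language": "english"})
        return result["text"]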
 
 
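On the generation side, the three generate() branches differ only in their length caps, and max_length counts the prompt tokens as well as the reply, so the 'fast' cap of 42 is smaller than the multi-line system prompt alone and can leave no room to answer. A sketch of one way to collapse the branches, assuming the same vicuna_model and vicuna_tokenizer as in app.py; the per-mode numbers are illustrative, and max_new_tokens is used so each cap applies only to newly generated tokens:

    import torch

    # Illustrative per-mode caps, not the commit's values. max_new_tokens bounds
    # only the generated tokens; max_length would also count the prompt.
    GENERATION_MODES = {
        "slow":   {"max_new_tokens": 256, "min_new_tokens": 128},
        "medium": {"max_new_tokens": 96,  "min_new_tokens": 64},
        "fast":   {"max_new_tokens": 32,  "min_new_tokens": 16},
    }

    def generate_reply(prompt, answer_mode):
        # Fall back to "medium" so an unexpected mode cannot leave vicuna_output
        # unbound, which the three independent `if` blocks would allow.
        caps = GENERATION_MODES.get(answer_mode, GENERATION_MODES["medium"])
        vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to("cuda")
        with torch.no_grad():
            vicuna_output = vicuna_model.generate(**vicuna_input, do_sample=True, **caps)
        vicuna_response = vicuna_tokenizer.decode(vicuna_output[0], skip_special_tokens=True)
        # Strip the echoed prompt, as the committed code does.
        return vicuna_response.replace(prompt, "").strip()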
 
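On the UI side, the added gr.Radio(...) line is cut off before its closing parenthesis, and the new inputs= list still carries a stray ) from the line it replaced, so the wiring will not parse as committed. A sketch of the intended hookup with both calls closed, assuming the component names above; the radio's label and default value are guesses:

    import gradio as gr

    with gr.Blocks(title="Whisper, Vicuna, & TTS Demo") as demo:
        mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here")
        transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
        audio_output = gr.Audio(label="Synthesized Speech", type="numpy", autoplay=True)
        # Label and default value are assumed; the committed line is truncated.
        answer_mode = gr.Radio(["fast", "medium", "slow"], label="Answer Mode", value="fast")
        transcription_state = gr.State(value="")
        mic_input.change(
            fn=process_audio,  # defined earlier in app.py
            inputs=[mic_input, transcription_state, answer_mode],
            outputs=[transcription_output, transcription_state, audio_output],
        )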