ford442 committed on
Commit
34f0437
·
verified ·
1 Parent(s): f835a2f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -7
app.py CHANGED
@@ -48,11 +48,19 @@ _preload_and_load_models()
48
  tts = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits",device='cuda')
49
 
50
  @spaces.GPU(required=True)
51
- def process_audio(microphone, state, answer_mode):
52
- if microphone is None:
53
- return state, state, None
54
- asr_pipe.model.config.forced_decoder_ids = [[2, transcribe_token_id ]]
55
- text = asr_pipe(microphone)["text"]
 
 
 
 
 
 
 
 
56
  system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
57
  You answer questions clearly and simply, using age-appropriate language.
58
  You are also a little bit silly and like to make jokes."""
@@ -105,14 +113,22 @@ with gr.Blocks(title="Whisper, Vicuna, & TTS Demo") as demo: # Updated title
105
  gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Hugging Face TTS")
106
  gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
107
  with gr.Tab("Transcribe & Synthesize"):
108
- mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here")
 
 
109
  transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
110
  audio_output = gr.Audio(label="Synthesized Speech", type="numpy", autoplay=True)
111
  answer_mode = gr.Radio(["fast", "medium", "slow"], value='medium')
112
  transcription_state = gr.State(value="")
 
113
  mic_input.change(
114
  fn=process_audio,
115
- inputs=[mic_input, transcription_state, answer_mode],
 
 
 
 
 
116
  outputs=[transcription_output, transcription_state, audio_output]
117
  )
118
 
 
48
  tts = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits",device='cuda')
49
 
50
  @spaces.GPU(required=True)
51
+ def process_audio(microphone, audio_upload, state, answer_mode): # Added audio_upload
52
+ audio_source = None
53
+ if microphone:
54
+ audio_source = microphone
55
+ asr_pipe.model.config.forced_decoder_ids = [[2, transcribe_token_id ]]
56
+ text = asr_pipe(audio_source)["text"]
57
+ elif audio_upload:
58
+ audio_source = audio_upload
59
+ rate, data = scipy.io.wavfile.read(audio_source)
60
+ asr_pipe.model.config.forced_decoder_ids = [[2, transcribe_token_id ]]
61
+ text = asr_pipe(data)["text"]
62
+ else:
63
+ return state, state, None # No audio input
64
  system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
65
  You answer questions clearly and simply, using age-appropriate language.
66
  You are also a little bit silly and like to make jokes."""
 
113
  gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Hugging Face TTS")
114
  gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
115
  with gr.Tab("Transcribe & Synthesize"):
116
+ with gr.Row(): # Added a row for better layout
117
+ mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here")
118
+ audio_upload = gr.Audio(sources="upload", type="filepath", label="Or Upload Audio File") # Added upload component
119
  transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
120
  audio_output = gr.Audio(label="Synthesized Speech", type="numpy", autoplay=True)
121
  answer_mode = gr.Radio(["fast", "medium", "slow"], value='medium')
122
  transcription_state = gr.State(value="")
123
+
124
  mic_input.change(
125
  fn=process_audio,
126
+ inputs=[mic_input, audio_upload, transcription_state, answer_mode], # Include audio_upload
127
+ outputs=[transcription_output, transcription_state, audio_output]
128
+ )
129
+ audio_upload.change( # Added change event for upload
130
+ fn=process_audio,
131
+ inputs=[mic_input, audio_upload, transcription_state, answer_mode], # Include audio_upload
132
  outputs=[transcription_output, transcription_state, audio_output]
133
  )
134