ford442 committed on
Commit
34f0437
·
verified ·
1 Parent(s): f835a2f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -7
app.py CHANGED
@@ -48,11 +48,19 @@ _preload_and_load_models()
48
  tts = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits",device='cuda')
49
 
50
  @spaces.GPU(required=True)
51
- def process_audio(microphone, state, answer_mode):
52
- if microphone is None:
53
- return state, state, None
54
- asr_pipe.model.config.forced_decoder_ids = [[2, transcribe_token_id ]]
55
- text = asr_pipe(microphone)["text"]
 
 
 
 
 
 
 
 
56
  system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
57
  You answer questions clearly and simply, using age-appropriate language.
58
  You are also a little bit silly and like to make jokes."""
@@ -105,14 +113,22 @@ with gr.Blocks(title="Whisper, Vicuna, & TTS Demo") as demo: # Updated title
105
  gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Hugging Face TTS")
106
  gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
107
  with gr.Tab("Transcribe & Synthesize"):
108
- mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here")
 
 
109
  transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
110
  audio_output = gr.Audio(label="Synthesized Speech", type="numpy", autoplay=True)
111
  answer_mode = gr.Radio(["fast", "medium", "slow"], value='medium')
112
  transcription_state = gr.State(value="")
 
113
  mic_input.change(
114
  fn=process_audio,
115
- inputs=[mic_input, transcription_state, answer_mode],
 
 
 
 
 
116
  outputs=[transcription_output, transcription_state, audio_output]
117
  )
118
 
 
48
  tts = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits",device='cuda')
49
 
50
  @spaces.GPU(required=True)
51
+ def process_audio(microphone, audio_upload, state, answer_mode): # Added audio_upload
52
+ audio_source = None
53
+ if microphone:
54
+ audio_source = microphone
55
+ asr_pipe.model.config.forced_decoder_ids = [[2, transcribe_token_id ]]
56
+ text = asr_pipe(audio_source)["text"]
57
+ elif audio_upload:
58
+ audio_source = audio_upload
59
+ rate, data = scipy.io.wavfile.read(audio_source)
60
+ asr_pipe.model.config.forced_decoder_ids = [[2, transcribe_token_id ]]
61
+ text = asr_pipe(data)["text"]
62
+ else:
63
+ return state, state, None # No audio input
64
  system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
65
  You answer questions clearly and simply, using age-appropriate language.
66
  You are also a little bit silly and like to make jokes."""
 
113
  gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Hugging Face TTS")
114
  gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
115
  with gr.Tab("Transcribe & Synthesize"):
116
+ with gr.Row(): # Added a row for better layout
117
+ mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here")
118
+ audio_upload = gr.Audio(sources="upload", type="filepath", label="Or Upload Audio File") # Added upload component
119
  transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
120
  audio_output = gr.Audio(label="Synthesized Speech", type="numpy", autoplay=True)
121
  answer_mode = gr.Radio(["fast", "medium", "slow"], value='medium')
122
  transcription_state = gr.State(value="")
123
+
124
  mic_input.change(
125
  fn=process_audio,
126
+ inputs=[mic_input, audio_upload, transcription_state, answer_mode], # Include audio_upload
127
+ outputs=[transcription_output, transcription_state, audio_output]
128
+ )
129
+ audio_upload.change( # Added change event for upload
130
+ fn=process_audio,
131
+ inputs=[mic_input, audio_upload, transcription_state, answer_mode], # Include audio_upload
132
  outputs=[transcription_output, transcription_state, audio_output]
133
  )
134