datnth1709 committed on
Commit f464233 · 1 Parent(s): a7a18e3

record realtime

Files changed (1)
  1. app.py +57 -7
app.py CHANGED
@@ -181,6 +181,41 @@ def transcribe_en(audio, state_en="", state_vi=""):
     state_vi += vi_text + " "
     return state_en, state_vi
 
+def transcribe_vi_rm(audio, state_vi="", state_en=""):
+    ds = speech_file_to_array_fn(audio.name)
+    # infer model
+    input_values = processor(
+        ds["speech"],
+        sampling_rate=ds["sampling_rate"],
+        return_tensors="pt"
+    ).input_values
+    # decode ctc output
+    logits = vi_model(input_values).logits[0]
+    pred_ids = torch.argmax(logits, dim=-1)
+    greedy_search_output = processor.decode(pred_ids)
+    beam_search_output = ngram_lm_model.decode(logits.cpu().detach().numpy(), beam_width=500)
+    state_vi += beam_search_output + " "
+    en_text = translate_vi2en(beam_search_output)
+    state_en += en_text + " "
+    return state_vi, state_en, state_vi, state_en
+
+def transcribe_en_rm(audio, state_en="", state_vi=""):
+    speech = load_data(audio)
+    # Tokenize
+    input_values = eng_tokenizer(speech, return_tensors="pt").input_values
+    # Take logits
+    logits = eng_model(input_values).logits
+    # Take argmax
+    predicted_ids = torch.argmax(logits, dim=-1)
+    # Get the words from predicted word ids
+    transcription = eng_tokenizer.decode(predicted_ids[0])
+    # Output is all upper case
+    transcription = correct_casing(transcription.lower())
+    state_en += transcription + " "
+    vi_text = translate_en2vi(transcription)
+    state_vi += vi_text + " "
+    return state_en, state_vi, state_en, state_vi
+
 """Gradio demo"""
 
 vi_example_text = ["Có phải bạn đang muốn tìm mua nhà ở ngoại ô thành phố Hồ Chí Minh không?",

@@ -255,13 +290,28 @@ with gr.Blocks() as demo:
             inputs=[en_audio_1])
 
     with gr.TabItem("En-Vi Realtime Translation"):
-        with gr.Row():
-            with gr.Column():
-                en_audio_2 = gr.Audio(source="microphone", label="Input English Audio", type="filepath", streaming=True)
-            with gr.Column():
-                speech2text_en2 = gr.Textbox(label="English Text")
-                vietnamese_out_3 = gr.Textbox(label="Vietnamese Text")
-        en_audio_2.change(transcribe_en, [en_audio_2, speech2text_en2, vietnamese_out_3], [speech2text_en2, vietnamese_out_3])
+        # with gr.Row():
+        #     with gr.Column():
+        #         en_audio_2 = gr.Audio(source="microphone", label="Input English Audio", type="filepath", streaming=True)
+        #     with gr.Column():
+        #         speech2text_en2 = gr.Textbox(label="English Text")
+        #         vietnamese_out_3 = gr.Textbox(label="Vietnamese Text")
+        # en_audio_2.change(transcribe_en, [en_audio_2, speech2text_en2, vietnamese_out_3], [speech2text_en2, vietnamese_out_3])
+
+        gr.Interface(
+            fn=transcribe_en_rm,
+            inputs=[
+                gr.Audio(source="microphone", type="filepath", streaming=True),
+                "state_en",
+                "state_vi"
+            ],
+            outputs=[
+                "textbox_en",
+                "textbox_vi",
+                "state_en",
+                "state_vi"
+            ],
+            live=True).launch()
 
 if __name__ == "__main__":
     demo.launch()
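
For reference, a minimal sketch (not part of this commit) of how the same streaming callback could be wired inside the existing gr.Blocks tab, using gr.State to hold the running transcripts instead of the "state_en"/"state_vi" string shortcuts passed to the nested gr.Interface. It assumes it lives in app.py where gradio is imported as gr and transcribe_en_rm (added above) is defined; the Blocks wrapper stands in for the app's existing `with gr.Blocks() as demo:` block, and component names such as en_audio_rt and en_state are illustrative.

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("En-Vi Realtime Translation"):
            with gr.Row():
                with gr.Column():
                    en_audio_rt = gr.Audio(source="microphone",
                                           label="Input English Audio",
                                           type="filepath", streaming=True)
                with gr.Column():
                    en_text_rt = gr.Textbox(label="English Text")
                    vi_text_rt = gr.Textbox(label="Vietnamese Text")
            # Hidden state holds the accumulated transcripts between audio chunks.
            en_state = gr.State("")
            vi_state = gr.State("")
            # transcribe_en_rm returns (state_en, state_vi, state_en, state_vi):
            # the first pair refreshes the textboxes, the second pair updates the state.
            en_audio_rt.change(
                transcribe_en_rm,
                inputs=[en_audio_rt, en_state, vi_state],
                outputs=[en_text_rt, vi_text_rt, en_state, vi_state],
            )

if __name__ == "__main__":
    demo.launch()

Keeping the event wiring inside the Blocks tab leaves a single Gradio app, so only the existing demo.launch() under the __main__ guard is needed rather than calling .launch() on a second interface from inside the `with gr.TabItem(...)` block.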