datnth1709 committed on
Commit f464233 · 1 Parent(s): a7a18e3

record realtime

Files changed (1)
  1. app.py +57 -7
app.py CHANGED
@@ -181,6 +181,41 @@ def transcribe_en(audio, state_en="", state_vi=""):
     state_vi += vi_text + " "
     return state_en, state_vi
 
+def transcribe_vi_rm(audio, state_vi="", state_en=""):
+    ds = speech_file_to_array_fn(audio.name)
+    # infer model
+    input_values = processor(
+        ds["speech"],
+        sampling_rate=ds["sampling_rate"],
+        return_tensors="pt"
+    ).input_values
+    # decode ctc output
+    logits = vi_model(input_values).logits[0]
+    pred_ids = torch.argmax(logits, dim=-1)
+    greedy_search_output = processor.decode(pred_ids)
+    beam_search_output = ngram_lm_model.decode(logits.cpu().detach().numpy(), beam_width=500)
+    state_vi += beam_search_output + " "
+    en_text = translate_vi2en(beam_search_output)
+    state_en += en_text + " "
+    return state_vi, state_en, state_vi, state_en
+
+def transcribe_en_rm(audio, state_en="", state_vi=""):
+    speech = load_data(audio)
+    # Tokenize
+    input_values = eng_tokenizer(speech, return_tensors="pt").input_values
+    # Take logits
+    logits = eng_model(input_values).logits
+    # Take argmax
+    predicted_ids = torch.argmax(logits, dim=-1)
+    # Get the words from predicted word ids
+    transcription = eng_tokenizer.decode(predicted_ids[0])
+    # Output is all upper case
+    transcription = correct_casing(transcription.lower())
+    state_en += transcription + " "
+    vi_text = translate_en2vi(transcription)
+    state_vi += vi_text + " "
+    return state_en, state_vi, state_en, state_vi
+
 """Gradio demo"""
 
 vi_example_text = ["Có phải bạn đang muốn tìm mua nhà ở ngoại ô thành phố Hồ Chí Minh không?",

@@ -255,13 +290,28 @@ with gr.Blocks() as demo:
             inputs=[en_audio_1])
 
     with gr.TabItem("En-Vi Realtime Translation"):
-        with gr.Row():
-            with gr.Column():
-                en_audio_2 = gr.Audio(source="microphone", label="Input English Audio", type="filepath", streaming=True)
-            with gr.Column():
-                speech2text_en2 = gr.Textbox(label="English Text")
-                vietnamese_out_3 = gr.Textbox(label="Vietnamese Text")
-        en_audio_2.change(transcribe_en, [en_audio_2, speech2text_en2, vietnamese_out_3], [speech2text_en2, vietnamese_out_3])
+        # with gr.Row():
+        #     with gr.Column():
+        #         en_audio_2 = gr.Audio(source="microphone", label="Input English Audio", type="filepath", streaming=True)
+        #     with gr.Column():
+        #         speech2text_en2 = gr.Textbox(label="English Text")
+        #         vietnamese_out_3 = gr.Textbox(label="Vietnamese Text")
+        # en_audio_2.change(transcribe_en, [en_audio_2, speech2text_en2, vietnamese_out_3], [speech2text_en2, vietnamese_out_3])
+
+        gr.Interface(
+            fn=transcribe_en_rm,
+            inputs=[
+                gr.Audio(source="microphone", type="filepath", streaming=True),
+                "state_en",
+                "state_vi"
+            ],
+            outputs=[
+                "textbox_en",
+                "textbox_vi",
+                "state_en",
+                "state_vi"
+            ],
+            live=True).launch()
 
 if __name__ == "__main__":
     demo.launch()
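
For reference, a minimal sketch (not part of this commit) of how the same streaming callback could be wired inside the existing gr.Blocks tab, using gr.State to hold the running transcripts instead of the "state_en"/"state_vi" string shortcuts passed to the nested gr.Interface. It assumes it lives in app.py where gradio is imported as gr and transcribe_en_rm (added above) is defined; the Blocks wrapper stands in for the app's existing `with gr.Blocks() as demo:` block, and component names such as en_audio_rt and en_state are illustrative.

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("En-Vi Realtime Translation"):
            with gr.Row():
                with gr.Column():
                    en_audio_rt = gr.Audio(source="microphone",
                                           label="Input English Audio",
                                           type="filepath", streaming=True)
                with gr.Column():
                    en_text_rt = gr.Textbox(label="English Text")
                    vi_text_rt = gr.Textbox(label="Vietnamese Text")
            # Hidden state holds the accumulated transcripts between audio chunks.
            en_state = gr.State("")
            vi_state = gr.State("")
            # transcribe_en_rm returns (state_en, state_vi, state_en, state_vi):
            # the first pair refreshes the textboxes, the second pair updates the state.
            en_audio_rt.change(
                transcribe_en_rm,
                inputs=[en_audio_rt, en_state, vi_state],
                outputs=[en_text_rt, vi_text_rt, en_state, vi_state],
            )

if __name__ == "__main__":
    demo.launch()

Keeping the event wiring inside the Blocks tab leaves a single Gradio app, so only the existing demo.launch() under the __main__ guard is needed rather than calling .launch() on a second interface from inside the `with gr.TabItem(...)` block.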