Spaces:
Runtime error
Runtime error
Commit
·
f464233
1
Parent(s):
a7a18e3
record realtime
Browse files
app.py
CHANGED
@@ -181,6 +181,41 @@ def transcribe_en(audio, state_en="", state_vi=""):
|
|
181 |
state_vi += vi_text + " "
|
182 |
return state_en, state_vi
|
183 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
184 |
"""Gradio demo"""
|
185 |
|
186 |
vi_example_text = ["Có phải bạn đang muốn tìm mua nhà ở ngoại ô thành phố Hồ Chí Minh không?",
|
@@ -255,13 +290,28 @@ with gr.Blocks() as demo:
|
|
255 |
inputs=[en_audio_1])
|
256 |
|
257 |
with gr.TabItem("En-Vi Realtime Translation"):
|
258 |
-
with gr.Row():
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
en_audio_2.change(transcribe_en, [en_audio_2, speech2text_en2, vietnamese_out_3], [speech2text_en2, vietnamese_out_3])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
265 |
|
266 |
if __name__ == "__main__":
|
267 |
demo.launch()
|
|
|
181 |
state_vi += vi_text + " "
|
182 |
return state_en, state_vi
|
183 |
|
184 |
+
def transcribe_vi_rm(audio, state_vi="", state_en=""):
    """Transcribe one chunk of Vietnamese speech and translate it to English.

    Each call appends the text recognised in ``audio`` to the running
    transcripts passed in as ``state_vi`` / ``state_en``.

    Args:
        audio: Path of the recorded audio file, or an object exposing that
            path through a ``.name`` attribute.
        state_vi: Vietnamese transcript accumulated by previous calls.
        state_en: English translation accumulated by previous calls.

    Returns:
        The tuple ``(state_vi, state_en, state_vi, state_en)``; each updated
        text appears twice (presumably once for display and once as carried
        state -- confirm against the interface wiring).
    """
    # Accept both a plain file path (what the sibling English handler gets)
    # and a file wrapper that carries its path in ``.name``.
    audio_path = getattr(audio, "name", audio)
    ds = speech_file_to_array_fn(audio_path)

    # Turn the waveform into model inputs.
    input_values = processor(
        ds["speech"],
        sampling_rate=ds["sampling_rate"],
        return_tensors="pt",
    ).input_values

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        logits = vi_model(input_values).logits[0]

    # Decode the CTC output with the n-gram language model beam search.
    # (The greedy argmax decode was computed but never used, so it is gone.)
    beam_search_output = ngram_lm_model.decode(
        logits.cpu().detach().numpy(), beam_width=500
    )

    state_vi += beam_search_output + " "
    en_text = translate_vi2en(beam_search_output)
    state_en += en_text + " "
    return state_vi, state_en, state_vi, state_en
|
201 |
+
|
202 |
+
def transcribe_en_rm(audio, state_en="", state_vi=""):
    """Transcribe one chunk of English speech and translate it to Vietnamese.

    Each call appends the text recognised in ``audio`` to the running
    transcripts passed in as ``state_en`` / ``state_vi``.

    Args:
        audio: Audio input handed straight to ``load_data`` (a file path when
            the Gradio audio component uses ``type="filepath"``).
        state_en: English transcript accumulated by previous calls.
        state_vi: Vietnamese translation accumulated by previous calls.

    Returns:
        The tuple ``(state_en, state_vi, state_en, state_vi)``; each updated
        text appears twice (presumably once for display and once as carried
        state -- confirm against the interface wiring).
    """
    speech = load_data(audio)

    # Tokenize the waveform into model inputs.
    input_values = eng_tokenizer(speech, return_tensors="pt").input_values

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        logits = eng_model(input_values).logits

    # Greedy decoding: most likely token id at every time step.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = eng_tokenizer.decode(predicted_ids[0])

    # The decoded text is all upper case, so lower it before re-casing.
    transcription = correct_casing(transcription.lower())

    state_en += transcription + " "
    vi_text = translate_en2vi(transcription)
    state_vi += vi_text + " "
    return state_en, state_vi, state_en, state_vi
|
218 |
+
|
219 |
"""Gradio demo"""
|
220 |
|
221 |
vi_example_text = ["Có phải bạn đang muốn tìm mua nhà ở ngoại ô thành phố Hồ Chí Minh không?",
|
|
|
290 |
inputs=[en_audio_1])
|
291 |
|
292 |
with gr.TabItem("En-Vi Realtime Translation"):
|
293 |
+
# Real-time English -> Vietnamese tab.
#
# Built from Blocks components instead of a nested gr.Interface because:
#   * the previous `outputs` list was missing commas, so its last three
#     string literals were concatenated into a single bogus name while
#     transcribe_en_rm returns four values;
#   * "textbox_en" / "state_en" etc. are not Gradio component shortcuts;
#   * two separate state values are carried between calls
#     (NOTE(review): gr.Interface is understood to accept only one state
#     input/output pair -- confirm for the installed Gradio version);
#   * the page is served by the single demo.launch() at the bottom of the
#     file, so nothing must be launched from inside the layout.
with gr.Row():
    with gr.Column():
        en_audio_2 = gr.Audio(
            source="microphone",
            label="Input English Audio",
            type="filepath",
            streaming=True,
        )
        # Running transcripts carried from one audio chunk to the next.
        state_en_2 = gr.State(value="")
        state_vi_2 = gr.State(value="")
    with gr.Column():
        speech2text_en2 = gr.Textbox(label="English Text")
        vietnamese_out_3 = gr.Textbox(label="Vietnamese Text")

# Input/output order mirrors transcribe_en_rm's parameters and its
# (state_en, state_vi, state_en, state_vi) return tuple.
en_audio_2.stream(
    transcribe_en_rm,
    inputs=[en_audio_2, state_en_2, state_vi_2],
    outputs=[speech2text_en2, vietnamese_out_3, state_en_2, state_vi_2],
)
|
315 |
|
316 |
if __name__ == "__main__":
|
317 |
demo.launch()
|