|
import os
import subprocess
import sys

import gradio as gr
|
|
|
# Import the Hugging Face pipeline factory, installing transformers on the
# fly when the environment does not already provide it.
try:
    from transformers import pipeline
except ModuleNotFoundError:
    print("Installing transformers...")
    # Run pip through the interpreter executing this script so the package is
    # installed into the same environment; a bare "pip" found on PATH may
    # belong to a different Python installation.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers"])
    from transformers import pipeline
|
|
|
import torch |
|
import torchaudio |
|
from speechbrain.pretrained import EncoderClassifier |
|
|
|
|
|
# Task name shared by the three speech-recognition pipelines below.
_ASR_TASK = "automatic-speech-recognition"

# English transcription with Whisper, explicitly run in float32 on the CPU.
asr_pipe = pipeline(
    _ASR_TASK,
    model="openai/whisper-base.en",
    torch_dtype=torch.float32,
    device="cpu",
)

# Two phoneme-level recognizers, loaded with the pipeline defaults.
american_phoneme_pipe = pipeline(
    _ASR_TASK,
    model="vitouphy/wav2vec2-xls-r-300m-timit-phoneme",
)
esl_phoneme_pipe = pipeline(
    _ASR_TASK,
    model="mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme",
)

# SpeechBrain accent classifier; its files are stored under savedir.
classifier = EncoderClassifier.from_hparams(
    source="Jzuluaga/accent-id-commonaccent_ecapa",
    savedir="pretrained_models/accent-id-commonaccent_ecapa",
)
|
|
|
def native_accent_classifier(file):
    """Classify the accent of the audio in *file* with the module-level classifier.

    Returns a one-element list containing a dict with the predicted label
    under ``'accent'`` and its score, rounded to two decimals, under
    ``'score'``.
    """
    # classify_file returns four values; only the score and the text label
    # are needed here.
    _, best_score, _, labels = classifier.classify_file(file)
    return [{'accent': labels[0], 'score': round(best_score.item(), 2)}]
|
|
|
# Lazily created audio-classification pipeline, reused across calls so the
# model is not reloaded for every request.
_esl_accent_pipe = None


def _get_esl_accent_pipe():
    """Return the ESL accent pipeline, building it on first use."""
    global _esl_accent_pipe
    if _esl_accent_pipe is None:
        _esl_accent_pipe = pipeline(
            "audio-classification",
            model="kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2",
        )
    return _esl_accent_pipe


def esl_accent_classifier(file):
    """Classify the speaker's accent in *file* with the L2-ARCTIC model.

    The audio is loaded with torchaudio, resampled to 16 kHz and mixed down
    to a single channel before it is passed to the pipeline.

    Returns a one-element list containing a dict with the top label under
    ``'accent'`` and its score, rounded to two decimals, under ``'score'``.
    """
    audio, sr = torchaudio.load(file)
    audio = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(audio)
    # torchaudio.load returns a (channels, frames) tensor by default.
    # Averaging over the channel axis always yields a 1-D waveform: mono input
    # is unchanged, and multi-channel input is down-mixed instead of being
    # handed to the pipeline as a 2-D array.
    audio = audio.mean(dim=0).numpy()
    result = _get_esl_accent_pipe()(audio, top_k=6)
    best = result[0]
    return [{'accent': best['label'], 'score': round(best['score'], 2)}]
|
|
|
def _run_model(model_name, run, fallback):
    """Call *run* and return its result, or *fallback* if it raises.

    The failure is printed together with *model_name*, so one failing model
    does not stop the remaining models from reporting their results.
    """
    try:
        return run()
    except Exception as e:
        print(f"An error occurred with {model_name}: {e}")
        return fallback


def transcribe_and_classify_speech(file):
    """Run every speech model on *file* and collect the results.

    Each model runs independently: when one raises, the error is printed and
    a placeholder value is reported for that model only.

    Returns a list of five single-key dicts, in this order:
    ``transcription``, ``phonemes_native_eng``, ``phonemes_eng_second_lang``,
    ``native_eng_country`` and ``first_lang_if_not_eng``. The two accent
    entries are always a one-element list holding a dict with ``'accent'``
    and ``'score'`` keys, on success and on failure alike.
    """
    mono_hint = "Error, make sure your file is in mono format"

    asr_output = _run_model(
        "openai/whisper-base.en",
        lambda: asr_pipe(
            file,
            max_new_tokens=256,
            chunk_length_s=30,
            batch_size=8,
        )["text"],
        mono_hint,
    )
    american_phoneme_output = _run_model(
        "wav2vec2-xls-r-300m-timit-phoneme",
        lambda: american_phoneme_pipe(file)['text'],
        mono_hint,
    )
    esl_phoneme_output = _run_model(
        "mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme",
        lambda: esl_phoneme_pipe(file)['text'],
        "Error",
    )
    # The accent fallbacks use the same shape as a successful classification
    # (one dict carrying both keys), so consumers can read
    # result[0]['accent'] and result[0]['score'] unconditionally.
    native_accent_output = _run_model(
        "Jzuluaga/accent-id-commonaccent_ecapa",
        lambda: native_accent_classifier(file),
        [{'accent': "Error", 'score': .0}],
    )
    esl_accent_output = _run_model(
        "kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2",
        lambda: esl_accent_classifier(file),
        [{'accent': 'Unknown-please upload single channel audio', 'score': .0}],
    )

    return [
        {'transcription': asr_output},
        {'phonemes_native_eng': american_phoneme_output},
        {'phonemes_eng_second_lang': esl_phoneme_output},
        {'native_eng_country': native_accent_output},
        {'first_lang_if_not_eng': esl_accent_output},
    ]
|
|
|
|
|
# Example audio files offered to the user. Each inner list is one example row
# holding the value for a single input component.
examples = [
    ['chinese-american.wav'],
    ['mexican.wav'],
    ['vietnamese.wav'],
    ['indian.wav'],
    ['nigerian.wav'],
    ['irish.wav'],
]
|
|
|
|
|
def create_transcription_interface(source):
    """Build a Blocks page that analyses audio captured from *source*.

    *source* is forwarded to ``gr.Audio(sources=...)``. Whenever the audio
    value changes, ``transcribe_and_classify_speech`` is called with the file
    path and its result is shown in a JSON component. The module-level
    ``examples`` are attached as selectable inputs.

    Returns the assembled ``gr.Blocks`` instance.
    """
    with gr.Blocks() as interface:
        gr.Markdown("""
        Use microphone, upload .wav file, or choose an example below. Output will include results from the following models:
        - Transcription from OpenAI's Whisper [openai/whisper-base.en](https://huggingface.co/openai/whisper-base.en)
        - Phonemic transcription trained on native English speakers [vitouphy/wav2vec2-xls-r-300m-timit-phoneme](https://huggingface.co/vitouphy/wav2vec2-xls-r-300m-timit-phoneme)
        - Phonemic transcription trained on speakers of English as a second language [mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme](https://huggingface.co/mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme)
        - Accent classification trained on native English speakers [Jzuluaga/accent-id-commonaccent_ecapa](https://huggingface.co/Jzuluaga/accent-id-commonaccent_ecapa)
        - Accent classification trained on speakers of English as a second language [kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2](https://huggingface.co/kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2)
        """)

        # Audio input above, JSON results below.
        with gr.Column():
            audio_in = gr.Audio(sources=source, type="filepath", label="Upload Audio")
            results_view = gr.JSON(label="Results")

        # Re-run all models every time the audio value changes.
        audio_in.change(
            fn=transcribe_and_classify_speech,
            inputs=audio_in,
            outputs=results_view,
        )
        gr.Examples(examples=examples, inputs=[audio_in])

    return interface
|
|
|
|
|
# One page per input source, created in the same order as the tabs below.
mic_transcribe = create_transcription_interface("microphone")
file_transcribe = create_transcription_interface("upload")

# Combine both pages into a single tabbed app and start serving it.
demo = gr.TabbedInterface(
    interface_list=[mic_transcribe, file_transcribe],
    tab_names=["Microphone Input", "Upload .wav file"],
    title="Speech Recognition and Accent Classification",
)

demo.launch()
|
|