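"""Gradio app that transcribes speech and classifies the speaker's accent.

Runs one ASR model, two phonemic transcription models, and two accent
classifiers (one trained on native English speakers, one on speakers of
English as a second language) on a single audio file.
"""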
import gradio as gr
import subprocess
import sys

try:
    from transformers import pipeline
except ModuleNotFoundError:
    # Fall back to installing transformers into the current environment at runtime
    print("Installing transformers...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers"])
    from transformers import pipeline  # Retry the import after installation

import torch
import torchaudio
# Note: newer SpeechBrain releases expose this as speechbrain.inference.EncoderClassifier
from speechbrain.pretrained import EncoderClassifier
# Set up pipeline for Whisper ASR (speech-to-text)
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base.en",
    torch_dtype=torch.float32,
    device="cpu",
)
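# Note: whisper-base.en is English-only. Pinning float32 on CPU is assumed here
# to target CPU-only hosting (e.g., a free Hugging Face Space); on GPU hardware
# the device and dtype could be changed accordingly.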
# Set up pipes for the two phonemic transcription models
american_phoneme_pipe = pipeline("automatic-speech-recognition", model="vitouphy/wav2vec2-xls-r-300m-timit-phoneme")
esl_phoneme_pipe = pipeline("automatic-speech-recognition", model="mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme")
# Set up the native-English accent classifier (the ESL classifier is loaded inside its function below)
classifier = EncoderClassifier.from_hparams(source="Jzuluaga/accent-id-commonaccent_ecapa", savedir="pretrained_models/accent-id-commonaccent_ecapa")
def native_accent_classifier(file):
    # Return the top predicted English accent and its confidence score
    out_prob, score, index, text_lab = classifier.classify_file(file)
    rounded_score = round(score.item(), 2)
    return [{'accent': text_lab[0], 'score': rounded_score}]
def esl_accent_classifier(file):
    # Loaded lazily on first call rather than at startup
    esl_accent_pipe = pipeline(
        "audio-classification",
        model="kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2"
    )
    audio, sr = torchaudio.load(file)  # Load audio
    # The classifier expects 16 kHz input, so resample before classifying
    audio = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(audio)
    # squeeze() only flattens single-channel audio; multi-channel input will
    # fail downstream, hence the mono guidance in the error messages below
    audio = audio.squeeze().numpy()
    result = esl_accent_pipe(audio, top_k=6)
    # Keep only the single best prediction
    return [{'accent': result[0]['label'], 'score': round(result[0]['score'], 2)}]
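# Main entry point: run all five models on one audio file. Each model call is
# wrapped in its own try/except so that a single model failing degrades to an
# error message in the output rather than crashing the whole app.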
def transcribe_and_classify_speech(file):
    try:
        asr_output = asr_pipe(
            file,
            max_new_tokens=256,
            chunk_length_s=30,
            batch_size=8,
        )["text"]
    except Exception as e:
        print(f"An error occurred with openai/whisper-base.en: {e}")
        asr_output = "Error, make sure your file is in mono format"
    try:
        american_phoneme_output = american_phoneme_pipe(file)['text']
    except Exception as e:
        print(f"An error occurred with vitouphy/wav2vec2-xls-r-300m-timit-phoneme: {e}")
        american_phoneme_output = "Error, make sure your file is in mono format"
    try:
        esl_phoneme_output = esl_phoneme_pipe(file)['text']
    except Exception as e:
        print(f"An error occurred with mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme: {e}")
        esl_phoneme_output = "Error"
    try:
        native_accent_output = native_accent_classifier(file)
    except Exception as e:
        print(f"An error occurred with Jzuluaga/accent-id-commonaccent_ecapa: {e}")
        native_accent_output = [{'accent': 'Error', 'score': 0.0}]
    try:
        esl_accent_output = esl_accent_classifier(file)
    except Exception as e:
        print(f"An error occurred with kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2: {e}")
        esl_accent_output = [{'accent': 'Unknown-please upload single channel audio', 'score': 0.0}]
    output = [
        {'transcription': asr_output},
        {'phonemes_native_eng': american_phoneme_output},
        {'phonemes_eng_second_lang': esl_phoneme_output},
        {'native_eng_country': native_accent_output},
        {'first_lang_if_not_eng': esl_accent_output}
    ]
    return output
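# Illustrative output shape (values invented for demonstration):
# [{'transcription': 'hello world'},
#  {'phonemes_native_eng': 'h ə l oʊ ...'},
#  {'phonemes_eng_second_lang': 'h ə l oʊ ...'},
#  {'native_eng_country': [{'accent': 'us', 'score': 0.92}]},
#  {'first_lang_if_not_eng': [{'accent': 'mandarin', 'score': 0.55}]}]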
# Set up the Gradio app
examples = [['chinese-american.wav'], ['mexican.wav'], ['vietnamese.wav'], ['indian.wav'], ['nigerian.wav'], ['irish.wav']]
# Create a function to generate a vertically stacked interface
def create_transcription_interface(source):
    with gr.Blocks() as interface:
        gr.Markdown("""
        Use the microphone, upload a .wav file, or choose an example below. Output will include results from the following models:
        - Transcription from OpenAI's Whisper [openai/whisper-base.en](https://huggingface.co/openai/whisper-base.en)
        - Phonemic transcription trained on native English speakers [vitouphy/wav2vec2-xls-r-300m-timit-phoneme](https://huggingface.co/vitouphy/wav2vec2-xls-r-300m-timit-phoneme)
        - Phonemic transcription trained on speakers of English as a second language [mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme](https://huggingface.co/mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme)
        - Accent classification trained on native English speakers [Jzuluaga/accent-id-commonaccent_ecapa](https://huggingface.co/Jzuluaga/accent-id-commonaccent_ecapa)
        - Accent classification trained on speakers of English as a second language [kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2](https://huggingface.co/kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2)
        """)
        with gr.Column():
            audio_input = gr.Audio(sources=source, type="filepath", label="Upload Audio")
            output = gr.JSON(label="Results")
        audio_input.change(fn=transcribe_and_classify_speech, inputs=audio_input, outputs=output)
        gr.Examples(examples=examples, inputs=[audio_input])
    return interface
# Create two interfaces (one for mic, one for file upload)
mic_transcribe = create_transcription_interface("microphone")
file_transcribe = create_transcription_interface("upload")
demo = gr.TabbedInterface(
    [mic_transcribe, file_transcribe],
    ["Microphone Input", "Upload .wav file"],
    title="Speech Recognition and Accent Classification",
)
demo.launch()
# demo.launch(debug=True)