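"""Gradio app that transcribes speech and classifies the speaker's accent.

Runs one ASR model, two phonemic transcription models, and two accent
classifiers (one trained on native English speakers, one on speakers of
English as a second language) on a single audio file.
"""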
import gradio as gr
import subprocess
import sys

try:
    from transformers import pipeline
except ModuleNotFoundError:
    # Fall back to installing transformers into the current environment at runtime
    print("Installing transformers...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers"])
    from transformers import pipeline  # Retry the import after installation

import torch
import torchaudio
# Note: newer SpeechBrain releases expose this as speechbrain.inference.EncoderClassifier
from speechbrain.pretrained import EncoderClassifier
# Set up pipeline for Whisper ASR (speech-to-text)
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-base.en",
    torch_dtype=torch.float32,
    device="cpu",
)
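# Note: whisper-base.en is English-only. Pinning float32 on CPU is assumed here
# to target CPU-only hosting (e.g., a free Hugging Face Space); on GPU hardware
# the device and dtype could be changed accordingly.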
# Set up pipes for the two phonemic transcription models
american_phoneme_pipe = pipeline("automatic-speech-recognition", model="vitouphy/wav2vec2-xls-r-300m-timit-phoneme")
esl_phoneme_pipe = pipeline("automatic-speech-recognition", model="mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme")
# Set up the native-English accent classifier (the ESL classifier is loaded inside its function below)
classifier = EncoderClassifier.from_hparams(source="Jzuluaga/accent-id-commonaccent_ecapa", savedir="pretrained_models/accent-id-commonaccent_ecapa")
def native_accent_classifier(file):
    # Return the top predicted English accent and its confidence score
    out_prob, score, index, text_lab = classifier.classify_file(file)
    rounded_score = round(score.item(), 2)
    return [{'accent': text_lab[0], 'score': rounded_score}]
def esl_accent_classifier(file):
    # Loaded lazily on first call rather than at startup
    esl_accent_pipe = pipeline(
        "audio-classification",
        model="kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2"
    )
    audio, sr = torchaudio.load(file)  # Load audio
    # The classifier expects 16 kHz input, so resample before classifying
    audio = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(audio)
    # squeeze() only flattens single-channel audio; multi-channel input will
    # fail downstream, hence the mono guidance in the error messages below
    audio = audio.squeeze().numpy()
    result = esl_accent_pipe(audio, top_k=6)
    # Keep only the single best prediction
    return [{'accent': result[0]['label'], 'score': round(result[0]['score'], 2)}]
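# Main entry point: run all five models on one audio file. Each model call is
# wrapped in its own try/except so that a single model failing degrades to an
# error message in the output rather than crashing the whole app.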
def transcribe_and_classify_speech(file):
    try:
        asr_output = asr_pipe(
            file,
            max_new_tokens=256,
            chunk_length_s=30,
            batch_size=8,
        )["text"]
    except Exception as e:
        print(f"An error occurred with openai/whisper-base.en: {e}")
        asr_output = "Error, make sure your file is in mono format"
    try:
        american_phoneme_output = american_phoneme_pipe(file)['text']
    except Exception as e:
        print(f"An error occurred with vitouphy/wav2vec2-xls-r-300m-timit-phoneme: {e}")
        american_phoneme_output = "Error, make sure your file is in mono format"
    try:
        esl_phoneme_output = esl_phoneme_pipe(file)['text']
    except Exception as e:
        print(f"An error occurred with mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme: {e}")
        esl_phoneme_output = "Error"
    try:
        native_accent_output = native_accent_classifier(file)
    except Exception as e:
        print(f"An error occurred with Jzuluaga/accent-id-commonaccent_ecapa: {e}")
        native_accent_output = [{'accent': 'Error', 'score': 0.0}]
    try:
        esl_accent_output = esl_accent_classifier(file)
    except Exception as e:
        print(f"An error occurred with kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2: {e}")
        esl_accent_output = [{'accent': 'Unknown-please upload single channel audio', 'score': 0.0}]
    output = [
        {'transcription': asr_output},
        {'phonemes_native_eng': american_phoneme_output},
        {'phonemes_eng_second_lang': esl_phoneme_output},
        {'native_eng_country': native_accent_output},
        {'first_lang_if_not_eng': esl_accent_output}
    ]
    return output
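# Illustrative output shape (values invented for demonstration):
# [{'transcription': 'hello world'},
#  {'phonemes_native_eng': 'h ə l oʊ ...'},
#  {'phonemes_eng_second_lang': 'h ə l oʊ ...'},
#  {'native_eng_country': [{'accent': 'us', 'score': 0.92}]},
#  {'first_lang_if_not_eng': [{'accent': 'mandarin', 'score': 0.55}]}]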
# Set up the Gradio app
examples = [['chinese-american.wav'], ['mexican.wav'], ['vietnamese.wav'], ['indian.wav'], ['nigerian.wav'], ['irish.wav']]
# Create a function to generate a vertically stacked interface
def create_transcription_interface(source):
    with gr.Blocks() as interface:
        gr.Markdown("""
        Use the microphone, upload a .wav file, or choose an example below. Output will include results from the following models:
        - Transcription from OpenAI's Whisper [openai/whisper-base.en](https://huggingface.co/openai/whisper-base.en)
        - Phonemic transcription trained on native English speakers [vitouphy/wav2vec2-xls-r-300m-timit-phoneme](https://huggingface.co/vitouphy/wav2vec2-xls-r-300m-timit-phoneme)
        - Phonemic transcription trained on speakers of English as a second language [mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme](https://huggingface.co/mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme)
        - Accent classification trained on native English speakers [Jzuluaga/accent-id-commonaccent_ecapa](https://huggingface.co/Jzuluaga/accent-id-commonaccent_ecapa)
        - Accent classification trained on speakers of English as a second language [kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2](https://huggingface.co/kaysrubio/accent-id-distilhubert-finetuned-l2-arctic2)
        """)
        with gr.Column():
            audio_input = gr.Audio(sources=source, type="filepath", label="Upload Audio")
            output = gr.JSON(label="Results")
        audio_input.change(fn=transcribe_and_classify_speech, inputs=audio_input, outputs=output)
        gr.Examples(examples=examples, inputs=[audio_input])
    return interface
# Create two interfaces (one for mic, one for file upload)
mic_transcribe = create_transcription_interface("microphone")
file_transcribe = create_transcription_interface("upload")
demo = gr.TabbedInterface(
    [mic_transcribe, file_transcribe],
    ["Microphone Input", "Upload .wav file"],
    title="Speech Recognition and Accent Classification",
)
demo.launch()
# demo.launch(debug=True)