import spaces
import torch
import gradio as gr
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import soundfile as sf
import numpy as np
from bark import SAMPLE_RATE, generate_audio, preload_models

# Load the Whisper ASR pipeline.
ASR_MODEL_NAME = "openai/whisper-medium.en"
asr_pipe = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    chunk_length_s=30,
    device="cuda",
)
all_special_ids = asr_pipe.tokenizer.all_special_ids
transcribe_token_id = all_special_ids[-5]
translate_token_id = all_special_ids[-6]


def _preload_and_load_models():
    global vicuna_tokenizer, vicuna_model
    # Load the language model (GPT-Neo stands in for Vicuna here).
    VICUNA_MODEL_NAME = "EleutherAI/gpt-neo-2.7B"  # or another causal LM
    vicuna_tokenizer = AutoTokenizer.from_pretrained(VICUNA_MODEL_NAME)
    vicuna_model = AutoModelForCausalLM.from_pretrained(
        VICUNA_MODEL_NAME,
        torch_dtype=torch.float16,
        device_map="auto",  # accelerate places the weights on the GPU; no extra .to('cuda') needed
    )
    # Download and cache the Bark models (text, coarse, fine, codec).
    preload_models()


if __name__ == "__main__":
    # Load everything in the main process. Loading the models in a spawned
    # subprocess would not work here: globals set in a child process are never
    # visible to the parent process that runs the Gradio app.
    _preload_and_load_models()


@spaces.GPU(required=True)
def process_audio(microphone, state, task="transcribe"):
    if microphone is None:
        return state, state, None
    asr_pipe.model.config.forced_decoder_ids = [
        [2, transcribe_token_id if task == "transcribe" else translate_token_id]
    ]
    text = asr_pipe(microphone)["text"]
    system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9). You answer questions clearly and simply, using age-appropriate language.
You are also a little bit silly and like to make jokes."""
    prompt = f"{system_prompt}\nUser: {text}"
    with torch.no_grad():
        vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to("cuda")
        vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=192)
        # generate() returns a batch of token-id sequences; decode the first one.
        vicuna_response = vicuna_tokenizer.decode(vicuna_output[0], skip_special_tokens=True)
    vicuna_response = vicuna_response.replace(prompt, "").strip()
    updated_state = state + "\n" + vicuna_response
    try:
        # Synthesize speech with Bark's generate_audio (a history_prompt /
        # speaker preset could be passed here if needed).
        audio_arr = generate_audio(vicuna_response)
        # Bark returns float32 samples in [-1, 1]; convert to 16-bit PCM.
        audio_arr = (audio_arr * 32767).astype(np.int16)
        # Save the audio to disk for debugging.
        sf.write("generated_audio.wav", audio_arr, SAMPLE_RATE)
        audio_output = (SAMPLE_RATE, audio_arr)  # use Bark's SAMPLE_RATE
    except Exception as e:
        print(f"Error in speech synthesis: {e}")
        audio_output = None
    return updated_state, updated_state, audio_output


with gr.Blocks(title="Whisper, Vicuna, & Bark Demo") as demo:
    gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Bark")
    gr.Markdown(
        "Speak into your microphone, get a transcription, Vicuna will process it, "
        "and then you'll hear the result!"
    )
    with gr.Tab("Transcribe & Synthesize"):
        mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here")
        transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
        audio_output = gr.Audio(label="Synthesized Speech", type="numpy")
        transcription_state = gr.State(value="")
        mic_input.change(
            fn=process_audio,  # transcribe, generate a reply, then synthesize speech
            inputs=[mic_input, transcription_state],
            outputs=[transcription_output, transcription_state, audio_output],
        )

demo.launch(share=False)