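"""Whisper -> GPT-Neo -> Bark demo for Hugging Face Spaces.

Speech from the microphone is transcribed with Whisper, answered by a
causal language model (named "Vicuna" throughout, though the checkpoint
used is GPT-Neo 2.7B), and the answer is synthesized to audio with Bark.
"""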
import spaces
import torch
import gradio as gr
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import soundfile as sf
import numpy as np
from bark import SAMPLE_RATE, generate_audio, preload_models
# Load Whisper for speech recognition
ASR_MODEL_NAME = "openai/whisper-medium.en"
asr_pipe = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL_NAME,
    chunk_length_s=30,
    device='cuda',
)
all_special_ids = asr_pipe.tokenizer.all_special_ids
transcribe_token_id = all_special_ids[-5]
translate_token_id = all_special_ids[-6]
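# Note: indexing all_special_ids like this assumes the special-token layout
# of the original Whisper tokenizer (<|transcribe|> and <|translate|> near
# the end). On recent transformers versions, a multilingual checkpoint can
# instead take generate_kwargs={"task": task} in the pipeline call.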
def _preload_and_load_models():
    global vicuna_tokenizer, vicuna_model
    # Load the response model ("Vicuna" in the variable names, though the
    # checkpoint here is GPT-Neo 2.7B)
    VICUNA_MODEL_NAME = "EleutherAI/gpt-neo-2.7B"  # Or another model
    vicuna_tokenizer = AutoTokenizer.from_pretrained(VICUNA_MODEL_NAME)
    vicuna_model = AutoModelForCausalLM.from_pretrained(
        VICUNA_MODEL_NAME,
        torch_dtype=torch.float16,
        device_map="auto",  # already places the weights on the GPU
    )  # no extra .to('cuda'): that clashes with device_map="auto"
    # Bark model loading: bark's public API exposes no per-model name list,
    # so use the preload_models() helper imported above.
    preload_models()
if __name__ == "__main__":
    # Load models directly in this process. Preloading in a spawned
    # subprocess would not help: the globals it sets exist only in the
    # child and vanish when the child exits.
    _preload_and_load_models()
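# On ZeroGPU Spaces the usual pattern is to load weights on the CPU at import
# time and let @spaces.GPU move work to the GPU per call; loading straight to
# CUDA as above assumes the Space has a resident GPU.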
@spaces.GPU
def process_audio(microphone, state, task="transcribe"):
    if microphone is None:
        return state, state, None
    asr_pipe.model.config.forced_decoder_ids = [
        [2, transcribe_token_id if task == "transcribe" else translate_token_id]
    ]
    text = asr_pipe(microphone)["text"]
system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
You answer questions clearly and simply, using age-appropriate language.
You are also a little bit silly and like to make jokes."""
prompt = f"{system_prompt}\nUser: {text}"
    with torch.no_grad():
        vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
        vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=192)
        # generate() returns a batch of sequences; decode the first one
        vicuna_response = vicuna_tokenizer.decode(vicuna_output[0], skip_special_tokens=True)
    vicuna_response = vicuna_response.replace(prompt, "").strip()  # drop the echoed prompt
    updated_state = state + "\n" + vicuna_response
    try:
        # Use Bark's generate_audio function directly
        audio_arr = generate_audio(vicuna_response)  # pass history_prompt=... to pin a voice
        # Scale float audio in [-1, 1] to 16-bit PCM
        audio_arr = (audio_arr * 32767).astype(np.int16)
        # Save audio for debugging
        sf.write('generated_audio.wav', audio_arr, SAMPLE_RATE)
        audio_output = (SAMPLE_RATE, audio_arr)  # Gradio expects (sample_rate, ndarray)
    except Exception as e:
        print(f"Error in speech synthesis: {e}")
        audio_output = None
    return updated_state, updated_state, audio_output
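# Note: Bark generates roughly 13-14 seconds of audio per call, so long
# responses get cut off; for full answers, split the text into sentences,
# synthesize each, and concatenate the clips.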
with gr.Blocks(title="Whisper, Vicuna, & Bark Demo") as demo:
    gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Bark")
    gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
    with gr.Tab("Transcribe & Synthesize"):
        mic_input = gr.Audio(sources=["microphone"], type="filepath", label="Speak Here")
        transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
        audio_output = gr.Audio(label="Synthesized Speech", type="numpy")
        transcription_state = gr.State(value="")
        mic_input.change(
            fn=process_audio,  # ASR -> LLM -> TTS in one call
            inputs=[mic_input, transcription_state],
            outputs=[transcription_output, transcription_state, audio_output],
        )

demo.launch(share=False)
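# Assumed dependencies (not pinned in this file): torch, transformers, gradio,
# soundfile, numpy, spaces (preinstalled on HF Spaces), and suno-ai's bark
# (e.g. pip install git+https://github.com/suno-ai/bark.git).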