Spaces: Running on Zero

Update app.py

app.py CHANGED
@@ -1,12 +1,12 @@
 import spaces
 import torch
 import gradio as gr
-from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoModel
-import soundfile as sf
+from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoModel  # Removed AutoProcessor
+import soundfile as sf  # For saving audio (debugging)
 import numpy as np
-import
-import os
+from bark import SAMPLE_RATE, generate_audio, preload_models  # Import Bark functions
 
+# Load Whisper and Vicuna models (as before)
 ASR_MODEL_NAME = "openai/whisper-medium.en"
 asr_pipe = pipeline(
     task="automatic-speech-recognition",
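Note on the new import: `from bark import SAMPLE_RATE, generate_audio, preload_models` relies on the standalone Bark package from the suno-ai/bark repository rather than the transformers wrappers removed below. It is typically installed straight from GitHub, e.g. `pip install git+https://github.com/suno-ai/bark.git`; the same URL would need to appear in the Space's requirements.txt for this commit to run.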
@@ -19,10 +19,6 @@ all_special_ids = asr_pipe.tokenizer.all_special_ids
 transcribe_token_id = all_special_ids[-5]
 translate_token_id = all_special_ids[-6]
 
-TTS_MODEL_NAME = "suno/bark"
-tts_processor = AutoProcessor.from_pretrained(TTS_MODEL_NAME)
-tts_model = AutoModel.from_pretrained(TTS_MODEL_NAME).to('cuda')
-
 VICUNA_MODEL_NAME = "lmsys/vicuna-7b-v1.5"
 vicuna_tokenizer = AutoTokenizer.from_pretrained(VICUNA_MODEL_NAME)
 vicuna_model = AutoModelForCausalLM.from_pretrained(
@@ -31,10 +27,14 @@ vicuna_model = AutoModelForCausalLM.from_pretrained(
     device_map="auto",
 )
 
+# Preload Bark models (crucial for efficiency)
+preload_models()  # No need for the DEBUG_MODE check here; preload always
+
 @spaces.GPU(required=True)
 def process_audio(microphone, state, task="transcribe"):
     if microphone is None:
         return state, state, None
+
     asr_pipe.model.config.forced_decoder_ids = [
         [2, transcribe_token_id if task == "transcribe" else translate_token_id]
     ]
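Review note on the preload: loading everything up front avoids a first-request stall, but the full Bark checkpoints are large. If the Space's GPU memory is tight, the package also exposes small-model flags; a hedged sketch (keyword names as defined in suno-ai/bark's generation module, worth verifying against the installed version):

    from bark import preload_models

    # Optional: trade quality for memory by preloading the small checkpoints.
    # Flag names are assumptions based on suno-ai/bark; verify before relying on them.
    preload_models(
        text_use_small=True,
        coarse_use_small=True,
        fine_use_small=True,
    )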
@@ -43,21 +43,30 @@ def process_audio(microphone, state, task="transcribe"):
     You answer questions clearly and simply, using age-appropriate language.
     You are also a little bit silly and like to make jokes."""
     prompt = f"{system_prompt}\nUser: {text}"
+
     with torch.no_grad():
         vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
         vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=192)
-    vicuna_response = vicuna_tokenizer.decode(vicuna_output
+    vicuna_response = vicuna_tokenizer.decode(vicuna_output[0], skip_special_tokens=True)
     vicuna_response = vicuna_response.replace(prompt, "").strip()
     updated_state = state + "\n" + vicuna_response
+
     try:
-
-
-
-
-
+        # Use Bark's generate_audio function directly
+        audio_arr = generate_audio(vicuna_response)  # , history_prompt=None - if needed
+
+        # Scale and convert audio (as before)
+        audio_arr = (audio_arr * 32767).astype(np.int16)
+
+        # Save audio for debugging
+        sf.write('generated_audio.wav', audio_arr, SAMPLE_RATE)
+
+        audio_output = (SAMPLE_RATE, audio_arr)  # Use the correct SAMPLE_RATE
+
     except Exception as e:
         print(f"Error in speech synthesis: {e}")
         audio_output = None
+
     return updated_state, updated_state, audio_output
 
 with gr.Blocks(title="Whisper, Vicuna, & Bark Demo") as demo:
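Review note on the decode line above: `generate()` returns a 2-D tensor of token ids with the batch dimension first, so the response must be decoded from row 0 rather than from the whole batch (decoding the 2-D tensor directly raises a type error). A minimal self-contained illustration, using a tiny public checkpoint as a stand-in for Vicuna:

    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Tiny stand-in model so the example runs quickly; the shape behaviour
    # is the same for lmsys/vicuna-7b-v1.5.
    tok = AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2")
    model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")

    inputs = tok("Hello", return_tensors="pt")
    out = model.generate(**inputs, max_new_tokens=8)
    print(out.shape)  # torch.Size([1, seq_len]) -- batch dimension first
    text = tok.decode(out[0], skip_special_tokens=True)  # decode row 0, not the batch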
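For reference, the new synthesis path can be exercised outside the Space; a minimal sketch assuming the suno-ai Bark package is installed (the output filename is illustrative, matching the debug write in the diff):

    import numpy as np
    import soundfile as sf
    from bark import SAMPLE_RATE, generate_audio, preload_models

    preload_models()  # downloads and caches the Bark checkpoints on first run

    text = "Hello there! Did you know octopuses have three hearts?"
    audio_arr = generate_audio(text)                 # float32 waveform in [-1, 1]
    audio_int16 = (audio_arr * 32767).astype(np.int16)

    # Save to disk for inspection; gr.Audio also accepts the
    # (sample_rate, int16 array) tuple that process_audio returns.
    sf.write("generated_audio.wav", audio_int16, SAMPLE_RATE)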