Update app.py
app.py CHANGED
@@ -6,6 +6,7 @@ import soundfile as sf
 import numpy as np
 from bark import SAMPLE_RATE, generate_audio, preload_models
 import torch.multiprocessing as mp  # Import multiprocessing
+import os
 
 # Load Whisper and Vicuna models (as before)
 ASR_MODEL_NAME = "openai/whisper-medium.en"
@@ -35,63 +36,65 @@ def _preload_and_load_models(): # new function for loading models
 ).to('cuda')  # Explicitly move to CUDA after loading
 
 
-if
-    mp.set_start_method('spawn') #
-    p = mp.Process(target=_preload_and_load_models)
+if "HF_SPACE_ID" in os.environ:  # check whether we are running inside a HF Space
+    mp.set_start_method('spawn', force=True)  # set the start method only in Spaces; force in case one is already set
+    p = mp.Process(target=_preload_and_load_models)
     p.start()
-    p.join()
+    p.join()
+else:  # not in a Space: just load the models in the current process
+    _preload_and_load_models()
+
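Why 'spawn' matters here: once the parent process has touched the GPU, CUDA cannot be re-initialized in a child created with the default 'fork' start method, so model loading is pushed into a spawned subprocess. Note that Spaces runtimes also export SPACE_ID, so it is worth verifying that HF_SPACE_ID is the variable actually set. A minimal, self-contained sketch of the same pattern (the worker name is illustrative, not the app's):

    import torch.multiprocessing as mp

    def _load_models_worker():
        # Stand-in for _preload_and_load_models: with 'spawn' the child is a
        # fresh interpreter, so CUDA initializes cleanly inside it.
        import torch
        print("CUDA available in child:", torch.cuda.is_available())

    if __name__ == "__main__":
        # force=True avoids a RuntimeError if a start method was already set.
        mp.set_start_method("spawn", force=True)
        p = mp.Process(target=_load_models_worker)
        p.start()
        p.join()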
+@spaces.GPU(required=True)
+def process_audio(microphone, state, task="transcribe"):
+    if microphone is None:
+        return state, state, None
+
+    asr_pipe.model.config.forced_decoder_ids = [
+        [2, transcribe_token_id if task == "transcribe" else translate_token_id]
+    ]
+    text = asr_pipe(microphone)["text"]
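The forced_decoder_ids assignment assumes transcribe_token_id and translate_token_id are defined earlier in app.py, outside this hunk. If they are not, they can be looked up from the Whisper tokenizer roughly as below; this is a sketch, and it assumes the English-only whisper-medium.en tokenizer still carries both task tokens in its vocabulary:

    from transformers import pipeline

    asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-medium.en")
    # Whisper task tokens; position 2 of forced_decoder_ids selects the task.
    transcribe_token_id = asr_pipe.tokenizer.convert_tokens_to_ids("<|transcribe|>")
    translate_token_id = asr_pipe.tokenizer.convert_tokens_to_ids("<|translate|>")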
+    system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
+You answer questions clearly and simply, using age-appropriate language.
+You are also a little bit silly and like to make jokes."""
+    prompt = f"{system_prompt}\nUser: {text}"
+
+    with torch.no_grad():
+        vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
+        vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=192)
+    vicuna_response = vicuna_tokenizer.decode(vicuna_output[0], skip_special_tokens=True)  # index [0]: generate returns a batch
+    vicuna_response = vicuna_response.replace(prompt, "").strip()
+    updated_state = state + "\n" + vicuna_response
+
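The str.replace(prompt, "") cleanup only works if the model echoes the prompt verbatim. A more robust variant, sketched here with the same vicuna_* objects app.py already defines, slices off the prompt by token count instead:

    with torch.no_grad():
        vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
        vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=192)
    # Keep only the newly generated tokens, i.e. everything past the prompt.
    new_tokens = vicuna_output[0][vicuna_input["input_ids"].shape[1]:]
    vicuna_response = vicuna_tokenizer.decode(new_tokens, skip_special_tokens=True).strip()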
+    try:
         # Use Bark's generate_audio function directly
+        audio_arr = generate_audio(vicuna_response)  # , history_prompt=None - if needed
 
         # Scale and convert audio (as before)
+        audio_arr = (audio_arr * 32767).astype(np.int16)
 
         # Save audio for debugging
+        sf.write('generated_audio.wav', audio_arr, SAMPLE_RATE)
+
+        audio_output = (SAMPLE_RATE, audio_arr)  # Use the correct SAMPLE_RATE
+
+    except Exception as e:
+        print(f"Error in speech synthesis: {e}")
+        audio_output = None
+
+    return updated_state, updated_state, audio_output
+
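For reference, the Bark calls used in the try block, in isolation: generate_audio returns float32 samples in [-1, 1] at Bark's 24 kHz SAMPLE_RATE, which is why the 32767 scaling yields valid 16-bit PCM. A standalone sketch:

    from bark import SAMPLE_RATE, generate_audio, preload_models
    import numpy as np
    import soundfile as sf

    preload_models()                                # download/load Bark checkpoints
    audio = generate_audio("Hello there, friend!")  # float32 array at 24 kHz
    pcm16 = (audio * 32767).astype(np.int16)        # scale floats to 16-bit PCM
    sf.write("generated_audio.wav", pcm16, SAMPLE_RATE)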
+with gr.Blocks(title="Whisper, Vicuna, & Bark Demo") as demo:
+    gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Bark")
+    gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
+    with gr.Tab("Transcribe & Synthesize"):
+        mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here")
+        transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
+        audio_output = gr.Audio(label="Synthesized Speech", type="numpy")
+        transcription_state = gr.State(value="")
+        mic_input.change(
+            fn=process_audio,  # Call the combined function
+            inputs=[mic_input, transcription_state],
+            outputs=[transcription_output, transcription_state, audio_output]
+        )
+
+demo.launch(share=False)
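The outputs list order is what ties the UI together: process_audio returns (text, state, audio), which map one-to-one onto [transcription_output, transcription_state, audio_output]. A runnable stub with the same wiring (the echo function is illustrative only):

    import gradio as gr

    def echo(filepath, state):
        state = (state + "\n" + f"heard: {filepath}").strip()
        return state, state, None  # textbox value, new state, no audio yet

    with gr.Blocks() as demo:
        mic = gr.Audio(sources="microphone", type="filepath", label="Speak Here")
        out = gr.Textbox(lines=5, label="Log")
        audio = gr.Audio(label="Synthesized Speech", type="numpy")
        state = gr.State(value="")
        mic.change(fn=echo, inputs=[mic, state], outputs=[out, state, audio])

    demo.launch()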