Update app.py
app.py CHANGED
@@ -21,12 +21,9 @@ all_special_ids = asr_pipe.tokenizer.all_special_ids
 transcribe_token_id = all_special_ids[-5]
 translate_token_id = all_special_ids[-6]

-def _preload_and_load_models():
-
-
-
-    # Vicuna model loading (now inside the function)
-    global vicuna_tokenizer, vicuna_model  # make global to be used in process_audio
+def _preload_and_load_models():
+    global vicuna_tokenizer, vicuna_model
+    # Load Vicuna (as before)
     VICUNA_MODEL_NAME = "EleutherAI/gpt-neo-2.7B"  # Or another model
     vicuna_tokenizer = AutoTokenizer.from_pretrained(VICUNA_MODEL_NAME)
     vicuna_model = AutoModelForCausalLM.from_pretrained(
@@ -35,66 +32,76 @@ def _preload_and_load_models(): # new function for loading models
         device_map="auto",  # or .to('cuda')
     ).to('cuda')  # Explicitly move to CUDA after loading

+    # Bark model loading (modified)
+    from bark.models import (
+        BARK_V0_MODEL_NAMES,
+        BARK_V0_SPEAKER_EMBEDDING_MODEL_NAME,
+    )  # Import model names
+
+    from bark.generation import preload_models as _preload_models  # rename the function
+    _preload_models(BARK_V0_MODEL_NAMES + [BARK_V0_SPEAKER_EMBEDDING_MODEL_NAME])  # load models
+

-if "
+if __name__ == "__main__":
+    if "HF_SPACE_ID" in os.environ:
+        mp.set_start_method('spawn', force=True)
+        p = mp.Process(target=_preload_and_load_models)
+        p.start()
+        p.join()
+    else:
+        _preload_and_load_models()

-@spaces.GPU(required=True)
-def process_audio(microphone, state, task="transcribe"):
+@spaces.GPU(required=True)
+def process_audio(microphone, state, task="transcribe"):
+    if microphone is None:
+        return state, state, None
+
+    asr_pipe.model.config.forced_decoder_ids = [
+        [2, transcribe_token_id if task == "transcribe" else translate_token_id]
+    ]
+    text = asr_pipe(microphone)["text"]
+    system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
+You answer questions clearly and simply, using age-appropriate language.
+You are also a little bit silly and like to make jokes."""
+    prompt = f"{system_prompt}\nUser: {text}"
+
+    with torch.no_grad():
+        vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
+        vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=192)
+        vicuna_response = vicuna_tokenizer.decode(vicuna_output, skip_special_tokens=True)
+    vicuna_response = vicuna_response.replace(prompt, "").strip()
+    updated_state = state + "\n" + vicuna_response
+
+    try:
         # Use Bark's generate_audio function directly
+        audio_arr = generate_audio(vicuna_response)  # , history_prompt=None - if needed

         # Scale and convert audio (as before)
+        audio_arr = (audio_arr * 32767).astype(np.int16)

         # Save audio for debugging
-with gr.Blocks(title="Whisper, Vicuna, & Bark Demo") as demo:
-demo.launch(share=False)
+        sf.write('generated_audio.wav', audio_arr, SAMPLE_RATE)
+
+        audio_output = (SAMPLE_RATE, audio_arr)  # Use the correct SAMPLE_RATE
+
+    except Exception as e:
+        print(f"Error in speech synthesis: {e}")
+        audio_output = None
+
+    return updated_state, updated_state, audio_output
+
+with gr.Blocks(title="Whisper, Vicuna, & Bark Demo") as demo:
+    gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Bark")
+    gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
+    with gr.Tab("Transcribe & Synthesize"):
+        mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here")
+        transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
+        audio_output = gr.Audio(label="Synthesized Speech", type="numpy")
+        transcription_state = gr.State(value="")
+        mic_input.change(
+            fn=process_audio,  # Call the combined function
+            inputs=[mic_input, transcription_state],
+            outputs=[transcription_output, transcription_state, audio_output]
+        )
+
+demo.launch(share=False)
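A note on the forced_decoder_ids lines in the new process_audio: Whisper selects between transcription and translation by forcing a task token at position 2 of the decoder prompt, which is why app.py pulls the task token ids out of all_special_ids. A minimal sketch of the same pattern, assuming a stock Whisper checkpoint (the model name here is illustrative, not from this commit):

from transformers import pipeline

# Load a standard Whisper ASR pipeline (model name assumed for illustration)
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-small")
all_special_ids = asr_pipe.tokenizer.all_special_ids
transcribe_token_id = all_special_ids[-5]
translate_token_id = all_special_ids[-6]

# Force the task token at decoder position 2, exactly as the diff does
asr_pipe.model.config.forced_decoder_ids = [[2, transcribe_token_id]]
text = asr_pipe("sample.wav")["text"]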
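A caveat for anyone copying the Vicuna block: model.generate returns a 2-D (batch, sequence) tensor of token ids, so passing the whole tensor to tokenizer.decode, as the added line does, will error out; decoding the first row is the usual fix. Relatedly, combining device_map="auto" with an explicit .to('cuda') is usually redundant at best. A hedged sketch of the generate/decode step against the stock transformers API:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "EleutherAI/gpt-neo-2.7B"
vicuna_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
vicuna_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to("cuda")

prompt = "User: Why is the sky blue?"  # illustrative prompt, not from the commit
with torch.no_grad():
    vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to("cuda")
    vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=192)
    # Decode the first sequence in the batch, not the whole 2-D tensor
    vicuna_response = vicuna_tokenizer.decode(vicuna_output[0], skip_special_tokens=True)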
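Finally, from bark.models import BARK_V0_MODEL_NAMES does not match the published suno-ai/bark package, so it presumably targets a fork. Against upstream Bark, the preload-and-generate path would look closer to this sketch (names taken from the upstream README, not from this commit):

import numpy as np
from bark import SAMPLE_RATE, generate_audio, preload_models

preload_models()  # downloads and caches the text, coarse, fine, and codec models

audio_arr = generate_audio("Hello!")  # float waveform at SAMPLE_RATE (24 kHz)
audio_int16 = (audio_arr * 32767).astype(np.int16)  # same int16 scaling process_audio uses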