ford442 committed
Commit e0628e1 · verified · 1 Parent(s): 9bd6434

Update app.py

Files changed (1)
  1. app.py +57 -54
app.py CHANGED
@@ -6,6 +6,7 @@ import soundfile as sf
 import numpy as np
 from bark import SAMPLE_RATE, generate_audio, preload_models
 import torch.multiprocessing as mp # Import multiprocessing
+import os
 
 # Load Whisper and Vicuna models (as before)
 ASR_MODEL_NAME = "openai/whisper-medium.en"
@@ -35,63 +36,65 @@ def _preload_and_load_models(): # new function for loading models
     ).to('cuda') # Explicitly move to CUDA after loading
 
 
-if __name__ == "__main__": # important for multiprocessing to work
-    mp.set_start_method('spawn') # Important for Spaces
-    p = mp.Process(target=_preload_and_load_models) # new process to load models
+if "HF_SPACE_ID" in os.environ: # checking if we are in HF spaces
+    mp.set_start_method('spawn', force=True) # Set start method ONLY in Spaces, force if needed
+    p = mp.Process(target=_preload_and_load_models)
     p.start()
-    p.join() # wait for the models to load
-
-@spaces.GPU(required=True)
-def process_audio(microphone, state, task="transcribe"):
-    if microphone is None:
-        return state, state, None
-
-    asr_pipe.model.config.forced_decoder_ids = [
-        [2, transcribe_token_id if task == "transcribe" else translate_token_id]
-    ]
-    text = asr_pipe(microphone)["text"]
-    system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
-    You answer questions clearly and simply, using age-appropriate language.
-    You are also a little bit silly and like to make jokes."""
-    prompt = f"{system_prompt}\nUser: {text}"
-
-    with torch.no_grad():
-        vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
-        vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=192)
-        vicuna_response = vicuna_tokenizer.decode(vicuna_output, skip_special_tokens=True)
-    vicuna_response = vicuna_response.replace(prompt, "").strip()
-    updated_state = state + "\n" + vicuna_response
-
-    try:
+    p.join()
+else: # if not in spaces just load the models
+    _preload_and_load_models()
+
+@spaces.GPU(required=True)
+def process_audio(microphone, state, task="transcribe"):
+    if microphone is None:
+        return state, state, None
+
+    asr_pipe.model.config.forced_decoder_ids = [
+        [2, transcribe_token_id if task == "transcribe" else translate_token_id]
+    ]
+    text = asr_pipe(microphone)["text"]
+    system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
+    You answer questions clearly and simply, using age-appropriate language.
+    You are also a little bit silly and like to make jokes."""
+    prompt = f"{system_prompt}\nUser: {text}"
+
+    with torch.no_grad():
+        vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
+        vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=192)
+        vicuna_response = vicuna_tokenizer.decode(vicuna_output, skip_special_tokens=True)
+    vicuna_response = vicuna_response.replace(prompt, "").strip()
+    updated_state = state + "\n" + vicuna_response
+
+    try:
         # Use Bark's generate_audio function directly
-        audio_arr = generate_audio(vicuna_response) #, history_prompt=None - if needed
+        audio_arr = generate_audio(vicuna_response) #, history_prompt=None - if needed
 
         # Scale and convert audio (as before)
-        audio_arr = (audio_arr * 32767).astype(np.int16)
+        audio_arr = (audio_arr * 32767).astype(np.int16)
 
         # Save audio for debugging
-        sf.write('generated_audio.wav', audio_arr, SAMPLE_RATE)
-
-        audio_output = (SAMPLE_RATE, audio_arr) # Use the correct SAMPLE_RATE
-
-    except Exception as e:
-        print(f"Error in speech synthesis: {e}")
-        audio_output = None
-
-    return updated_state, updated_state, audio_output
-
-with gr.Blocks(title="Whisper, Vicuna, & Bark Demo") as demo:
-    gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Bark")
-    gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
-    with gr.Tab("Transcribe & Synthesize"):
-        mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here")
-        transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
-        audio_output = gr.Audio(label="Synthesized Speech", type="numpy")
-        transcription_state = gr.State(value="")
-        mic_input.change(
-            fn=process_audio, # Call the combined function
-            inputs=[mic_input, transcription_state],
-            outputs=[transcription_output, transcription_state, audio_output]
-        )
-
-demo.launch(share=False)
+        sf.write('generated_audio.wav', audio_arr, SAMPLE_RATE)
+
+        audio_output = (SAMPLE_RATE, audio_arr) # Use the correct SAMPLE_RATE
+
+    except Exception as e:
+        print(f"Error in speech synthesis: {e}")
+        audio_output = None
+
+    return updated_state, updated_state, audio_output
+
+with gr.Blocks(title="Whisper, Vicuna, & Bark Demo") as demo:
+    gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Bark")
+    gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
+    with gr.Tab("Transcribe & Synthesize"):
+        mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here")
+        transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
+        audio_output = gr.Audio(label="Synthesized Speech", type="numpy")
+        transcription_state = gr.State(value="")
+        mic_input.change(
+            fn=process_audio, # Call the combined function
+            inputs=[mic_input, transcription_state],
+            outputs=[transcription_output, transcription_state, audio_output]
+        )
+
+demo.launch(share=False)
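
For reference, a minimal standalone sketch (not part of the commit) of the start-method guard the new code adopts: set the 'spawn' start method only when the HF_SPACE_ID environment variable is present (the marker this commit checks to detect a Hugging Face Space) and load models in a child process there, otherwise load in the current process. The _load_models function below is a simplified stand-in for the app's _preload_and_load_models.

# Hypothetical sketch; _load_models stands in for the app's real model-loading function.
import os
import torch.multiprocessing as mp

def _load_models():
    # Heavy model loading would happen here, inside its own process.
    print(f"loading models in PID {os.getpid()}")

if __name__ == "__main__":
    if "HF_SPACE_ID" in os.environ:  # the commit uses this variable to detect a Space
        # 'spawn' avoids CUDA-state problems that forked children can hit;
        # force=True overrides any start method already set earlier in the process.
        mp.set_start_method('spawn', force=True)
        p = mp.Process(target=_load_models)
        p.start()
        p.join()  # wait for the loader process to finish before continuing
    else:
        _load_models()  # outside Spaces, just load in the current process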