ford442 committed
Commit b1622fb · verified · 1 Parent(s): 7694c3e

Update app.py

Files changed (1)
  1. app.py +77 -68
app.py CHANGED
@@ -1,10 +1,11 @@
 import spaces
 import torch
 import gradio as gr
-from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoModel # Removed AutoProcessor
-import soundfile as sf # For saving audio (debugging)
+from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoModel
+import soundfile as sf
 import numpy as np
-from bark import SAMPLE_RATE, generate_audio, preload_models # Import Bark functions
+from bark import SAMPLE_RATE, generate_audio, preload_models
+import torch.multiprocessing as mp # Import multiprocessing
 
 # Load Whisper and Vicuna models (as before)
 ASR_MODEL_NAME = "openai/whisper-medium.en"
@@ -19,70 +20,78 @@ all_special_ids = asr_pipe.tokenizer.all_special_ids
 transcribe_token_id = all_special_ids[-5]
 translate_token_id = all_special_ids[-6]
 
-#VICUNA_MODEL_NAME = "lmsys/vicuna-7b-v1.5"
-VICUNA_MODEL_NAME = "EleutherAI/gpt-neo-2.7B"
-
-vicuna_tokenizer = AutoTokenizer.from_pretrained(VICUNA_MODEL_NAME)
-vicuna_model = AutoModelForCausalLM.from_pretrained(
-    VICUNA_MODEL_NAME,
-    torch_dtype=torch.float16,
-    device_map="auto",
-)
-
-# Preload Bark models (crucial for efficiency)
-preload_models() # No need for the DEBUG_MODE check here; preload always
-
-@spaces.GPU(required=True)
-def process_audio(microphone, state, task="transcribe"):
-    if microphone is None:
-        return state, state, None
-
-    asr_pipe.model.config.forced_decoder_ids = [
-        [2, transcribe_token_id if task == "transcribe" else translate_token_id]
-    ]
-    text = asr_pipe(microphone)["text"]
-    system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
-You answer questions clearly and simply, using age-appropriate language.
-You are also a little bit silly and like to make jokes."""
-    prompt = f"{system_prompt}\nUser: {text}"
-
-    with torch.no_grad():
-        vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
-        vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=192)
-        vicuna_response = vicuna_tokenizer.decode(vicuna_output, skip_special_tokens=True)
-        vicuna_response = vicuna_response.replace(prompt, "").strip()
-    updated_state = state + "\n" + vicuna_response
-
-    try:
-        # Use Bark's generate_audio function directly
-        audio_arr = generate_audio(vicuna_response) # , history_prompt=None - if needed
-
-        # Scale and convert audio (as before)
-        audio_arr = (audio_arr * 32767).astype(np.int16)
-
-        # Save audio for debugging
-        sf.write('generated_audio.wav', audio_arr, SAMPLE_RATE)
-
-        audio_output = (SAMPLE_RATE, audio_arr) # Use the correct SAMPLE_RATE
-
-    except Exception as e:
-        print(f"Error in speech synthesis: {e}")
-        audio_output = None
-
-    return updated_state, updated_state, audio_output
-
-with gr.Blocks(title="Whisper, Vicuna, & Bark Demo") as demo:
-    gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Bark")
-    gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
-    with gr.Tab("Transcribe & Synthesize"):
-        mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here")
-        transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
-        audio_output = gr.Audio(label="Synthesized Speech", type="numpy")
-        transcription_state = gr.State(value="")
-        mic_input.change(
-            fn=process_audio, # Call the combined function
-            inputs=[mic_input, transcription_state],
-            outputs=[transcription_output, transcription_state, audio_output]
-        )
+def _preload_and_load_models(): # new function for loading models
+    # Preload Bark models (now inside the function)
+    preload_models()
+
+    # Vicuna model loading (now inside the function)
+    global vicuna_tokenizer, vicuna_model # make global to be used in process_audio
+    VICUNA_MODEL_NAME = "EleutherAI/gpt-neo-2.7B" # Or another model
+    vicuna_tokenizer = AutoTokenizer.from_pretrained(VICUNA_MODEL_NAME)
+    vicuna_model = AutoModelForCausalLM.from_pretrained(
+        VICUNA_MODEL_NAME,
+        torch_dtype=torch.float16,
+        device_map="auto", # or .to('cuda')
+    ).to('cuda') # Explicitly move to CUDA after loading
+
+
+if __name__ == "__main__": # important for multiprocessing to work
+    mp.set_start_method('spawn') # Important for Spaces
+    p = mp.Process(target=_preload_and_load_models) # new process to load models
+    p.start()
+    p.join() # wait for the models to load
+
+@spaces.GPU(required=True)
+def process_audio(microphone, state, task="transcribe"):
+    if microphone is None:
+        return state, state, None
+
+    asr_pipe.model.config.forced_decoder_ids = [
+        [2, transcribe_token_id if task == "transcribe" else translate_token_id]
+    ]
+    text = asr_pipe(microphone)["text"]
+    system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
+You answer questions clearly and simply, using age-appropriate language.
+You are also a little bit silly and like to make jokes."""
+    prompt = f"{system_prompt}\nUser: {text}"
+
+    with torch.no_grad():
+        vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
+        vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=192)
+        vicuna_response = vicuna_tokenizer.decode(vicuna_output, skip_special_tokens=True)
+        vicuna_response = vicuna_response.replace(prompt, "").strip()
+    updated_state = state + "\n" + vicuna_response
+
+    try:
+        # Use Bark's generate_audio function directly
+        audio_arr = generate_audio(vicuna_response) # , history_prompt=None - if needed
+
+        # Scale and convert audio (as before)
+        audio_arr = (audio_arr * 32767).astype(np.int16)
+
+        # Save audio for debugging
+        sf.write('generated_audio.wav', audio_arr, SAMPLE_RATE)
+
+        audio_output = (SAMPLE_RATE, audio_arr) # Use the correct SAMPLE_RATE
+
+    except Exception as e:
+        print(f"Error in speech synthesis: {e}")
+        audio_output = None
+
+    return updated_state, updated_state, audio_output
+
+with gr.Blocks(title="Whisper, Vicuna, & Bark Demo") as demo:
+    gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Bark")
+    gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
+    with gr.Tab("Transcribe & Synthesize"):
+        mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here")
+        transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
+        audio_output = gr.Audio(label="Synthesized Speech", type="numpy")
+        transcription_state = gr.State(value="")
+        mic_input.change(
+            fn=process_audio, # Call the combined function
+            inputs=[mic_input, transcription_state],
+            outputs=[transcription_output, transcription_state, audio_output]
+        )
 
 demo.launch(share=False)
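A few details of the new loading path are worth noting. First, with the 'spawn' start method each child process gets a fresh Python interpreter, so the `global vicuna_tokenizer, vicuna_model` assignments inside `_preload_and_load_models` rebind only the child's copy of the module; after `p.join()` returns, the parent's globals are still unset. A minimal standalone sketch of that behavior (illustrative, not from the commit):

import torch.multiprocessing as mp

model = None  # module-level global in the parent process

def load():
    global model
    model = "loaded"  # rebinds only the child's copy of the global

if __name__ == "__main__":
    mp.set_start_method("spawn")
    p = mp.Process(target=load)
    p.start()
    p.join()
    print(model)  # prints None: the parent never saw the assignment

So the subprocess can warm Bark's download cache on disk, but anything `process_audio` needs at runtime still has to be loaded in the process that serves requests.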
 
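Second, `generate` returns a 2-D tensor of shape `[batch, seq_len]`, while `tokenizer.decode` expects a single sequence of ids; both the old and new versions pass the whole batch, which typically raises a TypeError. A sketch of the decode step as it is usually written, using a smaller sibling model so the example stays lightweight (the model choice is illustrative, not from the commit):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "EleutherAI/gpt-neo-125m"  # illustrative stand-in for gpt-neo-2.7B
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

prompt = "User: Why is the sky blue?"
inputs = tokenizer(prompt, return_tensors="pt")
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=32)

# output_ids has shape [batch, seq_len]; decode row 0, not the whole tensor
response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(response.replace(prompt, "").strip())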
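Finally, on the audio handoff: Bark returns float32 samples in roughly [-1.0, 1.0] at 24 kHz, and `gr.Audio(type="numpy")` accepts a `(sample_rate, int16_array)` tuple, which is what the `* 32767` scaling produces. A small sketch of that conversion with clipping added, since synthesized peaks can exceed the nominal range (the helper name is ours, not from the commit):

import numpy as np

SAMPLE_RATE = 24000  # Bark's output sample rate

def float_to_pcm16(audio: np.ndarray) -> np.ndarray:
    """Convert float audio in [-1.0, 1.0] to 16-bit PCM, clipping stray peaks first."""
    audio = np.clip(audio, -1.0, 1.0)
    return (audio * 32767).astype(np.int16)

# Example with a synthetic 440 Hz tone standing in for Bark output:
t = np.linspace(0.0, 1.0, SAMPLE_RATE, endpoint=False)
pcm = float_to_pcm16(0.5 * np.sin(2 * np.pi * 440.0 * t))
audio_output = (SAMPLE_RATE, pcm)  # the tuple shape gr.Audio(type="numpy") expects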