ford442 committed on
Commit 06fb866 · verified · 1 Parent(s): e0628e1

Update app.py

Files changed (1)
  1. app.py +69 -62
app.py CHANGED
@@ -21,12 +21,9 @@ all_special_ids = asr_pipe.tokenizer.all_special_ids
 transcribe_token_id = all_special_ids[-5]
 translate_token_id = all_special_ids[-6]
 
-def _preload_and_load_models(): # new function for loading models
-    # Preload Bark models (now inside the function)
-    preload_models()
-
-    # Vicuna model loading (now inside the function)
-    global vicuna_tokenizer, vicuna_model # make global to be used in process_audio
+def _preload_and_load_models():
+    global vicuna_tokenizer, vicuna_model
+    # Load Vicuna (as before)
     VICUNA_MODEL_NAME = "EleutherAI/gpt-neo-2.7B" # Or another model
     vicuna_tokenizer = AutoTokenizer.from_pretrained(VICUNA_MODEL_NAME)
     vicuna_model = AutoModelForCausalLM.from_pretrained(
@@ -35,66 +32,76 @@ def _preload_and_load_models(): # new function for loading models
         device_map="auto", # or.to('cuda')
     ).to('cuda') # Explicitly move to CUDA after loading
 
+    # Bark model loading (modified)
+    from bark.models import (
+        BARK_V0_MODEL_NAMES,
+        BARK_V0_SPEAKER_EMBEDDING_MODEL_NAME,
+    ) # Import model names
+
+    from bark.generation import preload_models as _preload_models # rename the function
+    _preload_models(BARK_V0_MODEL_NAMES + [BARK_V0_SPEAKER_EMBEDDING_MODEL_NAME]) # load models
+
 
-if "HF_SPACE_ID" in os.environ: # checking if we are in HF spaces
-    mp.set_start_method('spawn', force=True) # Set start method ONLY in Spaces, force if needed
-    p = mp.Process(target=_preload_and_load_models)
-    p.start()
-    p.join()
-else: # if not in spaces just load the models
-    _preload_and_load_models()
+if __name__ == "__main__":
+    if "HF_SPACE_ID" in os.environ:
+        mp.set_start_method('spawn', force=True)
+        p = mp.Process(target=_preload_and_load_models)
+        p.start()
+        p.join()
+    else:
+        _preload_and_load_models()
 
-@spaces.GPU(required=True)
-def process_audio(microphone, state, task="transcribe"):
-    if microphone is None:
-        return state, state, None
-
-    asr_pipe.model.config.forced_decoder_ids = [
-        [2, transcribe_token_id if task == "transcribe" else translate_token_id]
-    ]
-    text = asr_pipe(microphone)["text"]
-    system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
-You answer questions clearly and simply, using age-appropriate language.
-You are also a little bit silly and like to make jokes."""
-    prompt = f"{system_prompt}\nUser: {text}"
-
-    with torch.no_grad():
-        vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
-        vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=192)
-        vicuna_response = vicuna_tokenizer.decode(vicuna_output, skip_special_tokens=True)
-        vicuna_response = vicuna_response.replace(prompt, "").strip()
-    updated_state = state + "\n" + vicuna_response
-
-    try:
+@spaces.GPU(required=True)
+def process_audio(microphone, state, task="transcribe"):
+    if microphone is None:
+        return state, state, None
+
+    asr_pipe.model.config.forced_decoder_ids = [
+        [2, transcribe_token_id if task == "transcribe" else translate_token_id]
+    ]
+    text = asr_pipe(microphone)["text"]
+    system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
+You answer questions clearly and simply, using age-appropriate language.
+You are also a little bit silly and like to make jokes."""
+    prompt = f"{system_prompt}\nUser: {text}"
+
+    with torch.no_grad():
+        vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
+        vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=192)
+        vicuna_response = vicuna_tokenizer.decode(vicuna_output, skip_special_tokens=True)
+        vicuna_response = vicuna_response.replace(prompt, "").strip()
+    updated_state = state + "\n" + vicuna_response
+
+    try:
         # Use Bark's generate_audio function directly
-        audio_arr = generate_audio(vicuna_response) #, history_prompt=None - if needed
+        audio_arr = generate_audio(vicuna_response) #, history_prompt=None - if needed
 
         # Scale and convert audio (as before)
-        audio_arr = (audio_arr * 32767).astype(np.int16)
+        audio_arr = (audio_arr * 32767).astype(np.int16)
 
         # Save audio for debugging
-        sf.write('generated_audio.wav', audio_arr, SAMPLE_RATE)
-
-        audio_output = (SAMPLE_RATE, audio_arr) # Use the correct SAMPLE_RATE
-
-    except Exception as e:
-        print(f"Error in speech synthesis: {e}")
-        audio_output = None
-
-    return updated_state, updated_state, audio_output
-
-with gr.Blocks(title="Whisper, Vicuna, & Bark Demo") as demo:
-    gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Bark")
-    gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
-    with gr.Tab("Transcribe & Synthesize"):
-        mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here")
-        transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
-        audio_output = gr.Audio(label="Synthesized Speech", type="numpy")
-        transcription_state = gr.State(value="")
-        mic_input.change(
-            fn=process_audio, # Call the combined function
-            inputs=[mic_input, transcription_state],
-            outputs=[transcription_output, transcription_state, audio_output]
-        )
-
-demo.launch(share=False)
+        sf.write('generated_audio.wav', audio_arr, SAMPLE_RATE)
+
+        audio_output = (SAMPLE_RATE, audio_arr) # Use the correct SAMPLE_RATE
+
+    except Exception as e:
+        print(f"Error in speech synthesis: {e}")
+        audio_output = None
+
+    return updated_state, updated_state, audio_output
+
+with gr.Blocks(title="Whisper, Vicuna, & Bark Demo") as demo:
+    gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Bark")
+    gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
+    with gr.Tab("Transcribe & Synthesize"):
+        mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here")
+        transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
+        audio_output = gr.Audio(label="Synthesized Speech", type="numpy")
+        transcription_state = gr.State(value="")
+        mic_input.change(
+            fn=process_audio, # Call the combined function
+            inputs=[mic_input, transcription_state],
+            outputs=[transcription_output, transcription_state, audio_output]
+        )
+
+demo.launch(share=False)
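
The `if __name__ == "__main__":` guard added here is what makes the spawn-based preload safe: with the 'spawn' start method the child process re-imports the main module, so an unguarded `mp.Process(...)` at module level would try to spawn again on import. A minimal, self-contained sketch of that pattern follows; `load_models` is a placeholder stand-in, not this Space's actual loader.

# Standalone illustration of guarding spawn-based process creation.
import multiprocessing as mp
import os

def load_models():
    # Placeholder for the real preload work (Bark + Vicuna in app.py).
    print(f"loading models in pid {os.getpid()}")

if __name__ == "__main__":
    # 'spawn' children re-import this module; the guard keeps them
    # from re-running the Process creation below.
    mp.set_start_method('spawn', force=True)
    p = mp.Process(target=load_models)
    p.start()
    p.join()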