ford442 committed
Commit e0628e1 · verified · 1 Parent(s): 9bd6434

Update app.py

Files changed (1)
  1. app.py +57 -54
app.py CHANGED
@@ -6,6 +6,7 @@ import soundfile as sf
 import numpy as np
 from bark import SAMPLE_RATE, generate_audio, preload_models
 import torch.multiprocessing as mp # Import multiprocessing
+import os
 
 # Load Whisper and Vicuna models (as before)
 ASR_MODEL_NAME = "openai/whisper-medium.en"
@@ -35,63 +36,65 @@ def _preload_and_load_models(): # new function for loading models
     ).to('cuda') # Explicitly move to CUDA after loading
 
 
-if __name__ == "__main__": # important for multiprocessing to work
-    mp.set_start_method('spawn') # Important for Spaces
-    p = mp.Process(target=_preload_and_load_models) # new process to load models
+if "HF_SPACE_ID" in os.environ: # checking if we are in HF spaces
+    mp.set_start_method('spawn', force=True) # Set start method ONLY in Spaces, force if needed
+    p = mp.Process(target=_preload_and_load_models)
     p.start()
-    p.join() # wait for the models to load
-
-@spaces.GPU(required=True)
-def process_audio(microphone, state, task="transcribe"):
-    if microphone is None:
-        return state, state, None
-
-    asr_pipe.model.config.forced_decoder_ids = [
-        [2, transcribe_token_id if task == "transcribe" else translate_token_id]
-    ]
-    text = asr_pipe(microphone)["text"]
-    system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
-    You answer questions clearly and simply, using age-appropriate language.
-    You are also a little bit silly and like to make jokes."""
-    prompt = f"{system_prompt}\nUser: {text}"
-
-    with torch.no_grad():
-        vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
-        vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=192)
-        vicuna_response = vicuna_tokenizer.decode(vicuna_output, skip_special_tokens=True)
-    vicuna_response = vicuna_response.replace(prompt, "").strip()
-    updated_state = state + "\n" + vicuna_response
-
-    try:
+    p.join()
+else: # if not in spaces just load the models
+    _preload_and_load_models()
+
+@spaces.GPU(required=True)
+def process_audio(microphone, state, task="transcribe"):
+    if microphone is None:
+        return state, state, None
+
+    asr_pipe.model.config.forced_decoder_ids = [
+        [2, transcribe_token_id if task == "transcribe" else translate_token_id]
+    ]
+    text = asr_pipe(microphone)["text"]
+    system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
+    You answer questions clearly and simply, using age-appropriate language.
+    You are also a little bit silly and like to make jokes."""
+    prompt = f"{system_prompt}\nUser: {text}"
+
+    with torch.no_grad():
+        vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
+        vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=192)
+        vicuna_response = vicuna_tokenizer.decode(vicuna_output, skip_special_tokens=True)
+    vicuna_response = vicuna_response.replace(prompt, "").strip()
+    updated_state = state + "\n" + vicuna_response
+
+    try:
         # Use Bark's generate_audio function directly
-        audio_arr = generate_audio(vicuna_response) #, history_prompt=None - if needed
+        audio_arr = generate_audio(vicuna_response) #, history_prompt=None - if needed
 
         # Scale and convert audio (as before)
-        audio_arr = (audio_arr * 32767).astype(np.int16)
+        audio_arr = (audio_arr * 32767).astype(np.int16)
 
         # Save audio for debugging
-        sf.write('generated_audio.wav', audio_arr, SAMPLE_RATE)
-
-        audio_output = (SAMPLE_RATE, audio_arr) # Use the correct SAMPLE_RATE
-
-    except Exception as e:
-        print(f"Error in speech synthesis: {e}")
-        audio_output = None
-
-    return updated_state, updated_state, audio_output
-
-with gr.Blocks(title="Whisper, Vicuna, & Bark Demo") as demo:
-    gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Bark")
-    gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
-    with gr.Tab("Transcribe & Synthesize"):
-        mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here")
-        transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
-        audio_output = gr.Audio(label="Synthesized Speech", type="numpy")
-        transcription_state = gr.State(value="")
-        mic_input.change(
-            fn=process_audio, # Call the combined function
-            inputs=[mic_input, transcription_state],
-            outputs=[transcription_output, transcription_state, audio_output]
-        )
-
-demo.launch(share=False)
+        sf.write('generated_audio.wav', audio_arr, SAMPLE_RATE)
+
+        audio_output = (SAMPLE_RATE, audio_arr) # Use the correct SAMPLE_RATE
+
+    except Exception as e:
+        print(f"Error in speech synthesis: {e}")
+        audio_output = None
+
+    return updated_state, updated_state, audio_output
+
+with gr.Blocks(title="Whisper, Vicuna, & Bark Demo") as demo:
+    gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Bark")
+    gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
+    with gr.Tab("Transcribe & Synthesize"):
+        mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here")
+        transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
+        audio_output = gr.Audio(label="Synthesized Speech", type="numpy")
+        transcription_state = gr.State(value="")
+        mic_input.change(
+            fn=process_audio, # Call the combined function
+            inputs=[mic_input, transcription_state],
+            outputs=[transcription_output, transcription_state, audio_output]
+        )
+
+demo.launch(share=False)
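
For reference, a minimal standalone sketch (not part of the commit) of the start-method guard the new code adopts: set the 'spawn' start method only when the HF_SPACE_ID environment variable is present (the marker this commit checks to detect a Hugging Face Space) and load models in a child process there, otherwise load in the current process. The _load_models function below is a simplified stand-in for the app's _preload_and_load_models.

# Hypothetical sketch; _load_models stands in for the app's real model-loading function.
import os
import torch.multiprocessing as mp

def _load_models():
    # Heavy model loading would happen here, inside its own process.
    print(f"loading models in PID {os.getpid()}")

if __name__ == "__main__":
    if "HF_SPACE_ID" in os.environ:  # the commit uses this variable to detect a Space
        # 'spawn' avoids CUDA-state problems that forked children can hit;
        # force=True overrides any start method already set earlier in the process.
        mp.set_start_method('spawn', force=True)
        p = mp.Process(target=_load_models)
        p.start()
        p.join()  # wait for the loader process to finish before continuing
    else:
        _load_models()  # outside Spaces, just load in the current process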