ford442 committed
Commit b1622fb · verified · 1 Parent(s): 7694c3e

Update app.py

Files changed (1)
  1. app.py +77 -68
app.py CHANGED
@@ -1,10 +1,11 @@
 import spaces
 import torch
 import gradio as gr
-from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoModel # Removed AutoProcessor
-import soundfile as sf # For saving audio (debugging)
+from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoModel
+import soundfile as sf
 import numpy as np
-from bark import SAMPLE_RATE, generate_audio, preload_models # Import Bark functions
+from bark import SAMPLE_RATE, generate_audio, preload_models
+import torch.multiprocessing as mp # Import multiprocessing
 
 # Load Whisper and Vicuna models (as before)
 ASR_MODEL_NAME = "openai/whisper-medium.en"
@@ -19,70 +20,78 @@ all_special_ids = asr_pipe.tokenizer.all_special_ids
 transcribe_token_id = all_special_ids[-5]
 translate_token_id = all_special_ids[-6]
 
-#VICUNA_MODEL_NAME = "lmsys/vicuna-7b-v1.5"
-VICUNA_MODEL_NAME = "EleutherAI/gpt-neo-2.7B"
-
-vicuna_tokenizer = AutoTokenizer.from_pretrained(VICUNA_MODEL_NAME)
-vicuna_model = AutoModelForCausalLM.from_pretrained(
-    VICUNA_MODEL_NAME,
-    torch_dtype=torch.float16,
-    device_map="auto",
-)
-
-# Preload Bark models (crucial for efficiency)
-preload_models() # No need for the DEBUG_MODE check here; preload always
-
-@spaces.GPU(required=True)
-def process_audio(microphone, state, task="transcribe"):
-    if microphone is None:
-        return state, state, None
-
-    asr_pipe.model.config.forced_decoder_ids = [
-        [2, transcribe_token_id if task == "transcribe" else translate_token_id]
-    ]
-    text = asr_pipe(microphone)["text"]
-    system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
-You answer questions clearly and simply, using age-appropriate language.
-You are also a little bit silly and like to make jokes."""
-    prompt = f"{system_prompt}\nUser: {text}"
-
-    with torch.no_grad():
-        vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
-        vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=192)
-        vicuna_response = vicuna_tokenizer.decode(vicuna_output, skip_special_tokens=True)
-        vicuna_response = vicuna_response.replace(prompt, "").strip()
-    updated_state = state + "\n" + vicuna_response
-
-    try:
-        # Use Bark's generate_audio function directly
-        audio_arr = generate_audio(vicuna_response) # , history_prompt=None - if needed
-
-        # Scale and convert audio (as before)
-        audio_arr = (audio_arr * 32767).astype(np.int16)
-
-        # Save audio for debugging
-        sf.write('generated_audio.wav', audio_arr, SAMPLE_RATE)
-
-        audio_output = (SAMPLE_RATE, audio_arr) # Use the correct SAMPLE_RATE
-
-    except Exception as e:
-        print(f"Error in speech synthesis: {e}")
-        audio_output = None
-
-    return updated_state, updated_state, audio_output
-
-with gr.Blocks(title="Whisper, Vicuna, & Bark Demo") as demo:
-    gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Bark")
-    gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
-    with gr.Tab("Transcribe & Synthesize"):
-        mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here")
-        transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
-        audio_output = gr.Audio(label="Synthesized Speech", type="numpy")
-        transcription_state = gr.State(value="")
-        mic_input.change(
-            fn=process_audio, # Call the combined function
-            inputs=[mic_input, transcription_state],
-            outputs=[transcription_output, transcription_state, audio_output]
-        )
+def _preload_and_load_models(): # new function for loading models
+    # Preload Bark models (now inside the function)
+    preload_models()
+
+    # Vicuna model loading (now inside the function)
+    global vicuna_tokenizer, vicuna_model # make global to be used in process_audio
+    VICUNA_MODEL_NAME = "EleutherAI/gpt-neo-2.7B" # Or another model
+    vicuna_tokenizer = AutoTokenizer.from_pretrained(VICUNA_MODEL_NAME)
+    vicuna_model = AutoModelForCausalLM.from_pretrained(
+        VICUNA_MODEL_NAME,
+        torch_dtype=torch.float16,
+        device_map="auto", # or .to('cuda')
+    ).to('cuda') # Explicitly move to CUDA after loading
+
+
+if __name__ == "__main__": # important for multiprocessing to work
+    mp.set_start_method('spawn') # Important for Spaces
+    p = mp.Process(target=_preload_and_load_models) # new process to load models
+    p.start()
+    p.join() # wait for the models to load
+
+@spaces.GPU(required=True)
+def process_audio(microphone, state, task="transcribe"):
+    if microphone is None:
+        return state, state, None
+
+    asr_pipe.model.config.forced_decoder_ids = [
+        [2, transcribe_token_id if task == "transcribe" else translate_token_id]
+    ]
+    text = asr_pipe(microphone)["text"]
+    system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
+You answer questions clearly and simply, using age-appropriate language.
+You are also a little bit silly and like to make jokes."""
+    prompt = f"{system_prompt}\nUser: {text}"
+
+    with torch.no_grad():
+        vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
+        vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=192)
+        vicuna_response = vicuna_tokenizer.decode(vicuna_output, skip_special_tokens=True)
+        vicuna_response = vicuna_response.replace(prompt, "").strip()
+    updated_state = state + "\n" + vicuna_response
+
+    try:
+        # Use Bark's generate_audio function directly
+        audio_arr = generate_audio(vicuna_response) # , history_prompt=None - if needed
+
+        # Scale and convert audio (as before)
+        audio_arr = (audio_arr * 32767).astype(np.int16)
+
+        # Save audio for debugging
+        sf.write('generated_audio.wav', audio_arr, SAMPLE_RATE)
+
+        audio_output = (SAMPLE_RATE, audio_arr) # Use the correct SAMPLE_RATE
+
+    except Exception as e:
+        print(f"Error in speech synthesis: {e}")
+        audio_output = None
+
+    return updated_state, updated_state, audio_output
+
+with gr.Blocks(title="Whisper, Vicuna, & Bark Demo") as demo:
+    gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Bark")
+    gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
+    with gr.Tab("Transcribe & Synthesize"):
+        mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here")
+        transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
+        audio_output = gr.Audio(label="Synthesized Speech", type="numpy")
+        transcription_state = gr.State(value="")
+        mic_input.change(
+            fn=process_audio, # Call the combined function
+            inputs=[mic_input, transcription_state],
+            outputs=[transcription_output, transcription_state, audio_output]
+        )
 
 demo.launch(share=False)
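A few details of the new loading path are worth noting. First, with the 'spawn' start method each child process gets a fresh Python interpreter, so the `global vicuna_tokenizer, vicuna_model` assignments inside `_preload_and_load_models` rebind only the child's copy of the module; after `p.join()` returns, the parent's globals are still unset. A minimal standalone sketch of that behavior (illustrative, not from the commit):

import torch.multiprocessing as mp

model = None  # module-level global in the parent process

def load():
    global model
    model = "loaded"  # rebinds only the child's copy of the global

if __name__ == "__main__":
    mp.set_start_method("spawn")
    p = mp.Process(target=load)
    p.start()
    p.join()
    print(model)  # prints None: the parent never saw the assignment

So the subprocess can warm Bark's download cache on disk, but anything `process_audio` needs at runtime still has to be loaded in the process that serves requests.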
 
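Second, `generate` returns a 2-D tensor of shape `[batch, seq_len]`, while `tokenizer.decode` expects a single sequence of ids; both the old and new versions pass the whole batch, which typically raises a TypeError. A sketch of the decode step as it is usually written, using a smaller sibling model so the example stays lightweight (the model choice is illustrative, not from the commit):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "EleutherAI/gpt-neo-125m"  # illustrative stand-in for gpt-neo-2.7B
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

prompt = "User: Why is the sky blue?"
inputs = tokenizer(prompt, return_tensors="pt")
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=32)

# output_ids has shape [batch, seq_len]; decode row 0, not the whole tensor
response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(response.replace(prompt, "").strip())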
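Finally, on the audio handoff: Bark returns float32 samples in roughly [-1.0, 1.0] at 24 kHz, and `gr.Audio(type="numpy")` accepts a `(sample_rate, int16_array)` tuple, which is what the `* 32767` scaling produces. A small sketch of that conversion with clipping added, since synthesized peaks can exceed the nominal range (the helper name is ours, not from the commit):

import numpy as np

SAMPLE_RATE = 24000  # Bark's output sample rate

def float_to_pcm16(audio: np.ndarray) -> np.ndarray:
    """Convert float audio in [-1.0, 1.0] to 16-bit PCM, clipping stray peaks first."""
    audio = np.clip(audio, -1.0, 1.0)
    return (audio * 32767).astype(np.int16)

# Example with a synthetic 440 Hz tone standing in for Bark output:
t = np.linspace(0.0, 1.0, SAMPLE_RATE, endpoint=False)
pcm = float_to_pcm16(0.5 * np.sin(2 * np.pi * 440.0 * t))
audio_output = (SAMPLE_RATE, pcm)  # the tuple shape gr.Audio(type="numpy") expects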