ford442 committed on
Commit 06fb866 · verified · 1 Parent(s): e0628e1

Update app.py

Files changed (1)
  1. app.py +69 -62
app.py CHANGED
@@ -21,12 +21,9 @@ all_special_ids = asr_pipe.tokenizer.all_special_ids
 transcribe_token_id = all_special_ids[-5]
 translate_token_id = all_special_ids[-6]
 
-def _preload_and_load_models(): # new function for loading models
-    # Preload Bark models (now inside the function)
-    preload_models()
-
-    # Vicuna model loading (now inside the function)
-    global vicuna_tokenizer, vicuna_model # make global to be used in process_audio
+def _preload_and_load_models():
+    global vicuna_tokenizer, vicuna_model
+    # Load Vicuna (as before)
     VICUNA_MODEL_NAME = "EleutherAI/gpt-neo-2.7B" # Or another model
     vicuna_tokenizer = AutoTokenizer.from_pretrained(VICUNA_MODEL_NAME)
     vicuna_model = AutoModelForCausalLM.from_pretrained(
@@ -35,66 +32,76 @@ def _preload_and_load_models(): # new function for loading models
         device_map="auto", # or.to('cuda')
     ).to('cuda') # Explicitly move to CUDA after loading
 
+    # Bark model loading (modified)
+    from bark.models import (
+        BARK_V0_MODEL_NAMES,
+        BARK_V0_SPEAKER_EMBEDDING_MODEL_NAME,
+    ) # Import model names
+
+    from bark.generation import preload_models as _preload_models # rename the function
+    _preload_models(BARK_V0_MODEL_NAMES + [BARK_V0_SPEAKER_EMBEDDING_MODEL_NAME]) # load models
+
 
-if "HF_SPACE_ID" in os.environ: # checking if we are in HF spaces
-    mp.set_start_method('spawn', force=True) # Set start method ONLY in Spaces, force if needed
-    p = mp.Process(target=_preload_and_load_models)
-    p.start()
-    p.join()
-else: # if not in spaces just load the models
-    _preload_and_load_models()
+if __name__ == "__main__":
+    if "HF_SPACE_ID" in os.environ:
+        mp.set_start_method('spawn', force=True)
+        p = mp.Process(target=_preload_and_load_models)
+        p.start()
+        p.join()
+    else:
+        _preload_and_load_models()
 
-@spaces.GPU(required=True)
-def process_audio(microphone, state, task="transcribe"):
-    if microphone is None:
-        return state, state, None
-
-    asr_pipe.model.config.forced_decoder_ids = [
-        [2, transcribe_token_id if task == "transcribe" else translate_token_id]
-    ]
-    text = asr_pipe(microphone)["text"]
-    system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
-You answer questions clearly and simply, using age-appropriate language.
-You are also a little bit silly and like to make jokes."""
-    prompt = f"{system_prompt}\nUser: {text}"
-
-    with torch.no_grad():
-        vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
-        vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=192)
-        vicuna_response = vicuna_tokenizer.decode(vicuna_output, skip_special_tokens=True)
-        vicuna_response = vicuna_response.replace(prompt, "").strip()
-    updated_state = state + "\n" + vicuna_response
-
-    try:
+@spaces.GPU(required=True)
+def process_audio(microphone, state, task="transcribe"):
+    if microphone is None:
+        return state, state, None
+
+    asr_pipe.model.config.forced_decoder_ids = [
+        [2, transcribe_token_id if task == "transcribe" else translate_token_id]
+    ]
+    text = asr_pipe(microphone)["text"]
+    system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
+You answer questions clearly and simply, using age-appropriate language.
+You are also a little bit silly and like to make jokes."""
+    prompt = f"{system_prompt}\nUser: {text}"
+
+    with torch.no_grad():
+        vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
+        vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=192)
+        vicuna_response = vicuna_tokenizer.decode(vicuna_output, skip_special_tokens=True)
+        vicuna_response = vicuna_response.replace(prompt, "").strip()
+    updated_state = state + "\n" + vicuna_response
+
+    try:
         # Use Bark's generate_audio function directly
-        audio_arr = generate_audio(vicuna_response) #, history_prompt=None - if needed
+        audio_arr = generate_audio(vicuna_response) #, history_prompt=None - if needed
 
         # Scale and convert audio (as before)
-        audio_arr = (audio_arr * 32767).astype(np.int16)
+        audio_arr = (audio_arr * 32767).astype(np.int16)
 
         # Save audio for debugging
-        sf.write('generated_audio.wav', audio_arr, SAMPLE_RATE)
-
-        audio_output = (SAMPLE_RATE, audio_arr) # Use the correct SAMPLE_RATE
-
-    except Exception as e:
-        print(f"Error in speech synthesis: {e}")
-        audio_output = None
-
-    return updated_state, updated_state, audio_output
-
-with gr.Blocks(title="Whisper, Vicuna, & Bark Demo") as demo:
-    gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Bark")
-    gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
-    with gr.Tab("Transcribe & Synthesize"):
-        mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here")
-        transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
-        audio_output = gr.Audio(label="Synthesized Speech", type="numpy")
-        transcription_state = gr.State(value="")
-        mic_input.change(
-            fn=process_audio, # Call the combined function
-            inputs=[mic_input, transcription_state],
-            outputs=[transcription_output, transcription_state, audio_output]
-        )
-
-demo.launch(share=False)
+        sf.write('generated_audio.wav', audio_arr, SAMPLE_RATE)
+
+        audio_output = (SAMPLE_RATE, audio_arr) # Use the correct SAMPLE_RATE
+
+    except Exception as e:
+        print(f"Error in speech synthesis: {e}")
+        audio_output = None
+
+    return updated_state, updated_state, audio_output
+
+with gr.Blocks(title="Whisper, Vicuna, & Bark Demo") as demo:
+    gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Bark")
+    gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
+    with gr.Tab("Transcribe & Synthesize"):
+        mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here")
+        transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
+        audio_output = gr.Audio(label="Synthesized Speech", type="numpy")
+        transcription_state = gr.State(value="")
+        mic_input.change(
+            fn=process_audio, # Call the combined function
+            inputs=[mic_input, transcription_state],
+            outputs=[transcription_output, transcription_state, audio_output]
+        )
+
+demo.launch(share=False)
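
The `if __name__ == "__main__":` guard added here is what makes the spawn-based preload safe: with the 'spawn' start method the child process re-imports the main module, so an unguarded `mp.Process(...)` at module level would try to spawn again on import. A minimal, self-contained sketch of that pattern follows; `load_models` is a placeholder stand-in, not this Space's actual loader.

# Standalone illustration of guarding spawn-based process creation.
import multiprocessing as mp
import os

def load_models():
    # Placeholder for the real preload work (Bark + Vicuna in app.py).
    print(f"loading models in pid {os.getpid()}")

if __name__ == "__main__":
    # 'spawn' children re-import this module; the guard keeps them
    # from re-running the Process creation below.
    mp.set_start_method('spawn', force=True)
    p = mp.Process(target=load_models)
    p.start()
    p.join()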