Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -35,8 +35,8 @@ def _preload_and_load_models():
|
|
35 |
#VICUNA_MODEL_NAME = "EleutherAI/gpt-neo-2.7B" # Or another model
|
36 |
#VICUNA_MODEL_NAME = "lmsys/vicuna-13b-v1.5" # Or another model
|
37 |
VICUNA_MODEL_NAME = "lmsys/vicuna-7b-v1.5" # Or another model
|
38 |
-
vicuna_tokenizer =
|
39 |
-
vicuna_model =
|
40 |
VICUNA_MODEL_NAME,
|
41 |
torch_dtype=torch.float16,
|
42 |
# device_map="auto", # or .to('cuda')
|
@@ -60,16 +60,21 @@ def process_audio(microphone, state, task="transcribe"):
|
|
60 |
prompt = f"{system_prompt}\nUser: {text}"
|
61 |
with torch.no_grad():
|
62 |
vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
|
63 |
-
vicuna_output = vicuna_model.generate(
|
|
|
|
|
|
|
|
|
|
|
64 |
vicuna_response = vicuna_tokenizer.decode(vicuna_output[0], skip_special_tokens=True)
|
65 |
vicuna_response = vicuna_response.replace(prompt, "").strip()
|
66 |
updated_state = state + "\nUser: " + text + "\n" + "Tutor: " + vicuna_response
|
67 |
try:
|
68 |
-
with torch.no_grad():
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
SAMPLE_RATE = sr
|
74 |
audio_arr = audio_arr / np.abs(audio_arr).max()
|
75 |
audio_output = (SAMPLE_RATE, audio_arr)
|
@@ -89,10 +94,11 @@ with gr.Blocks(title="Whisper, Vicuna, & TTS Demo") as demo: # Updated title
|
|
89 |
mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here")
|
90 |
transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
|
91 |
audio_output = gr.Audio(label="Synthesized Speech", type="numpy", autoplay=True)
|
|
|
92 |
transcription_state = gr.State(value="")
|
93 |
mic_input.change(
|
94 |
fn=process_audio,
|
95 |
-
inputs=[mic_input, transcription_state,
|
96 |
outputs=[transcription_output, transcription_state, audio_output]
|
97 |
)
|
98 |
|
|
|
35 |
#VICUNA_MODEL_NAME = "EleutherAI/gpt-neo-2.7B" # Or another model
|
36 |
#VICUNA_MODEL_NAME = "lmsys/vicuna-13b-v1.5" # Or another model
|
37 |
VICUNA_MODEL_NAME = "lmsys/vicuna-7b-v1.5" # Or another model
|
38 |
+
vicuna_tokenizer = LlamaTokenizer.from_pretrained(VICUNA_MODEL_NAME)
|
39 |
+
vicuna_model = LlamaForCausalLM.from_pretrained(
|
40 |
VICUNA_MODEL_NAME,
|
41 |
torch_dtype=torch.float16,
|
42 |
# device_map="auto", # or .to('cuda')
|
|
|
60 |
prompt = f"{system_prompt}\nUser: {text}"
|
61 |
with torch.no_grad():
|
62 |
vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
|
63 |
+
vicuna_output = vicuna_model.generate(
|
64 |
+
**vicuna_input,
|
65 |
+
max_length = 96,
|
66 |
+
min_new_tokens = 64,
|
67 |
+
do_sample = True
|
68 |
+
)
|
69 |
vicuna_response = vicuna_tokenizer.decode(vicuna_output[0], skip_special_tokens=True)
|
70 |
vicuna_response = vicuna_response.replace(prompt, "").strip()
|
71 |
updated_state = state + "\nUser: " + text + "\n" + "Tutor: " + vicuna_response
|
72 |
try:
|
73 |
+
#with torch.no_grad():
|
74 |
+
output = tts(vicuna_response)
|
75 |
+
wav = output["wav"]
|
76 |
+
sr = tts.fs
|
77 |
+
audio_arr = wav.cpu().numpy()
|
78 |
SAMPLE_RATE = sr
|
79 |
audio_arr = audio_arr / np.abs(audio_arr).max()
|
80 |
audio_output = (SAMPLE_RATE, audio_arr)
|
|
|
94 |
mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here")
|
95 |
transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
|
96 |
audio_output = gr.Audio(label="Synthesized Speech", type="numpy", autoplay=True)
|
97 |
+
task_input = gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
|
98 |
transcription_state = gr.State(value="")
|
99 |
mic_input.change(
|
100 |
fn=process_audio,
|
101 |
+
inputs=[mic_input, transcription_state, task_input],
|
102 |
outputs=[transcription_output, transcription_state, audio_output]
|
103 |
)
|
104 |
|