Update app.py
app.py CHANGED
@@ -7,41 +7,31 @@ import numpy as np
 import IPython.display as ipd
 import os
 
-# --- Whisper (ASR) Setup ---
 ASR_MODEL_NAME = "openai/whisper-large-v2"
-
+
 asr_pipe = pipeline(
     task="automatic-speech-recognition",
     model=ASR_MODEL_NAME,
     chunk_length_s=30,
-    device=
+    device='cuda',
 )
+
 all_special_ids = asr_pipe.tokenizer.all_special_ids
 transcribe_token_id = all_special_ids[-5]
 translate_token_id = all_special_ids[-6]
 
-# --- Bark (TTS) Setup ---
 TTS_MODEL_NAME = "suno/bark-small"
-tts_device = "cuda" if torch.cuda.is_available() else "cpu"
-
-# Load the Bark model and processor
 tts_processor = AutoProcessor.from_pretrained(TTS_MODEL_NAME)
-tts_model = AutoModel.from_pretrained(TTS_MODEL_NAME).to(
-
+tts_model = AutoModel.from_pretrained(TTS_MODEL_NAME).to('cuda')
 
-# --- Vicuna (LLM) Setup ---
 VICUNA_MODEL_NAME = "lmsys/vicuna-7b-v1.5"
-vicuna_device = "cuda" if torch.cuda.is_available() else "cpu"
 vicuna_tokenizer = AutoTokenizer.from_pretrained(VICUNA_MODEL_NAME)
 vicuna_model = AutoModelForCausalLM.from_pretrained(
     VICUNA_MODEL_NAME,
-    # load_in_8bit=True,  # Remove 8-bit quantization (no bitsandbytes)
     torch_dtype=torch.float16,  # Use float16 for efficiency (if GPU supports it)
     device_map="auto",  # Let transformers handle device placement
-)
-
+).to('cuda')
 
-# --- ASR Function ---
 def transcribe_audio(microphone, state, task="transcribe"):
     if microphone is None:
         return state, state
@@ -49,48 +39,37 @@ def transcribe_audio(microphone, state, task="transcribe"):
         [2, transcribe_token_id if task == "transcribe" else translate_token_id]
     ]
     text = asr_pipe(microphone)["text"]
-
-    # --- VICUNA INTEGRATION ---
     system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
 You answer questions clearly and simply, using age-appropriate language.
 You are also a little bit silly and like to make jokes."""
-
     prompt = f"{system_prompt}\nUser: {text}"
-
     with torch.no_grad():
         vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to(vicuna_device)
         vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=128)
         vicuna_response = vicuna_tokenizer.decode(vicuna_output[0], skip_special_tokens=True)
         vicuna_response = vicuna_response.replace(prompt, "").strip()
-
     updated_state = state + "\n" + vicuna_response
     return updated_state, updated_state
 
-# --- TTS Function (Using Bark) ---
 def synthesize_speech(text):
     try:
         with torch.no_grad():
             inputs = tts_processor(text, return_tensors="pt").to(tts_device)
             output = tts_model.generate(**inputs, do_sample=True)  # Bark generate
-
             waveform_np = output[0].cpu().numpy()
             return (tts_model.generation_config.sample_rate, waveform_np)  # Bark sample rate
-
     except Exception as e:
         print(e)
         return (None, None)
 
-# --- Gradio Interface ---
 with gr.Blocks(title="Whisper, Vicuna, & Bark Demo") as demo:
     gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Bark")
     gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
-
     with gr.Tab("Transcribe & Synthesize"):
         mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here")
         transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
         audio_output = gr.Audio(label="Synthesized Speech", type="numpy")
         transcription_state = gr.State(value="")
-
         mic_input.change(
             fn=transcribe_audio,
             inputs=[mic_input, transcription_state],