ford442 committed
Commit 3c50a05 · verified · 1 Parent(s): aa64e74

Update app.py

Files changed (1)
  1. app.py +23 -14
app.py CHANGED
@@ -1,12 +1,12 @@
 import spaces
 import torch
 import gradio as gr
-from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoModel, AutoProcessor
-import soundfile as sf
+from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoModel # Removed AutoProcessor
+import soundfile as sf # For saving audio (debugging)
 import numpy as np
-import IPython.display as ipd
-import os
+from bark import SAMPLE_RATE, generate_audio, preload_models # Import Bark functions
 
+# Load Whisper and Vicuna models (as before)
 ASR_MODEL_NAME = "openai/whisper-medium.en"
 asr_pipe = pipeline(
     task="automatic-speech-recognition",
@@ -19,10 +19,6 @@ all_special_ids = asr_pipe.tokenizer.all_special_ids
 transcribe_token_id = all_special_ids[-5]
 translate_token_id = all_special_ids[-6]
 
-TTS_MODEL_NAME = "suno/bark"
-tts_processor = AutoProcessor.from_pretrained(TTS_MODEL_NAME)
-tts_model = AutoModel.from_pretrained(TTS_MODEL_NAME).to('cuda')
-
 VICUNA_MODEL_NAME = "lmsys/vicuna-7b-v1.5"
 vicuna_tokenizer = AutoTokenizer.from_pretrained(VICUNA_MODEL_NAME)
 vicuna_model = AutoModelForCausalLM.from_pretrained(
@@ -31,10 +27,14 @@ vicuna_model = AutoModelForCausalLM.from_pretrained(
     device_map="auto",
 )
 
+# Preload Bark models (crucial for efficiency)
+preload_models() # No need for the DEBUG_MODE check here; preload always
+
 @spaces.GPU(required=True)
 def process_audio(microphone, state, task="transcribe"):
     if microphone is None:
         return state, state, None
+
     asr_pipe.model.config.forced_decoder_ids = [
         [2, transcribe_token_id if task == "transcribe" else translate_token_id]
     ]
@@ -43,21 +43,30 @@ def process_audio(microphone, state, task="transcribe"):
     You answer questions clearly and simply, using age-appropriate language.
     You are also a little bit silly and like to make jokes."""
     prompt = f"{system_prompt}\nUser: {text}"
+
     with torch.no_grad():
         vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
         vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=192)
-    vicuna_response = vicuna_tokenizer.decode(vicuna_output[0], skip_special_tokens=True)
+    vicuna_response = vicuna_tokenizer.decode(vicuna_output, skip_special_tokens=True)
     vicuna_response = vicuna_response.replace(prompt, "").strip()
     updated_state = state + "\n" + vicuna_response
+
     try:
-        with torch.no_grad():
-            inputs = tts_processor(vicuna_response, return_tensors="pt").to('cuda')
-            output = tts_model.generate(**inputs, do_sample=False)
-            waveform_np = output[0].cpu().numpy()
-            audio_output = (tts_model.generation_config.sample_rate, waveform_np)
+        # Use Bark's generate_audio function directly
+        audio_arr = generate_audio(vicuna_response) #, history_prompt=None - if needed
+
+        # Scale and convert audio (as before)
+        audio_arr = (audio_arr * 32767).astype(np.int16)
+
+        # Save audio for debugging
+        sf.write('generated_audio.wav', audio_arr, SAMPLE_RATE)
+
+        audio_output = (SAMPLE_RATE, audio_arr) # Use the correct SAMPLE_RATE
+
     except Exception as e:
         print(f"Error in speech synthesis: {e}")
         audio_output = None
+
     return updated_state, updated_state, audio_output
 
 with gr.Blocks(title="Whisper, Vicuna, & Bark Demo") as demo:
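
Note: the bark package adopted here follows the usage pattern from the suno-ai/bark README. A minimal, self-contained sketch of that flow (the prompt text and file name are illustrative only):

import soundfile as sf
from bark import SAMPLE_RATE, generate_audio, preload_models

# Download and cache the Bark checkpoints up front, so the first
# generate_audio() call is not stalled by model downloads.
preload_models()

# generate_audio() returns a float numpy array sampled at SAMPLE_RATE
# (24 kHz), with values roughly in [-1, 1]. An optional history_prompt
# (e.g. "v2/en_speaker_6") selects a speaker preset.
audio = generate_audio("Hello, this is a test of Bark.")
sf.write("bark_test.wav", audio, SAMPLE_RATE)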
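
One caveat in the new decoding line: model.generate() returns a 2-D tensor of shape (batch, seq_len), while tokenizer.decode() expects a single 1-D sequence of token ids, so the [0] index used by the old line is still needed (or batch_decode()). A sketch, reusing the model name from this file:

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")
model = AutoModelForCausalLM.from_pretrained("lmsys/vicuna-7b-v1.5", device_map="auto")

inputs = tokenizer("Hello", return_tensors="pt").to("cuda")
output = model.generate(**inputs, max_new_tokens=32)  # shape: (1, seq_len)

# decode() takes one sequence of ids, so index the only row ...
response = tokenizer.decode(output[0], skip_special_tokens=True)
# ... or let batch_decode() handle the batch dimension.
response = tokenizer.batch_decode(output, skip_special_tokens=True)[0]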
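
The scaling step exists because gr.Audio accepts a (sample_rate, numpy_array) tuple, and converting Bark's float output to int16 avoids depending on Gradio's own normalization. A sketch of just that conversion; the np.clip() guard is an extra safety step that is not in this commit:

import numpy as np

def float_to_int16(audio: np.ndarray) -> np.ndarray:
    # Clip to [-1, 1] first so scaling cannot overflow int16 (assumption:
    # Bark output is roughly, but not strictly, within [-1, 1]).
    audio = np.clip(audio, -1.0, 1.0)
    return (audio * 32767).astype(np.int16)

# Gradio then plays the tuple directly:
# audio_output = (SAMPLE_RATE, float_to_int16(audio_arr))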