Update app.py
app.py CHANGED
@@ -4,8 +4,9 @@ import gradio as gr
 from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoModel
 import soundfile as sf
 import numpy as np
-
-import
+# Import the TTS pipeline
+from espnet2.bin.tts_inference import Text2Speech
+from espnet2.utils.types import get_fastspeech_config
 
 # Load Whisper model
 ASR_MODEL_NAME = "openai/whisper-medium.en"
@@ -32,11 +33,28 @@ def _preload_and_load_models():
 
 _preload_and_load_models()
 
+
+# Load the TTS model locally
+TTS_MODEL_PATH = "path/to/your/espnet/kan-bayashi_ljspeech_vits"  # Replace with the actual path
+TTS_CONFIG_PATH = os.path.join(TTS_MODEL_PATH, "config.yaml")  # Replace with your config.yaml
+TTS_VOCAB_PATH = os.path.join(TTS_MODEL_PATH, "train.json")  # Replace with your train.json
+
+tts = Text2Speech(
+    TTS_MODEL_PATH,
+    TTS_CONFIG_PATH,
+    TTS_VOCAB_PATH,
+    device="cuda" if torch.cuda.is_available() else "cpu",
+    # You can customize the speed and other parameters here if needed
+)
+fastspeech_config = get_fastspeech_config(TTS_CONFIG_PATH)
+
+
+
+
 @spaces.GPU(required=True)
 def process_audio(microphone, state, task="transcribe"):
     if microphone is None:
         return state, state, None
-
     asr_pipe.model.config.forced_decoder_ids = [
         [2, transcribe_token_id if task == "transcribe" else translate_token_id]
     ]
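Note: the positional Text2Speech(...) call and the get_fastspeech_config import added above do not appear to match the documented espnet2 interface. A minimal loading sketch based on the usage ESPnet documents, assuming the espnet_model_zoo package is installed and the kan-bayashi/ljspeech_vits pretrained tag is used (both are assumptions, not part of this commit):

import torch
from espnet2.bin.tts_inference import Text2Speech

# Downloads and caches the pretrained VITS model via espnet_model_zoo.
tts = Text2Speech.from_pretrained(
    "kan-bayashi/ljspeech_vits",
    device="cuda" if torch.cuda.is_available() else "cpu",
)
# For a local checkpoint, the constructor takes keyword arguments,
# e.g. Text2Speech(train_config="config.yaml", model_file="model.pth").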
@@ -45,27 +63,15 @@ def process_audio(microphone, state, task="transcribe"):
     You answer questions clearly and simply, using age-appropriate language.
     You are also a little bit silly and like to make jokes."""
     prompt = f"{system_prompt}\nUser: {text}"
-
     with torch.no_grad():
         vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to('cuda')
         vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=192)
     vicuna_response = vicuna_tokenizer.decode(vicuna_output[0], skip_special_tokens=True)  # Access the first sequence [0]
     vicuna_response = vicuna_response.replace(prompt, "").strip()
     updated_state = state + "\n" + vicuna_response
-
     try:
-
-
-        payloads = {'inputs': vicuna_response}  # Use Vicuna's response for TTS
-        response = requests.post(API_URL, headers=headers, json=payloads)
-        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
-
-        audio_data = response.content
-        # Convert bytes to numpy array (adjust sampling rate if needed)
-        audio_arr = np.frombuffer(audio_data, dtype=np.int16)  # Assumes 16-bit PCM
-        SAMPLE_RATE = 22050  # Common for this model; you might need to check the actual value
-        audio_arr = audio_arr.reshape(-1, 1).astype(np.float32) / np.iinfo(np.int16).max  # Normalize
-        audio_arr = audio_arr.flatten()  # Make it 1D
+        wav, sr = tts([vicuna_response])[0]
+        audio_arr = wav.cpu().numpy()
         audio_output = (SAMPLE_RATE, audio_arr)
         #sf.write('generated_audio.wav', audio_arr, SAMPLE_RATE)
     except requests.exceptions.RequestException as e:
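The new branch indexes the Text2Speech output like a list and keeps SAMPLE_RATE and the requests exception handler from the removed HTTP path. A minimal inference sketch following the documented espnet2 call convention (tts and vicuna_response are the names from the diff; the rest is an assumption, not part of this commit):

import numpy as np
import torch

with torch.no_grad():
    tts_out = tts(vicuna_response)   # __call__ returns a dict
wav = tts_out["wav"]                 # 1-D torch tensor
audio_arr = wav.cpu().numpy().astype(np.float32)
audio_output = (tts.fs, audio_arr)   # (sample_rate, samples) tuple for gr.Audio

Since synthesis now runs locally, failures raise ordinary Python exceptions rather than requests.exceptions.RequestException, so the existing except clause would not catch them.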