Update app.py
app.py CHANGED
@@ -1,13 +1,8 @@
 import torch
 import gradio as gr
-from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
+from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoModelForTextToSpeech, AutoProcessor
 import soundfile as sf
 import numpy as np
-import fairseq
-import IPython.display as ipd
-import os # Import the 'os' module
-
-commit_hash = "8798153927c22132778bef7b507d389474fa3589" # Example - find a suitable one!
 
 # --- Whisper (ASR) Setup ---
 ASR_MODEL_NAME = "openai/whisper-large-v2"
@@ -22,27 +17,16 @@ all_special_ids = asr_pipe.tokenizer.all_special_ids
 transcribe_token_id = all_special_ids[-5]
 translate_token_id = all_special_ids[-6]
 
-# --- FastSpeech2 (TTS) Setup ---
-TTS_MODEL_NAME = "
+# --- VITS (TTS) Setup - Using transformers ---
+TTS_MODEL_NAME = "espnet/kan_bayashi_ljspeech_vits" # Changed to VITS model
+tts_processor = AutoProcessor.from_pretrained(TTS_MODEL_NAME)
+tts_model = AutoModelForTextToSpeech.from_pretrained(TTS_MODEL_NAME)
 tts_device = "cuda" if torch.cuda.is_available() else "cpu"
+tts_model = tts_model.to(tts_device)
 
-# Download the model files if they don't exist
-if not os.path.exists("fastspeech2_model"):
-    os.makedirs("fastspeech2_model")
-    print("Downloading FastSpeech2 model...")
-    os.system(f"wget https://huggingface.co/{TTS_MODEL_NAME}/resolve/{commit_hash}/pytorch_model.pt -O fastspeech2_model/pytorch_model.pt")
-    os.system(f"wget https://huggingface.co/{TTS_MODEL_NAME}/resolve/{commit_hash}/vocab.txt -O fastspeech2_model/vocab.txt")
-    print("Download complete.")
-
-# Load the model using fairseq 0.10.2 compatible methods.
-tts_model_path = "fastspeech2_model/pytorch_model.pt" # Path to the downloaded model
-tts_model, tts_cfg, tts_task = fairseq.checkpoint_utils.load_model_ensemble_and_task([tts_model_path])
-tts_model = tts_model[0]
-tts_model.to(tts_device)
-tts_model.eval()
 
 # --- Vicuna (LLM) Setup ---
-VICUNA_MODEL_NAME = "lmsys/vicuna-
+VICUNA_MODEL_NAME = "lmsys/vicuna-7b-v1.5" # Or your preferred Vicuna
 vicuna_device = "cuda" if torch.cuda.is_available() else "cpu"
 vicuna_tokenizer = AutoTokenizer.from_pretrained(VICUNA_MODEL_NAME)
 vicuna_model = AutoModelForCausalLM.from_pretrained(
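Note on this hunk: `AutoModelForTextToSpeech` does not appear to be a class transformers ships (recent releases expose `AutoModelForTextToWaveform` and `AutoModelForTextToSpectrogram` for TTS), and `espnet/kan_bayashi_ljspeech_vits` is an ESPnet-format checkpoint rather than a transformers one, so this load may fail at runtime. Below is a minimal sketch of a VITS load that avoids the auto classes, assuming transformers >= 4.33; `VitsModel` and the `facebook/mms-tts-eng` checkpoint are stand-ins I am substituting, not names from this commit.

# Sketch only: VitsModel and facebook/mms-tts-eng are assumed stand-ins,
# not the class/checkpoint this commit uses.
import torch
from transformers import VitsModel, AutoTokenizer

TTS_MODEL_NAME = "facebook/mms-tts-eng"
tts_tokenizer = AutoTokenizer.from_pretrained(TTS_MODEL_NAME)
tts_model = VitsModel.from_pretrained(TTS_MODEL_NAME)
tts_device = "cuda" if torch.cuda.is_available() else "cpu"
tts_model = tts_model.to(tts_device).eval()  # inference mode; no gradients needed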
@@ -76,39 +60,28 @@ def transcribe_audio(microphone, state, task="transcribe"):
 
     updated_state = state + "\n" + vicuna_response
     return updated_state, updated_state
-
+
+# --- TTS Function (Simplified for VITS) ---
 def synthesize_speech(text):
     try:
-
-
-
-
-
-
-        else:
-            sample = sample
-
-        # Generate
-        generator = tts_task.build_generator([tts_model], tts_cfg.task) # Pass the task
-        output = generator.generate([tts_model], sample) # Generate using the generator
+        inputs = tts_processor(text=text, return_tensors="pt")
+        inputs = {key: value.to(tts_device) for key, value in inputs.items()}
+        with torch.no_grad():
+            output = tts_model(**inputs).spectrogram # VITS models often output a spectrogram
+        # Convert spectrogram to waveform using the vocoder
+        waveform = tts_model.vocoder(output).squeeze()
 
-        # Extract waveform and sample rate.
-        waveform = output[0][0]['waveform']
-        sample_rate = tts_cfg.task.sample_rate # Get the rate
-
-        # Convert to NumPy (and ensure CPU)
         waveform_np = waveform.cpu().numpy()
-
-        return (
-
+        # VITS models use a sample rate of 22050
+        return (22050, waveform_np)
 
     except Exception as e:
         print(e)
         return (None, None)
 
 # --- Gradio Interface ---
-with gr.Blocks(title="Whisper, Vicuna, &
-    gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna")
+with gr.Blocks(title="Whisper, Vicuna, & VITS Demo") as demo: # Updated title
+    gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and VITS")
     gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
 
     with gr.Tab("Transcribe & Synthesize"):
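A related note on the new synthesize_speech: the spectrogram-plus-vocoder flow added here matches two-stage models like SpeechT5, whereas VITS is end-to-end and its transformers implementation returns a waveform directly; hardcoding 22050 can also disagree with a given checkpoint. A hedged sketch of the function against the `VitsModel` stand-in from the previous note, reading the sample rate from the model config:

# Sketch only: assumes the VitsModel/tokenizer stand-ins loaded above.
def synthesize_speech(text):
    try:
        inputs = tts_tokenizer(text=text, return_tensors="pt")
        inputs = {key: value.to(tts_device) for key, value in inputs.items()}
        with torch.no_grad():
            # VITS is end-to-end: the model output already carries a waveform.
            waveform = tts_model(**inputs).waveform.squeeze()
        waveform_np = waveform.cpu().numpy()
        # Read the rate from the checkpoint instead of hardcoding 22050.
        return (tts_model.config.sampling_rate, waveform_np)
    except Exception as e:
        print(e)
        return (None, None)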
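The diff cuts off inside the "Transcribe & Synthesize" tab, so the component wiring is not visible here. A hedged sketch of how such a tab is commonly wired in gradio Blocks, chaining the transcribe_audio and synthesize_speech functions defined earlier in app.py; the component names and the gradio 4.x `sources=` argument are illustrative assumptions, not from this commit.

# Sketch only: continues inside the "with gr.Blocks(...) as demo:" block above;
# component names are illustrative, not taken from this commit.
    with gr.Tab("Transcribe & Synthesize"):
        mic_input = gr.Audio(sources=["microphone"], type="filepath", label="Speak here")
        conversation_box = gr.Textbox(label="Conversation", lines=8)
        audio_output = gr.Audio(label="Synthesized speech")
        state = gr.State(value="")

        # Transcribe + Vicuna first, then feed the updated conversation to TTS.
        mic_input.change(
            transcribe_audio,
            inputs=[mic_input, state],
            outputs=[conversation_box, state],
        ).then(
            synthesize_speech,
            inputs=conversation_box,
            outputs=audio_output,
        )

demo.launch()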