Update app.py

app.py CHANGED
@@ -1,9 +1,11 @@
 import torch
 import gradio as gr
-from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
+from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
 import soundfile as sf
 import numpy as np
-import
+from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
+from fairseq.models.text_to_speech.hub_interface import TTSHubInterface
+import IPython.display as ipd # We still need this if running in a notebook

 # --- Whisper (ASR) Setup ---
 ASR_MODEL_NAME = "openai/whisper-large-v2"
@@ -18,38 +20,26 @@ all_special_ids = asr_pipe.tokenizer.all_special_ids
 transcribe_token_id = all_special_ids[-5]
 translate_token_id = all_special_ids[-6]

-# --- FastSpeech2 (TTS) Setup ---
-TTS_MODEL_NAME = "
-
-# 1. Load the processor (we still need trust_remote_code for this)
-tts_processor = AutoProcessor.from_pretrained(TTS_MODEL_NAME, trust_remote_code=True)
-
-# 2. Load the model using the *custom* modeling file. This is the key.
-# We CANNOT use AutoConfig or AutoModel here.
-model_file_path = f"models--{TTS_MODEL_NAME.replace('/', '--')}/snapshots"
-
-import os
-# Find the commit hash - this is needed because of the way Hugging Face caches models.
-for d in os.listdir(os.path.expanduser(f"~/.cache/huggingface/hub/{model_file_path}")):
-    if os.path.isdir(os.path.expanduser(f"~/.cache/huggingface/hub/{model_file_path}/{d}")) and not d.startswith("."):
-        commit_hash = d
-        break
-else:
-    raise ValueError ("Cannot find the model")
-model_file_path += f"/{commit_hash}/modeling_fastspeech2.py"
+# --- FastSpeech2 (TTS) Setup - Using fairseq ---
+TTS_MODEL_NAME = "facebook/fastspeech2-en-ljspeech"
+tts_device = "cuda" if torch.cuda.is_available() else "cpu"

-#
-
-
-
-
+# Load the fairseq model, config, and task.
+tts_models, tts_cfg, tts_task = load_model_ensemble_and_task_from_hf_hub(
+    TTS_MODEL_NAME,
+    arg_overrides={"vocoder": "hifigan", "fp16": False}
+)
+tts_model = tts_models[0]
+TTSHubInterface.update_cfg_with_data_cfg(tts_cfg, tts_task.data_cfg)
+tts_generator = tts_task.build_generator(tts_model, tts_cfg)

-
-tts_model
+# Move the fairseq model to the correct device.
+tts_model.to(tts_device)
+tts_model.eval() # Put the model in evaluation mode


 # --- Vicuna (LLM) Setup ---
-VICUNA_MODEL_NAME = "lmsys/vicuna-7b-v1.5" #
+VICUNA_MODEL_NAME = "lmsys/vicuna-7b-v1.5" # Or your preferred Vicuna
 vicuna_device = "cuda" if torch.cuda.is_available() else "cpu"
 vicuna_tokenizer = AutoTokenizer.from_pretrained(VICUNA_MODEL_NAME)
 vicuna_model = AutoModelForCausalLM.from_pretrained(
@@ -59,9 +49,6 @@ vicuna_model = AutoModelForCausalLM.from_pretrained(
     device_map="auto",
 )

-# --- ASR and TTS Functions (and Gradio Interface) ---
-# (Same as before, but using tts_model and tts_processor)
-
 # --- ASR Function ---
 def transcribe_audio(microphone, state, task="transcribe"):
     if microphone is None:
@@ -87,16 +74,22 @@ def transcribe_audio(microphone, state, task="transcribe"):
     updated_state = state + "\n" + vicuna_response
     return updated_state, updated_state

-# --- TTS Function ---
+# --- TTS Function (Modified for fairseq) ---
 def synthesize_speech(text):
     try:
-
-
-
-
-
-
-
+        sample = TTSHubInterface.get_model_input(tts_task, text)
+
+        # Move input tensors to the correct device
+        if torch.cuda.is_available():
+            sample['net_input'] = {k: v.cuda() for k, v in sample['net_input'].items()}
+        else:
+            sample['net_input'] = {k: v.cpu() for k, v in sample['net_input'].items()}
+
+        wav, rate = TTSHubInterface.get_prediction(tts_task, tts_model, tts_generator, sample)
+        wav_numpy = wav.cpu().numpy() # fairseq returns a tensor, not a numpy array
+
+        return (rate, wav_numpy) # Return rate and NumPy array
+
     except Exception as e:
         print(e)
        return (None, None)
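
The hunks above end before the Gradio wiring at the bottom of app.py, so the diff itself does not show how transcribe_audio and synthesize_speech are exposed in the Space's UI. Below is a minimal sketch of one way they could be hooked up; the layout, component names, and event bindings (gr.Blocks, Gradio 4.x-style sources=["microphone"], .change, .submit) are illustrative assumptions, not the actual interface code in this commit.

import gradio as gr  # already imported at the top of app.py

with gr.Blocks() as demo:
    state = gr.State("")  # running conversation transcript

    with gr.Row():
        mic = gr.Audio(sources=["microphone"], type="filepath", label="Speak")
        chat_box = gr.Textbox(label="Conversation", lines=10)

    with gr.Row():
        tts_text = gr.Textbox(label="Text to speak")
        tts_audio = gr.Audio(label="Synthesized speech")  # plays a (rate, numpy) tuple

    # transcribe_audio(microphone, state) returns (updated_state, updated_state):
    # one copy refreshes the textbox, the other is carried forward as Gradio state.
    mic.change(transcribe_audio, inputs=[mic, state], outputs=[chat_box, state])

    # synthesize_speech(text) returns (rate, wav_numpy), which is the tuple format
    # gr.Audio accepts directly as an output value.
    tts_text.submit(synthesize_speech, inputs=tts_text, outputs=tts_audio)

demo.launch()

Because synthesize_speech already hands back the (rate, NumPy array) pair, no extra conversion is needed between the fairseq output and the audio component on the interface side.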