Update app.py
app.py CHANGED
@@ -1,8 +1,9 @@
 import torch
 import gradio as gr
-from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer,
+from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoConfig, AutoProcessor
 import soundfile as sf
 import numpy as np
+import importlib  # Used below to resolve the TTS model class dynamically
 
 # --- Whisper (ASR) Setup ---
 ASR_MODEL_NAME = "openai/whisper-large-v2"
@@ -18,17 +19,26 @@ transcribe_token_id = all_special_ids[-5]
 translate_token_id = all_special_ids[-6]
 
 # --- FastSpeech2 (TTS) Setup ---
-
-
-#
+TTS_MODEL_NAME = "your_username/fastspeech2-en-ljspeech"  # OR "facebook/fastspeech2-en-ljspeech" after PR
+
+# 1. Load the config (now it should exist!)
+tts_config = AutoConfig.from_pretrained(TTS_MODEL_NAME)
+
+# 2. Dynamically import the model class named in the config
+module_name = tts_config.model_type  # module location under transformers.models
+module = importlib.import_module(f".{module_name}", package="transformers.models")
+model_class = getattr(module, tts_config.architectures[0])
+
+
+# 3. Load the processor and model.
 tts_processor = AutoProcessor.from_pretrained(TTS_MODEL_NAME)
-tts_model =
+tts_model = model_class.from_pretrained(TTS_MODEL_NAME, config=tts_config)
 
 tts_device = "cuda" if torch.cuda.is_available() else "cpu"
 tts_model = tts_model.to(tts_device)
 
 # --- Vicuna (LLM) Setup ---
-VICUNA_MODEL_NAME = "lmsys/vicuna-33b-v1.3" #
+VICUNA_MODEL_NAME = "lmsys/vicuna-33b-v1.3"  # Use a smaller model if needed
 vicuna_device = "cuda" if torch.cuda.is_available() else "cpu"
 vicuna_tokenizer = AutoTokenizer.from_pretrained(VICUNA_MODEL_NAME)
 vicuna_model = AutoModelForCausalLM.from_pretrained(
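A note on step 2 above: transformers.models.<model_type> only exists for architectures bundled with the library, and FastSpeech2 is not one of them, so the getattr lookup can fail. Below is a minimal guarded sketch; TTS_MODEL_NAME is the same placeholder repo id, and the trust_remote_code fallback assumes the repo ships its own modeling code:

    import importlib
    from transformers import AutoConfig, AutoModel

    TTS_MODEL_NAME = "your_username/fastspeech2-en-ljspeech"  # placeholder repo id
    config = AutoConfig.from_pretrained(TTS_MODEL_NAME)
    try:
        # Bundled architectures live under transformers.models.<model_type>
        module = importlib.import_module(f".{config.model_type}", package="transformers.models")
        model_class = getattr(module, config.architectures[0])
        tts_model = model_class.from_pretrained(TTS_MODEL_NAME, config=config)
    except (ModuleNotFoundError, AttributeError):
        # Assumed fallback: the repo provides custom modeling code
        tts_model = AutoModel.from_pretrained(TTS_MODEL_NAME, trust_remote_code=True)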
@@ -38,10 +48,6 @@ vicuna_model = AutoModelForCausalLM.from_pretrained(
     device_map="auto",
 )
 
-# --- ASR and TTS Functions (and Gradio Interface) ---
-# (Rest of your code - transcribe_audio, synthesize_speech, Gradio setup)
-# ... (same as before, but using tts_model, tts_processor) ...
-
 # --- ASR Function ---
 def transcribe_audio(microphone, state, task="transcribe"):
     if microphone is None:
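For orientation, the from_pretrained call this hunk lands in is only partially visible (the diff elides lines 35-37). A hypothetical reconstruction follows; only device_map="auto" is confirmed by the diff, and the other keyword arguments are common-but-assumed companions for a 33B checkpoint:

    vicuna_model = AutoModelForCausalLM.from_pretrained(
        VICUNA_MODEL_NAME,
        torch_dtype=torch.float16,   # assumption, not in the diff: halves memory
        low_cpu_mem_usage=True,      # assumption, not in the diff
        device_map="auto",           # confirmed by the diff
    )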
@@ -73,7 +79,7 @@ def synthesize_speech(text):
     inputs = tts_processor(text=text, return_tensors="pt")
     inputs = {key: value.to(tts_device) for key, value in inputs.items()}
     with torch.no_grad():
-
+        output = tts_model(**inputs).waveform  # Use .waveform
     output = output.cpu()
     waveform = output.squeeze().numpy()
     return (tts_processor.feature_extractor.sampling_rate, waveform)
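The tuple returned on the last line, (sampling_rate, numpy waveform), is the format Gradio's Audio component accepts as an output value. A short usage sketch; the sample text and filename are illustrative:

    sr, waveform = synthesize_speech("Hello from FastSpeech2!")
    sf.write("speech.wav", waveform, sr)  # soundfile is imported as sf in app.py

    # Or wire it straight into a Gradio interface:
    demo = gr.Interface(fn=synthesize_speech, inputs="text", outputs=gr.Audio())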