vicuna-clip

Sleeping

App Files Files Community

ford442 committed on Feb 10

Commit

b56cef1

verified ·

1 Parent(s): 4d170e9

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -19

app.py CHANGED Viewed

@@ -1,9 +1,9 @@
 import torch
 import gradio as gr
-from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoConfig, AutoProcessor
 import soundfile as sf
 import numpy as np
-import importlib  # Import the importlib module
 # --- Whisper (ASR) Setup ---
 ASR_MODEL_NAME = "openai/whisper-large-v2"
@@ -19,35 +19,49 @@ transcribe_token_id = all_special_ids[-5]
 translate_token_id = all_special_ids[-6]
 # --- FastSpeech2 (TTS) Setup ---
-TTS_MODEL_NAME = "ford442/fastspeech2-en-ljspeech"  # OR "facebook/fastspeech2-en-ljspeech" after PR
-# 1. Load the config.  We DO need trust_remote_code here, and we explain why below.
-tts_config = AutoConfig.from_pretrained(TTS_MODEL_NAME, trust_remote_code=True)
-# 2. Dynamically import the model class.  This is *still* the correct way.
-module_name = tts_config.architectures[0]
-module = importlib.import_module(f"transformers.models.{tts_config.model_type}")  # Corrected module path
-model_class = getattr(module, tts_config.architectures[0])
-# 3. Load the processor and model.
-tts_processor = AutoProcessor.from_pretrained(TTS_MODEL_NAME, trust_remote_code=True) # Keep this for now
-tts_model = model_class.from_pretrained(TTS_MODEL_NAME, config=tts_config)
 tts_device = "cuda" if torch.cuda.is_available() else "cpu"
 tts_model = tts_model.to(tts_device)
 # --- Vicuna (LLM) Setup ---
-VICUNA_MODEL_NAME = "lmsys/vicuna-33b-v1.3"  # Use a smaller model if needed
 vicuna_device = "cuda" if torch.cuda.is_available() else "cpu"
 vicuna_tokenizer = AutoTokenizer.from_pretrained(VICUNA_MODEL_NAME)
 vicuna_model = AutoModelForCausalLM.from_pretrained(
     VICUNA_MODEL_NAME,
-    load_in_8bit=False,
-    torch_dtype=torch.float32,
     device_map="auto",
 )
 # --- ASR Function ---
 def transcribe_audio(microphone, state, task="transcribe"):
     if microphone is None:

 import torch
 import gradio as gr
+from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoProcessor
 import soundfile as sf
 import numpy as np
+import importlib
 # --- Whisper (ASR) Setup ---
 ASR_MODEL_NAME = "openai/whisper-large-v2"
 translate_token_id = all_special_ids[-6]
 # --- FastSpeech2 (TTS) Setup ---
+TTS_MODEL_NAME = "ford442/fastspeech2-en-ljspeech"  # OR "facebook/fastspeech2-en-ljspeech"
+# 1. Load the processor (we still need trust_remote_code for this)
+tts_processor = AutoProcessor.from_pretrained(TTS_MODEL_NAME, trust_remote_code=True)
+# 2. Load the model using the *custom* modeling file. This is the key.
+#    We CANNOT use AutoConfig or AutoModel here.
+model_file_path = f"models--{TTS_MODEL_NAME.replace('/', '--')}/snapshots"
+import os
+# Find the commit hash - this is needed because of the way Hugging Face caches models.
+for d in os.listdir(os.path.expanduser(f"~/.cache/huggingface/hub/{model_file_path}")):
+    if os.path.isdir(os.path.expanduser(f"~/.cache/huggingface/hub/{model_file_path}/{d}")) and not d.startswith("."):
+      commit_hash = d
+      break
+else:
+    raise ValueError ("Cannot find the model")
+model_file_path += f"/{commit_hash}/modeling_fastspeech2.py"
+# Use importlib to import the custom modeling file.
+spec = importlib.util.spec_from_file_location("modeling_fastspeech2", os.path.expanduser(f"~/.cache/huggingface/hub/{model_file_path}"))
+fastspeech2_module = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(fastspeech2_module)
+tts_model = fastspeech2_module.FastSpeech2.from_pretrained(TTS_MODEL_NAME) #Use the actual class name!
 tts_device = "cuda" if torch.cuda.is_available() else "cpu"
 tts_model = tts_model.to(tts_device)
 # --- Vicuna (LLM) Setup ---
+VICUNA_MODEL_NAME = "lmsys/vicuna-7b-v1.5"  # Use a smaller model if needed
 vicuna_device = "cuda" if torch.cuda.is_available() else "cpu"
 vicuna_tokenizer = AutoTokenizer.from_pretrained(VICUNA_MODEL_NAME)
 vicuna_model = AutoModelForCausalLM.from_pretrained(
     VICUNA_MODEL_NAME,
+    load_in_8bit=True,
+    torch_dtype=torch.float16,
     device_map="auto",
 )
+# --- ASR and TTS Functions (and Gradio Interface) ---
+# (Same as before, but using tts_model and tts_processor)
 # --- ASR Function ---
 def transcribe_audio(microphone, state, task="transcribe"):
     if microphone is None: