Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -1,6 +1,6 @@
 import torch
 import gradio as gr
-from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer,
+from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoConfig
 import soundfile as sf
 import numpy as np
 
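Note on this hunk: the old import line ended in a dangling comma, which is a SyntaxError, and the new line fixes it while adding AutoConfig. The TTS block later in this diff also calls AutoProcessor and AutoModelForTextToSpeech, which this import line still does not cover; AutoModelForTextToSpeech is not a class transformers exports, the nearest real auto-class being AutoModelForTextToWaveform. A sketch of an import block that would cover every name the new file uses, under that assumption:

    import torch
    import gradio as gr
    import soundfile as sf
    import numpy as np
    from transformers import (
        pipeline,
        AutoModelForCausalLM,
        AutoTokenizer,
        AutoConfig,
        AutoProcessor,               # called by the TTS setup below but never imported in this commit
        AutoModelForTextToWaveform,  # assumption: stand-in for the nonexistent AutoModelForTextToSpeech
    )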
@@ -17,26 +17,23 @@ all_special_ids = asr_pipe.tokenizer.all_special_ids
 transcribe_token_id = all_special_ids[-5]
 translate_token_id = all_special_ids[-6]
 
-
 # --- FastSpeech2 (TTS) Setup ---
 TTS_MODEL_NAME = "facebook/fastspeech2-en-ljspeech"
 
-#
-
-
-
-
-
-
-
-exit() # Stop if we can't load
-
+# Load the config (we'll need it for the model class)
+tts_config = AutoConfig.from_pretrained(TTS_MODEL_NAME, trust_remote_code=True)
+
+# Load the processor and model, using trust_remote_code
+tts_processor = AutoProcessor.from_pretrained(TTS_MODEL_NAME, trust_remote_code=True)
+tts_model = AutoModelForTextToSpeech.from_pretrained(TTS_MODEL_NAME, config=tts_config, trust_remote_code=True)
+
+
 tts_device = "cuda" if torch.cuda.is_available() else "cpu"
 tts_model = tts_model.to(tts_device)
+
 # --- Vicuna (LLM) Setup ---
-VICUNA_MODEL_NAME = "lmsys/vicuna-33b-v1.3"
+VICUNA_MODEL_NAME = "lmsys/vicuna-33b-v1.3" # Or a smaller Vicuna model
 vicuna_device = "cuda" if torch.cuda.is_available() else "cpu"
-
 vicuna_tokenizer = AutoTokenizer.from_pretrained(VICUNA_MODEL_NAME)
 vicuna_model = AutoModelForCausalLM.from_pretrained(
     VICUNA_MODEL_NAME,
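A caveat on the new TTS setup: facebook/fastspeech2-en-ljspeech is published as a fairseq checkpoint, so loading it through the transformers auto-classes is unlikely to work even with trust_remote_code=True. A minimal sketch of the fairseq route instead, assuming the fairseq package is installed (this mirrors the usage shown on the model card and is not part of this commit):

    from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub
    from fairseq.models.text_to_speech.hub_interface import TTSHubInterface

    models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
        "facebook/fastspeech2-en-ljspeech",
        arg_overrides={"vocoder": "hifigan", "fp16": False},
    )
    model = models[0]
    TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
    generator = task.build_generator(models, cfg)

    # One synthesis round-trip: text in, (waveform, sample_rate) out.
    sample = TTSHubInterface.get_model_input(task, "Hello, this is a test.")
    wav, rate = TTSHubInterface.get_prediction(task, model, generator, sample)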
@@ -45,9 +42,6 @@ vicuna_model = AutoModelForCausalLM.from_pretrained(
     device_map="auto",
 )
 
-# --- ASR and TTS Functions (and Gradio Interface) ---
-# (Rest of your code - transcribe_audio, synthesize_speech, Gradio setup)
-# ... (same as before, but using tts_model, tts_processor, and tts_config) ...
 # --- ASR Function ---
 def transcribe_audio(microphone, state, task="transcribe"):
     if microphone is None:
@@ -64,11 +58,11 @@ def transcribe_audio(microphone, state, task="transcribe"):
 
     prompt = f"{system_prompt}\nUser: {text}"
 
-    with torch.no_grad():
-
-
-
-
+    with torch.no_grad():
+        vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to(vicuna_device)
+        vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=128)
+        vicuna_response = vicuna_tokenizer.decode(vicuna_output[0], skip_special_tokens=True)
+        vicuna_response = vicuna_response.replace(prompt, "").strip()
 
     updated_state = state + "\n" + vicuna_response
     return updated_state, updated_state
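The new vicuna_response.replace(prompt, "") line only strips the prompt when the model echoes it back verbatim. A more robust variant, sketched here as an alternative rather than what the commit does, is to decode only the tokens generated after the input:

    with torch.no_grad():
        vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to(vicuna_device)
        vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=128)
        # Slice off the prompt tokens instead of string-replacing them afterwards.
        new_tokens = vicuna_output[0][vicuna_input["input_ids"].shape[1]:]
        vicuna_response = vicuna_tokenizer.decode(new_tokens, skip_special_tokens=True).strip()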
@@ -79,7 +73,7 @@ def synthesize_speech(text):
     inputs = tts_processor(text=text, return_tensors="pt")
     inputs = {key: value.to(tts_device) for key, value in inputs.items()}
     with torch.no_grad():
-        output = tts_model
+        output = tts_model(**inputs).waveform # Use the model directly, it outputs a waveform
     output = output.cpu()
     waveform = output.squeeze().numpy()
     return (tts_processor.feature_extractor.sampling_rate, waveform)
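The fixed line assumes the model's forward pass returns an output object with a .waveform attribute. That holds for transformers-native TTS models such as VITS, but whether a remote-code FastSpeech2 wrapper exposes the same attribute is an assumption. For reference, a minimal sketch against a model where the attribute is documented (facebook/mms-tts-eng, a VITS checkpoint; swapping models is not what this commit does):

    import torch
    from transformers import AutoTokenizer, VitsModel

    vits_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
    vits_model = VitsModel.from_pretrained("facebook/mms-tts-eng")

    inputs = vits_tokenizer("Hello from the demo.", return_tensors="pt")
    with torch.no_grad():
        waveform = vits_model(**inputs).waveform  # shape: (batch, num_samples)
    sampling_rate = vits_model.config.sampling_rate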
@@ -88,13 +82,13 @@ def synthesize_speech(text):
     return (None, None)
 
 # --- Gradio Interface ---
-with gr.Blocks(title="Whisper, Vicuna, & FastSpeech2 Demo") as demo:
+with gr.Blocks(title="Whisper, Vicuna, & FastSpeech2 Demo") as demo:
     gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna")
     gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
 
     with gr.Tab("Transcribe & Synthesize"):
         mic_input = gr.Audio(source="microphone", type="filepath", optional=True, label="Speak Here")
-        transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
+        transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
         audio_output = gr.Audio(label="Synthesized Speech", type="numpy")
         transcription_state = gr.State(value="")
 
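One compatibility note on this hunk: gr.Audio(source="microphone", type="filepath", optional=True, ...) is Gradio 3.x style. In Gradio 4 the source argument became sources (a list) and optional was removed. A sketch of the equivalent component, assuming a Gradio 4 environment (the Space itself may pin Gradio 3):

    # Gradio 4.x equivalent of the mic_input line above.
    mic_input = gr.Audio(sources=["microphone"], type="filepath", label="Speak Here")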
@@ -104,8 +98,8 @@ with gr.Blocks(title="Whisper, Vicuna, & FastSpeech2 Demo") as demo: # More des
         outputs=[transcription_output, transcription_state]
     ).then(
         fn=synthesize_speech,
-        inputs=transcription_output,
+        inputs=transcription_output,
         outputs=audio_output
     )
 
-demo.launch(enable_queue=True)
+demo.launch(enable_queue=True, share=False) # share=False is usually better for local development
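A similar note for the last line: enable_queue was removed as a launch() argument in later Gradio releases; the replacement is to enable the queue on the Blocks object before launching. Sketch, assuming a Gradio version that provides .queue():

    demo.queue()              # replaces launch(enable_queue=True)
    demo.launch(share=False)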
|