ford442 committed
Commit 29b4682 · verified · 1 Parent(s): ebed5e1

Update app.py

Files changed (1):
  app.py  +113 -77
app.py CHANGED
@@ -1,35 +1,18 @@
-import spaces
-import torch
 import gradio as gr
-from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
-import soundfile as sf
-import numpy as np
-from espnet2.bin.tts_inference import Text2Speech
-import IPython.display as ipd
+import torchaudio
+import torch
+import torch.nn.functional as F
+from speechbrain.inference.speaker import EncoderClassifier
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, AutoTokenizer, AutoModelForCausalLM
+import noisereduce as nr
+import librosa
 import os
-from huggingface_hub import snapshot_download
-
-# ... (Whisper and Vicuna setup remain the same)
-# --- VITS (TTS) Setup ---
-TTS_MODEL_NAME = "espnet/speechlm_tts_v1"  # Updated Model Name
-tts_device = "cuda" if torch.cuda.is_available() else "cpu"
-
-model_dir = "speechlm_model"  # Updated directory name
-
-if os.path.exists(model_dir):
-    shutil.rmtree(model_dir)
-
-os.makedirs(model_dir)
-download_path = snapshot_download(repo_id=TTS_MODEL_NAME, local_dir=model_dir, local_dir_use_symlinks=False)
-print(f"Downloaded ESPnet model to: {download_path}")
-
-# --- KEY CHANGE: Adjust paths for speechlm_tts_v1 ---
-config_path = os.path.join(download_path, "exp/speechlm_tts_v1/config.yaml")
-model_path = os.path.join(download_path, "exp/speechlm_tts_v1/model.pth")
-
-tts_model = Text2Speech(train_config=config_path, model_file=model_path, device=tts_device)
-
-# --- Vicuna (LLM) Setup ---
+import shutil
+
+# --- Speaker Embedding Model ---
+classifier = EncoderClassifier.from_hparams(source="speechbrain/spkrec-xvect-voxceleb", savedir="pretrained_models/spkrec-xvect-voxceleb")
+
+# --- Vicuna Setup ---
 VICUNA_MODEL_NAME = "lmsys/vicuna-7b-v1.5"
 vicuna_device = "cuda" if torch.cuda.is_available() else "cpu"
 vicuna_tokenizer = AutoTokenizer.from_pretrained(VICUNA_MODEL_NAME)
@@ -40,62 +23,115 @@ vicuna_model = AutoModelForCausalLM.from_pretrained(
     device_map="auto",
 )
 
-# --- ASR Function ---
-def transcribe_audio(microphone, state, task="transcribe"):
-    if microphone is None:
-        return state, state
-    asr_pipe.model.config.forced_decoder_ids = [
-        [2, transcribe_token_id if task == "transcribe" else translate_token_id]
-    ]
-    text = asr_pipe(microphone)["text"]
-
-    # --- VICUNA INTEGRATION ---
-    system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
-    You answer questions clearly and simply, using age-appropriate language.
-    You are also a little bit silly and like to make jokes."""
-
-    prompt = f"{system_prompt}\nUser: {text}"
-
-    with torch.no_grad():
-        vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to(vicuna_device)
-        vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=128)
-        vicuna_response = vicuna_tokenizer.decode(vicuna_output[0], skip_special_tokens=True)
-        vicuna_response = vicuna_response.replace(prompt, "").strip()
-
-    updated_state = state + "\n" + vicuna_response
-    return updated_state, updated_state
+# --- Audio Processing Functions ---
+def f2embed(wav_file, classifier, size_embed):
+    signal, fs = stereo_to_mono(wav_file)
+    if signal is None:
+        return None
+    if fs != 16000:
+        signal, fs = resample_to_16000(signal, fs)
+        if signal is None:
+            return None
+    assert fs == 16000, fs
+    with torch.no_grad():
+        embeddings = classifier.encode_batch(signal)
+        embeddings = F.normalize(embeddings, dim=2)
+        embeddings = embeddings.squeeze().cpu().numpy()
+    assert embeddings.shape == (size_embed,), embeddings.shape
+    return embeddings
+
+def stereo_to_mono(wav_file):
+    try:
+        signal, fs = torchaudio.load(wav_file)
+        signal_np = signal.numpy()
+        if signal_np.shape[0] == 2:  # two channels: downmix to mono
+            signal_mono = librosa.to_mono(signal_np)
+            signal_mono = torch.from_numpy(signal_mono).unsqueeze(0)
+        else:
+            signal_mono = signal
+        return signal_mono, fs
+    except Exception as e:
+        print(f"Error in stereo_to_mono: {e}")
+        return None, None
+
+def resample_to_16000(signal, original_sr):
+    try:
+        signal_np = signal.numpy().flatten()
+        signal_resampled = librosa.resample(signal_np, orig_sr=original_sr, target_sr=16000)
+        signal_resampled = torch.from_numpy(signal_resampled).unsqueeze(0)
+        return signal_resampled, 16000
+    except Exception as e:
+        print(f"Error in resample_to_16000: {e}")
+        return None, None
+
+def reduce_noise(speech, noise_reduction_amount=0.5):
+    try:
+        denoised_speech = nr.reduce_noise(y=speech, sr=16000, prop_decrease=noise_reduction_amount)
+        return denoised_speech
+    except Exception as e:
+        print(f"Error in reduce_noise: {e}")
+        return speech
 
-# --- TTS Function (Using espnet2) ---
-def synthesize_speech(text):
+def process_audio(wav_file, text):
     try:
+        # --- Vicuna Text Processing ---
+        system_prompt = """You are a helpful assistant. Refine or expand the user's text as needed before it is converted to speech. You can correct grammar, add details, or make the text sound more natural."""
+        prompt = f"{system_prompt}\nUser: {text}"
+
         with torch.no_grad():
-            output = tts_model(text)
-            waveform_np = output["wav"].cpu().numpy()
-            return (tts_model.fs, waveform_np)
+            vicuna_input = vicuna_tokenizer(prompt, return_tensors="pt").to(vicuna_device)
+            vicuna_output = vicuna_model.generate(**vicuna_input, max_new_tokens=256)
+            vicuna_response = vicuna_tokenizer.decode(vicuna_output[0], skip_special_tokens=True)  # decode the first (and only) sequence
+            vicuna_processed_text = vicuna_response.replace(prompt, "").strip()
+
+        print(f"Vicuna processed text: {vicuna_processed_text}")
+
+        # --- Speaker Embedding Extraction ---
+        speaker_embeddings = f2embed(wav_file, classifier, 512)
+        if speaker_embeddings is None:
+            return None, "Error in speaker embedding extraction"
+        embeddings = torch.tensor(speaker_embeddings).unsqueeze(0)
+
+        # --- SpeechT5 TTS with Vicuna's output ---
+        processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+        model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
+        inputs = processor(text=vicuna_processed_text, return_tensors="pt")
+        inputs.update({"speaker_embeddings": embeddings})
+        vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+        speech = model.generate_speech(inputs["input_ids"], speaker_embeddings=inputs["speaker_embeddings"], vocoder=vocoder)
+
+        # --- Noise Reduction ---
+        speech_denoised = reduce_noise(speech.cpu().numpy())  # noisereduce expects a NumPy array
+        return speech_denoised, 16000
 
     except Exception as e:
-        print(e)
-        return (None, None)
+        print(f"Error in process_audio: {e}")
+        return None, f"Error in audio processing: {e}"
+
 
 # --- Gradio Interface ---
-with gr.Blocks(title="Whisper, Vicuna, & VITS Demo") as demo:
-    gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and VITS")
-    gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
-
-    with gr.Tab("Transcribe & Synthesize"):
-        mic_input = gr.Audio(source="microphone", type="filepath", optional=True, label="Speak Here")
-        transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
-        audio_output = gr.Audio(label="Synthesized Speech", type="numpy")
-        transcription_state = gr.State(value="")
-
-        mic_input.change(
-            fn=transcribe_audio,
-            inputs=[mic_input, transcription_state],
-            outputs=[transcription_output, transcription_state]
-        ).then(
-            fn=synthesize_speech,
-            inputs=transcription_output,
-            outputs=audio_output
-        )
-
-demo.launch(enable_queue=True, share=False)
+def gradio_interface(wav_file, text):
+    try:
+        if wav_file is None:
+            return "Error: Please upload an audio file."
+        if not text:
+            return "Error: Please enter text to synthesize."
+
+        processed_audio, rate = process_audio(wav_file, text)
+        if processed_audio is None:
+            return "Error occurred during processing. Check the console for details."
+
+        return (rate, processed_audio)
+    except Exception as e:
+        print(f"Error in gradio_interface: {e}")
+        return f"An unexpected error occurred: {e}"
+
+gr_interface = gr.Interface(
+    fn=gradio_interface,
+    inputs=[gr.Audio(type="filepath"), gr.Textbox(lines=2, placeholder="Enter text here...")],
+    outputs=gr.Audio(type="numpy"),
+    title="Text-to-Speech with Speaker Embeddings and Vicuna",
+    description="Upload a speaker audio file and enter text to convert the text to speech using the speaker's voice, enhanced by Vicuna.",
+)
+
+gr_interface.launch()
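For context, this commit swaps the ESPnet VITS path for SpeechT5 voice cloning: an x-vector speaker embedding conditions SpeechT5, and HiFi-GAN decodes the spectrogram. Below is a minimal, self-contained sketch of that synthesis path, outside Gradio, Vicuna, and the noise-reduction step; the file name `speaker.wav` and the sample text are placeholders, not part of the commit.

```python
import torch
import torch.nn.functional as F
import torchaudio
import soundfile as sf
from speechbrain.inference.speaker import EncoderClassifier
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

# 1. Extract a 512-dim x-vector speaker embedding from a 16 kHz mono reference clip.
classifier = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    savedir="pretrained_models/spkrec-xvect-voxceleb",
)
wav, sr = torchaudio.load("speaker.wav")          # placeholder reference recording
wav = wav.mean(dim=0, keepdim=True)               # downmix to mono
if sr != 16000:
    wav = torchaudio.functional.resample(wav, sr, 16000)
with torch.no_grad():
    emb = classifier.encode_batch(wav)            # (1, 1, 512)
    emb = F.normalize(emb, dim=2).squeeze(1)      # (1, 512), the shape generate_speech expects

# 2. Synthesize with SpeechT5 conditioned on that embedding, decoded by HiFi-GAN.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
inputs = processor(text="Hello from the cloned voice.", return_tensors="pt")
with torch.no_grad():
    # Returns a 1-D float waveform tensor at 16 kHz.
    speech = model.generate_speech(inputs["input_ids"], emb, vocoder=vocoder)

sf.write("out.wav", speech.numpy(), 16000)
```

The app builds on this same path: it first rewrites the user's text with Vicuna, feeds the rewritten text to the processor, and runs the generated waveform through noisereduce before returning it to the Gradio Audio component.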