vicuna-clip

Running on Zero

App Files Files Community

ford442 committed on Feb 10

Commit

f5ebbd5

verified ·

1 Parent(s): 892a58d

Update app.py

Browse files

Files changed (1) hide show

app.py +83 -35

app.py CHANGED Viewed

@@ -1,47 +1,95 @@
 import torch
-from transformers import AutoModelForTextToSpeech, AutoProcessor
-import soundfile as sf  # For saving the audio
 import gradio as gr
-# 1. Choose the model and processor
-model_name = "facebook/fastspeech2-en-ljspeech"
-# 2. Load the processor and model
-processor = AutoProcessor.from_pretrained(model_name)
-model = AutoModelForTextToSpeech.from_pretrained(model_name)
-# 3. Move the model to the GPU (if available)
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model = model.to(device)
-# 4. Define a function for text-to-speech
 def synthesize_speech(text):
     try:
-        inputs = processor(text=text, return_tensors="pt")
-        # Move input tensors to the same device as the model
-        inputs = {key: value.to(device) for key, value in inputs.items()}
-        with torch.no_grad():  # Disable gradient calculation during inference
-            output = model(**inputs).waveform
-        # Move to cpu before converting
         output = output.cpu()
-        # Convert the output to a NumPy array (required by soundfile)
         waveform = output.squeeze().numpy()
-        # Return the waveform and the sample rate (needed for Gradio)
-        return (processor.feature_extractor.sampling_rate, waveform)
     except Exception as e:
-      print (e)
-      return (None, None) # in case of error
-# 5. create interface
-iface = gr.Interface(
-    fn=synthesize_speech,
-    inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
-    outputs=gr.Audio(label="Generated Speech", type="numpy"),
-    title="FastSpeech2 Text-to-Speech",
-    description="Enter text to synthesize speech using FastSpeech2.",
-)
-# 6. launch
-iface.launch()

 import torch
 import gradio as gr
+from transformers import pipeline, AutoModelForTextToSpeech, AutoProcessor
+import soundfile as sf
+import numpy as np  # Import numpy
+# --- Whisper (ASR) Setup ---
+ASR_MODEL_NAME = "openai/whisper-large-v2"
+asr_device = "cuda" if torch.cuda.is_available() else "cpu"
+asr_pipe = pipeline(
+    task="automatic-speech-recognition",
+    model=ASR_MODEL_NAME,
+    chunk_length_s=30,
+    device=asr_device,
+)
+all_special_ids = asr_pipe.tokenizer.all_special_ids
+transcribe_token_id = all_special_ids[-5]
+translate_token_id = all_special_ids[-6]
+# --- FastSpeech2 (TTS) Setup ---
+TTS_MODEL_NAME = "facebook/fastspeech2-en-ljspeech"
+tts_processor = AutoProcessor.from_pretrained(TTS_MODEL_NAME)
+tts_model = AutoModelForTextToSpeech.from_pretrained(TTS_MODEL_NAME)
+tts_device = "cuda" if torch.cuda.is_available() else "cpu"
+tts_model = tts_model.to(tts_device)
+# --- ASR Function ---
+def transcribe_audio(microphone, state, task="transcribe"):
+    if microphone is None:  # Handle case where no audio is provided
+        return state, state
+    asr_pipe.model.config.forced_decoder_ids = [
+        [2, transcribe_token_id if task == "transcribe" else translate_token_id]
+    ]
+    text = asr_pipe(microphone)["text"]
+    updated_state = state + "\n" + text
+    return updated_state, updated_state
+# --- TTS Function ---
 def synthesize_speech(text):
     try:
+        inputs = tts_processor(text=text, return_tensors="pt")
+        inputs = {key: value.to(tts_device) for key, value in inputs.items()}
+        with torch.no_grad():
+            output = tts_model(**inputs).waveform
         output = output.cpu()
         waveform = output.squeeze().numpy()
+        return (tts_processor.feature_extractor.sampling_rate, waveform)
     except Exception as e:
+        print(e)
+        return (None, None)
+# --- Gradio Interface ---
+with gr.Blocks(title="Whisper & FastSpeech2 Demo") as demo:
+    gr.Markdown("# Speech-to-Text-to-Speech Demo")
+    gr.Markdown("Speak into your microphone, get a transcription, and then hear it spoken back!")
+    with gr.Tab("Transcribe"):
+        mic_input = gr.Audio(source="microphone", type="filepath", optional=True)
+        transcription_output = gr.Textbox(lines=5, label="Transcription")
+        transcription_state = gr.State(value="")  # State to accumulate transcription
+        transcribe_btn = gr.Button("Transcribe")
+        transcribe_btn.click(
+            fn=transcribe_audio,
+            inputs=[mic_input, transcription_state],
+            outputs=[transcription_output, transcription_state],
+        )
+    with gr.Tab("Synthesize"):
+        text_input = gr.Textbox(lines=5, label="Text to Speak", placeholder="Enter text here...")
+        audio_output = gr.Audio(label="Generated Speech", type="numpy")
+        synthesize_btn = gr.Button("Synthesize")
+        synthesize_btn.click(
+            fn=synthesize_speech,
+            inputs=text_input,
+            outputs=audio_output,
+        )
+    with gr.Tab("Combined"):
+      # Combined interface: speak to transcribe, then synthesize the result automatically.
+        mic_input_c = gr.Audio(source="microphone", type="filepath", optional=True, label="Speak Here")
+        transcription_output_c = gr.Textbox(lines=5, label="Transcription")
+        audio_output_c = gr.Audio(label="Synthesized Speech", type="numpy")
+        transcription_state_c = gr.State(value="")  # State to accumulate transcription
+        # Transcribe the recording, then synthesize the transcription as audio.
+        mic_input_c.change(
+            fn=transcribe_audio,
+            inputs=[mic_input_c, transcription_state_c],
+            outputs=[transcription_output_c, transcription_state_c]
+        ).then(
+            fn=synthesize_speech,
+            inputs=transcription_output_c,
+            outputs=audio_output_c
+        )
+demo.launch(enable_queue=True)