Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -48,11 +48,19 @@ _preload_and_load_models()
|
|
48 |
tts = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits",device='cuda')
|
49 |
|
50 |
@spaces.GPU(required=True)
|
51 |
-
def process_audio(microphone, state, answer_mode):
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
|
57 |
You answer questions clearly and simply, using age-appropriate language.
|
58 |
You are also a little bit silly and like to make jokes."""
|
@@ -105,14 +113,22 @@ with gr.Blocks(title="Whisper, Vicuna, & TTS Demo") as demo: # Updated title
|
|
105 |
gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Hugging Face TTS")
|
106 |
gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
|
107 |
with gr.Tab("Transcribe & Synthesize"):
|
108 |
-
|
|
|
|
|
109 |
transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
|
110 |
audio_output = gr.Audio(label="Synthesized Speech", type="numpy", autoplay=True)
|
111 |
answer_mode = gr.Radio(["fast", "medium", "slow"], value='medium')
|
112 |
transcription_state = gr.State(value="")
|
|
|
113 |
mic_input.change(
|
114 |
fn=process_audio,
|
115 |
-
inputs=[mic_input, transcription_state, answer_mode],
|
|
|
|
|
|
|
|
|
|
|
116 |
outputs=[transcription_output, transcription_state, audio_output]
|
117 |
)
|
118 |
|
|
|
48 |
tts = Text2Speech.from_pretrained("espnet/kan-bayashi_ljspeech_vits",device='cuda')
|
49 |
|
50 |
@spaces.GPU(required=True)
|
51 |
+
def process_audio(microphone, audio_upload, state, answer_mode): # Added audio_upload
|
52 |
+
audio_source = None
|
53 |
+
if microphone:
|
54 |
+
audio_source = microphone
|
55 |
+
asr_pipe.model.config.forced_decoder_ids = [[2, transcribe_token_id ]]
|
56 |
+
text = asr_pipe(audio_source)["text"]
|
57 |
+
elif audio_upload:
|
58 |
+
audio_source = audio_upload
|
59 |
+
rate, data = scipy.io.wavfile.read(audio_source)
|
60 |
+
asr_pipe.model.config.forced_decoder_ids = [[2, transcribe_token_id ]]
|
61 |
+
text = asr_pipe(data)["text"]
|
62 |
+
else:
|
63 |
+
return state, state, None # No audio input
|
64 |
system_prompt = """You are a friendly and enthusiastic tutor for young children (ages 6-9).
|
65 |
You answer questions clearly and simply, using age-appropriate language.
|
66 |
You are also a little bit silly and like to make jokes."""
|
|
|
113 |
gr.Markdown("# Speech-to-Text-to-Speech Demo with Vicuna and Hugging Face TTS")
|
114 |
gr.Markdown("Speak into your microphone, get a transcription, Vicuna will process it, and then you'll hear the result!")
|
115 |
with gr.Tab("Transcribe & Synthesize"):
|
116 |
+
with gr.Row(): # Added a row for better layout
|
117 |
+
mic_input = gr.Audio(sources="microphone", type="filepath", label="Speak Here")
|
118 |
+
audio_upload = gr.Audio(sources="upload", type="filepath", label="Or Upload Audio File") # Added upload component
|
119 |
transcription_output = gr.Textbox(lines=5, label="Transcription and Vicuna Response")
|
120 |
audio_output = gr.Audio(label="Synthesized Speech", type="numpy", autoplay=True)
|
121 |
answer_mode = gr.Radio(["fast", "medium", "slow"], value='medium')
|
122 |
transcription_state = gr.State(value="")
|
123 |
+
|
124 |
mic_input.change(
|
125 |
fn=process_audio,
|
126 |
+
inputs=[mic_input, audio_upload, transcription_state, answer_mode], # Include audio_upload
|
127 |
+
outputs=[transcription_output, transcription_state, audio_output]
|
128 |
+
)
|
129 |
+
audio_upload.change( # Added change event for upload
|
130 |
+
fn=process_audio,
|
131 |
+
inputs=[mic_input, audio_upload, transcription_state, answer_mode], # Include audio_upload
|
132 |
outputs=[transcription_output, transcription_state, audio_output]
|
133 |
)
|
134 |
|