Update app.py
Browse files
app.py
CHANGED
@@ -9,7 +9,7 @@ import os
|
|
9 |
from transformers import pipeline, VitsModel, AutoTokenizer
|
10 |
from datasets import load_dataset
|
11 |
|
12 |
-
# For Coqui TTS (XTTS-v2)
|
13 |
try:
|
14 |
from TTS.api import TTS as CoquiTTS
|
15 |
except ImportError:
|
@@ -24,9 +24,10 @@ asr = pipeline(
|
|
24 |
)
|
25 |
|
26 |
# ------------------------------------------------------
|
27 |
-
# 2. Translation Models (8 languages)
|
28 |
# ------------------------------------------------------
|
29 |
translation_models = {
|
|
|
30 |
"Spanish": "Helsinki-NLP/opus-mt-en-es",
|
31 |
"Vietnamese": "Helsinki-NLP/opus-mt-en-vi",
|
32 |
"Indonesian": "Helsinki-NLP/opus-mt-en-id",
|
@@ -38,6 +39,7 @@ translation_models = {
|
|
38 |
}
|
39 |
|
40 |
translation_tasks = {
|
|
|
41 |
"Spanish": "translation_en_to_es",
|
42 |
"Vietnamese": "translation_en_to_vi",
|
43 |
"Indonesian": "translation_en_to_id",
|
@@ -50,10 +52,11 @@ translation_tasks = {
|
|
50 |
|
51 |
# ------------------------------------------------------
|
52 |
# 3. TTS Configuration
|
53 |
-
# - MMS TTS (VITS) for: Spanish, Vietnamese, Indonesian, Turkish, Portuguese, Korean
|
54 |
# - Coqui XTTS-v2 for: Chinese and Japanese
|
55 |
# ------------------------------------------------------
|
56 |
tts_config = {
|
|
|
57 |
"Spanish": {"model_id": "facebook/mms-tts-spa", "architecture": "vits", "type": "mms"},
|
58 |
"Vietnamese": {"model_id": "facebook/mms-tts-vie", "architecture": "vits", "type": "mms"},
|
59 |
"Indonesian": {"model_id": "facebook/mms-tts-ind", "architecture": "vits", "type": "mms"},
|
@@ -64,7 +67,7 @@ tts_config = {
|
|
64 |
"Japanese": {"type": "coqui"}
|
65 |
}
|
66 |
|
67 |
-
# For Coqui, map language names to language codes.
|
68 |
coqui_lang_map = {
|
69 |
"Chinese": "zh",
|
70 |
"Japanese": "ja"
|
@@ -74,8 +77,8 @@ coqui_lang_map = {
|
|
74 |
# 4. Global Caches for Translators and TTS Models
|
75 |
# ------------------------------------------------------
|
76 |
translator_cache = {}
|
77 |
-
mms_tts_cache = {}
|
78 |
-
coqui_tts_cache = None
|
79 |
|
80 |
# ------------------------------------------------------
|
81 |
# 5. Translator Helper
|
@@ -123,7 +126,6 @@ def load_coqui_tts():
|
|
123 |
if coqui_tts_cache is not None:
|
124 |
return coqui_tts_cache
|
125 |
try:
|
126 |
-
# Set gpu=True if a GPU is available.
|
127 |
coqui_tts_cache = CoquiTTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
|
128 |
except Exception as e:
|
129 |
raise RuntimeError(f"Failed to load Coqui XTTS-v2 TTS: {e}")
|
@@ -131,15 +133,14 @@ def load_coqui_tts():
|
|
131 |
|
132 |
def run_coqui_tts(text, lang):
|
133 |
coqui_tts = load_coqui_tts()
|
134 |
-
lang_code = coqui_lang_map[lang]
|
135 |
-
# Write the output to a temporary file and then read it back.
|
136 |
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
|
137 |
tmp_name = tmp.name
|
138 |
try:
|
139 |
coqui_tts.tts_to_file(
|
140 |
text=text,
|
141 |
file_path=tmp_name,
|
142 |
-
language=lang_code
|
143 |
)
|
144 |
data, sr = sf.read(tmp_name)
|
145 |
finally:
|
@@ -169,7 +170,7 @@ def predict(audio, text, target_language):
|
|
169 |
audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
|
170 |
asr_input = {"array": audio_data, "sampling_rate": 16000}
|
171 |
asr_result = asr(asr_input)
|
172 |
-
english_text = asr_result["text"]
|
173 |
else:
|
174 |
return "No input provided.", "", None
|
175 |
|
@@ -199,7 +200,7 @@ def predict(audio, text, target_language):
|
|
199 |
# 9. Gradio Interface
|
200 |
# ------------------------------------------------------
|
201 |
language_choices = [
|
202 |
-
"Spanish", "Vietnamese", "Indonesian", "Turkish", "Portuguese", "Korean", "Chinese", "Japanese"
|
203 |
]
|
204 |
|
205 |
iface = gr.Interface(
|
@@ -207,7 +208,7 @@ iface = gr.Interface(
|
|
207 |
inputs=[
|
208 |
gr.Audio(type="numpy", label="Record/Upload English Audio (optional)"),
|
209 |
gr.Textbox(lines=4, placeholder="Or enter English text here", label="English Text Input (optional)"),
|
210 |
-
gr.Dropdown(choices=language_choices, value="
|
211 |
],
|
212 |
outputs=[
|
213 |
gr.Textbox(label="English Transcription"),
|
@@ -216,14 +217,16 @@ iface = gr.Interface(
|
|
216 |
],
|
217 |
title="Multimodal Language Learning Aid",
|
218 |
description=(
|
219 |
-
"This app performs the following
|
220 |
-
"1. Transcribes English speech using Wav2Vec2 (
|
221 |
"2. Translates the English text to the target language using Helsinki-NLP models.\n"
|
222 |
-
"3. Provides
|
223 |
-
"For Spanish, Vietnamese, Indonesian, Turkish, Portuguese, and Korean
|
|
|
|
|
224 |
),
|
225 |
allow_flagging="never"
|
226 |
)
|
227 |
|
228 |
if __name__ == "__main__":
|
229 |
-
iface.launch(server_name="0.0.0.0", server_port=7860)
|
|
|
9 |
from transformers import pipeline, VitsModel, AutoTokenizer
|
10 |
from datasets import load_dataset
|
11 |
|
12 |
+
# For Coqui TTS (XTTS-v2) used for Chinese and Japanese
|
13 |
try:
|
14 |
from TTS.api import TTS as CoquiTTS
|
15 |
except ImportError:
|
|
|
24 |
)
|
25 |
|
26 |
# ------------------------------------------------------
|
27 |
+
# 2. Translation Models (9 languages)
|
28 |
# ------------------------------------------------------
|
29 |
translation_models = {
|
30 |
+
"French": "Helsinki-NLP/opus-mt-en-fr",
|
31 |
"Spanish": "Helsinki-NLP/opus-mt-en-es",
|
32 |
"Vietnamese": "Helsinki-NLP/opus-mt-en-vi",
|
33 |
"Indonesian": "Helsinki-NLP/opus-mt-en-id",
|
|
|
39 |
}
|
40 |
|
41 |
translation_tasks = {
|
42 |
+
"French": "translation_en_to_fr",
|
43 |
"Spanish": "translation_en_to_es",
|
44 |
"Vietnamese": "translation_en_to_vi",
|
45 |
"Indonesian": "translation_en_to_id",
|
|
|
52 |
|
53 |
# ------------------------------------------------------
|
54 |
# 3. TTS Configuration
|
55 |
+
# - MMS TTS (VITS) for: French, Spanish, Vietnamese, Indonesian, Turkish, Portuguese, Korean
|
56 |
# - Coqui XTTS-v2 for: Chinese and Japanese
|
57 |
# ------------------------------------------------------
|
58 |
tts_config = {
|
59 |
+
"French": {"model_id": "facebook/mms-tts-fra", "architecture": "vits", "type": "mms"},
|
60 |
"Spanish": {"model_id": "facebook/mms-tts-spa", "architecture": "vits", "type": "mms"},
|
61 |
"Vietnamese": {"model_id": "facebook/mms-tts-vie", "architecture": "vits", "type": "mms"},
|
62 |
"Indonesian": {"model_id": "facebook/mms-tts-ind", "architecture": "vits", "type": "mms"},
|
|
|
67 |
"Japanese": {"type": "coqui"}
|
68 |
}
|
69 |
|
70 |
+
# For Coqui, map languages to expected language codes.
|
71 |
coqui_lang_map = {
|
72 |
"Chinese": "zh",
|
73 |
"Japanese": "ja"
|
|
|
77 |
# 4. Global Caches for Translators and TTS Models
|
78 |
# ------------------------------------------------------
|
79 |
translator_cache = {}
|
80 |
+
mms_tts_cache = {}
|
81 |
+
coqui_tts_cache = None
|
82 |
|
83 |
# ------------------------------------------------------
|
84 |
# 5. Translator Helper
|
|
|
126 |
if coqui_tts_cache is not None:
|
127 |
return coqui_tts_cache
|
128 |
try:
|
|
|
129 |
coqui_tts_cache = CoquiTTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
|
130 |
except Exception as e:
|
131 |
raise RuntimeError(f"Failed to load Coqui XTTS-v2 TTS: {e}")
|
|
|
133 |
|
134 |
def run_coqui_tts(text, lang):
|
135 |
coqui_tts = load_coqui_tts()
|
136 |
+
lang_code = coqui_lang_map[lang]
|
|
|
137 |
with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
|
138 |
tmp_name = tmp.name
|
139 |
try:
|
140 |
coqui_tts.tts_to_file(
|
141 |
text=text,
|
142 |
file_path=tmp_name,
|
143 |
+
language=lang_code
|
144 |
)
|
145 |
data, sr = sf.read(tmp_name)
|
146 |
finally:
|
|
|
170 |
audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
|
171 |
asr_input = {"array": audio_data, "sampling_rate": 16000}
|
172 |
asr_result = asr(asr_input)
|
173 |
+
english_text = asr_result["text"].lower()
|
174 |
else:
|
175 |
return "No input provided.", "", None
|
176 |
|
|
|
200 |
# 9. Gradio Interface
|
201 |
# ------------------------------------------------------
|
202 |
language_choices = [
|
203 |
+
"French", "Spanish", "Vietnamese", "Indonesian", "Turkish", "Portuguese", "Korean", "Chinese", "Japanese"
|
204 |
]
|
205 |
|
206 |
iface = gr.Interface(
|
|
|
208 |
inputs=[
|
209 |
gr.Audio(type="numpy", label="Record/Upload English Audio (optional)"),
|
210 |
gr.Textbox(lines=4, placeholder="Or enter English text here", label="English Text Input (optional)"),
|
211 |
+
gr.Dropdown(choices=language_choices, value="French", label="Target Language")
|
212 |
],
|
213 |
outputs=[
|
214 |
gr.Textbox(label="English Transcription"),
|
|
|
217 |
],
|
218 |
title="Multimodal Language Learning Aid",
|
219 |
description=(
|
220 |
+
"This app performs the following tasks:\n"
|
221 |
+
"1. Transcribes English speech using Wav2Vec2 (accepts text input as well).\n"
|
222 |
"2. Translates the English text to the target language using Helsinki-NLP models.\n"
|
223 |
+
"3. Provides speech:\n"
|
224 |
+
" - For French, Spanish, Vietnamese, Indonesian, Turkish, Portuguese, and Korean: uses Facebook MMS TTS (VITS-based).\n"
|
225 |
+
" - For Chinese and Japanese: uses myshell-ai MeloTTS models (work-in-progress).\n"
|
226 |
+
"\nSelect your target language from the dropdown."
|
227 |
),
|
228 |
allow_flagging="never"
|
229 |
)
|
230 |
|
231 |
if __name__ == "__main__":
|
232 |
+
iface.launch(server_name="0.0.0.0", server_port=7860)
|