Yilin0601 committed on
Commit
1ac59b7
·
verified ·
1 Parent(s): f023c8e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -18
app.py CHANGED
@@ -9,7 +9,7 @@ import os
9
  from transformers import pipeline, VitsModel, AutoTokenizer
10
  from datasets import load_dataset
11
 
12
- # For Coqui TTS (XTTS-v2)
13
  try:
14
  from TTS.api import TTS as CoquiTTS
15
  except ImportError:
@@ -24,9 +24,10 @@ asr = pipeline(
24
  )
25
 
26
  # ------------------------------------------------------
27
- # 2. Translation Models (8 languages)
28
  # ------------------------------------------------------
29
  translation_models = {
 
30
  "Spanish": "Helsinki-NLP/opus-mt-en-es",
31
  "Vietnamese": "Helsinki-NLP/opus-mt-en-vi",
32
  "Indonesian": "Helsinki-NLP/opus-mt-en-id",
@@ -38,6 +39,7 @@ translation_models = {
38
  }
39
 
40
  translation_tasks = {
 
41
  "Spanish": "translation_en_to_es",
42
  "Vietnamese": "translation_en_to_vi",
43
  "Indonesian": "translation_en_to_id",
@@ -50,10 +52,11 @@ translation_tasks = {
50
 
51
  # ------------------------------------------------------
52
  # 3. TTS Configuration
53
- # - MMS TTS (VITS) for: Spanish, Vietnamese, Indonesian, Turkish, Portuguese, Korean
54
  # - Coqui XTTS-v2 for: Chinese and Japanese
55
  # ------------------------------------------------------
56
  tts_config = {
 
57
  "Spanish": {"model_id": "facebook/mms-tts-spa", "architecture": "vits", "type": "mms"},
58
  "Vietnamese": {"model_id": "facebook/mms-tts-vie", "architecture": "vits", "type": "mms"},
59
  "Indonesian": {"model_id": "facebook/mms-tts-ind", "architecture": "vits", "type": "mms"},
@@ -64,7 +67,7 @@ tts_config = {
64
  "Japanese": {"type": "coqui"}
65
  }
66
 
67
- # For Coqui, we map our languages to language codes expected by the model.
68
  coqui_lang_map = {
69
  "Chinese": "zh",
70
  "Japanese": "ja"
@@ -74,8 +77,8 @@ coqui_lang_map = {
74
  # 4. Global Caches for Translators and TTS Models
75
  # ------------------------------------------------------
76
  translator_cache = {}
77
- mms_tts_cache = {} # For MMS (VITS-based) TTS models
78
- coqui_tts_cache = None # Single instance for Coqui XTTS-v2
79
 
80
  # ------------------------------------------------------
81
  # 5. Translator Helper
@@ -123,7 +126,6 @@ def load_coqui_tts():
123
  if coqui_tts_cache is not None:
124
  return coqui_tts_cache
125
  try:
126
- # Set gpu=True if a GPU is available.
127
  coqui_tts_cache = CoquiTTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
128
  except Exception as e:
129
  raise RuntimeError(f"Failed to load Coqui XTTS-v2 TTS: {e}")
@@ -131,15 +133,14 @@ def load_coqui_tts():
131
 
132
  def run_coqui_tts(text, lang):
133
  coqui_tts = load_coqui_tts()
134
- lang_code = coqui_lang_map[lang] # "zh" for Chinese or "ja" for Japanese
135
- # Write the output to a temporary file and then read it back.
136
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
137
  tmp_name = tmp.name
138
  try:
139
  coqui_tts.tts_to_file(
140
  text=text,
141
  file_path=tmp_name,
142
- language=lang_code # using default voice; for cloning, add speaker_wav parameter
143
  )
144
  data, sr = sf.read(tmp_name)
145
  finally:
@@ -169,7 +170,7 @@ def predict(audio, text, target_language):
169
  audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
170
  asr_input = {"array": audio_data, "sampling_rate": 16000}
171
  asr_result = asr(asr_input)
172
- english_text = asr_result["text"]
173
  else:
174
  return "No input provided.", "", None
175
 
@@ -199,7 +200,7 @@ def predict(audio, text, target_language):
199
  # 9. Gradio Interface
200
  # ------------------------------------------------------
201
  language_choices = [
202
- "Spanish", "Vietnamese", "Indonesian", "Turkish", "Portuguese", "Korean", "Chinese", "Japanese"
203
  ]
204
 
205
  iface = gr.Interface(
@@ -207,7 +208,7 @@ iface = gr.Interface(
207
  inputs=[
208
  gr.Audio(type="numpy", label="Record/Upload English Audio (optional)"),
209
  gr.Textbox(lines=4, placeholder="Or enter English text here", label="English Text Input (optional)"),
210
- gr.Dropdown(choices=language_choices, value="Spanish", label="Target Language")
211
  ],
212
  outputs=[
213
  gr.Textbox(label="English Transcription"),
@@ -216,14 +217,16 @@ iface = gr.Interface(
216
  ],
217
  title="Multimodal Language Learning Aid",
218
  description=(
219
- "This app performs the following steps:\n"
220
- "1. Transcribes English speech using Wav2Vec2 (or accepts text input).\n"
221
  "2. Translates the English text to the target language using Helsinki-NLP models.\n"
222
- "3. Provides Synthetic speech:\n"
223
- "For Spanish, Vietnamese, Indonesian, Turkish, Portuguese, and Korean."
 
 
224
  ),
225
  allow_flagging="never"
226
  )
227
 
228
  if __name__ == "__main__":
229
- iface.launch(server_name="0.0.0.0", server_port=7860)
 
9
  from transformers import pipeline, VitsModel, AutoTokenizer
10
  from datasets import load_dataset
11
 
12
+ # For Coqui TTS (XTTS-v2) used for Chinese and Japanese
13
  try:
14
  from TTS.api import TTS as CoquiTTS
15
  except ImportError:
 
24
  )
25
 
26
  # ------------------------------------------------------
27
+ # 2. Translation Models (9 languages)
28
  # ------------------------------------------------------
29
  translation_models = {
30
+ "French": "Helsinki-NLP/opus-mt-en-fr",
31
  "Spanish": "Helsinki-NLP/opus-mt-en-es",
32
  "Vietnamese": "Helsinki-NLP/opus-mt-en-vi",
33
  "Indonesian": "Helsinki-NLP/opus-mt-en-id",
 
39
  }
40
 
41
  translation_tasks = {
42
+ "French": "translation_en_to_fr",
43
  "Spanish": "translation_en_to_es",
44
  "Vietnamese": "translation_en_to_vi",
45
  "Indonesian": "translation_en_to_id",
 
52
 
53
  # ------------------------------------------------------
54
  # 3. TTS Configuration
55
+ # - MMS TTS (VITS) for: French, Spanish, Vietnamese, Indonesian, Turkish, Portuguese, Korean
56
  # - Coqui XTTS-v2 for: Chinese and Japanese
57
  # ------------------------------------------------------
58
  tts_config = {
59
+ "French": {"model_id": "facebook/mms-tts-fra", "architecture": "vits", "type": "mms"},
60
  "Spanish": {"model_id": "facebook/mms-tts-spa", "architecture": "vits", "type": "mms"},
61
  "Vietnamese": {"model_id": "facebook/mms-tts-vie", "architecture": "vits", "type": "mms"},
62
  "Indonesian": {"model_id": "facebook/mms-tts-ind", "architecture": "vits", "type": "mms"},
 
67
  "Japanese": {"type": "coqui"}
68
  }
69
 
70
+ # For Coqui, map languages to expected language codes.
71
  coqui_lang_map = {
72
  "Chinese": "zh",
73
  "Japanese": "ja"
 
77
  # 4. Global Caches for Translators and TTS Models
78
  # ------------------------------------------------------
79
  translator_cache = {}
80
+ mms_tts_cache = {}
81
+ coqui_tts_cache = None
82
 
83
  # ------------------------------------------------------
84
  # 5. Translator Helper
 
126
  if coqui_tts_cache is not None:
127
  return coqui_tts_cache
128
  try:
 
129
  coqui_tts_cache = CoquiTTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
130
  except Exception as e:
131
  raise RuntimeError(f"Failed to load Coqui XTTS-v2 TTS: {e}")
 
133
 
134
  def run_coqui_tts(text, lang):
135
  coqui_tts = load_coqui_tts()
136
+ lang_code = coqui_lang_map[lang]
 
137
  with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
138
  tmp_name = tmp.name
139
  try:
140
  coqui_tts.tts_to_file(
141
  text=text,
142
  file_path=tmp_name,
143
+ language=lang_code
144
  )
145
  data, sr = sf.read(tmp_name)
146
  finally:
 
170
  audio_data = librosa.resample(audio_data, orig_sr=sample_rate, target_sr=16000)
171
  asr_input = {"array": audio_data, "sampling_rate": 16000}
172
  asr_result = asr(asr_input)
173
+ english_text = asr_result["text"].lower()
174
  else:
175
  return "No input provided.", "", None
176
 
 
200
  # 9. Gradio Interface
201
  # ------------------------------------------------------
202
  language_choices = [
203
+ "French", "Spanish", "Vietnamese", "Indonesian", "Turkish", "Portuguese", "Korean", "Chinese", "Japanese"
204
  ]
205
 
206
  iface = gr.Interface(
 
208
  inputs=[
209
  gr.Audio(type="numpy", label="Record/Upload English Audio (optional)"),
210
  gr.Textbox(lines=4, placeholder="Or enter English text here", label="English Text Input (optional)"),
211
+ gr.Dropdown(choices=language_choices, value="French", label="Target Language")
212
  ],
213
  outputs=[
214
  gr.Textbox(label="English Transcription"),
 
217
  ],
218
  title="Multimodal Language Learning Aid",
219
  description=(
220
+ "This app performs the following tasks:\n"
221
+ "1. Transcribes English speech using Wav2Vec2 (accepts text input as well).\n"
222
  "2. Translates the English text to the target language using Helsinki-NLP models.\n"
223
+ "3. Provides speech:\n"
224
+ " - For French, Spanish, Vietnamese, Indonesian, Turkish, Portuguese, and Korean: uses Facebook MMS TTS (VITS-based).\n"
225
+ " - For Chinese and Japanese: uses myshell-ai MeloTTS models (work-in-progress).\n"
226
+ "\nSelect your target language from the dropdown."
227
  ),
228
  allow_flagging="never"
229
  )
230
 
231
  if __name__ == "__main__":
232
+ iface.launch(server_name="0.0.0.0", server_port=7860)