Hev832 committed
Commit 199aa90 (verified) · Parent(s): f66bd89

Create tts_infer.py

Files changed (1): lib/tts_infer.py (+221, -0)
lib/tts_infer.py ADDED
@@ -0,0 +1,221 @@
import os
import shutil
import gc
import torch
from multiprocessing import cpu_count

from lib.modules import VC
from lib.split_audio import split_silence_nonsilent, adjust_audio_lengths, combine_silence_nonsilent

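# Overview of this module: it resolves a .pth/.index model pair from
# ./models/<name>, configures the torch device, and runs vc.vc_single either
# over the whole input file or per non-silent segment when split_infer is set.
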
class Configs:
    def __init__(self, device, is_half):
        self.device = device
        self.is_half = is_half
        self.n_cpu = 0
        self.gpu_name = None
        self.gpu_mem = None
        self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()

    def device_config(self) -> tuple:
        if torch.cuda.is_available():
            i_device = int(self.device.split(":")[-1])
            self.gpu_name = torch.cuda.get_device_name(i_device)
            # Upstream precision/VRAM special-casing, left commented out here:
            # if (
            #     ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
            #     or "P40" in self.gpu_name.upper()
            #     or "1060" in self.gpu_name
            #     or "1070" in self.gpu_name
            #     or "1080" in self.gpu_name
            # ):
            #     print("16 series/10 series P40 forced single precision")
            #     self.is_half = False
            #     for config_file in ["32k.json", "40k.json", "48k.json"]:
            #         with open(BASE_DIR / "src" / "configs" / config_file, "r") as f:
            #             strr = f.read().replace("true", "false")
            #         with open(BASE_DIR / "src" / "configs" / config_file, "w") as f:
            #             f.write(strr)
            #     with open(BASE_DIR / "src" / "trainset_preprocess_pipeline_print.py", "r") as f:
            #         strr = f.read().replace("3.7", "3.0")
            #     with open(BASE_DIR / "src" / "trainset_preprocess_pipeline_print.py", "w") as f:
            #         f.write(strr)
            # else:
            #     self.gpu_name = None
            # self.gpu_mem = int(
            #     torch.cuda.get_device_properties(i_device).total_memory
            #     / 1024
            #     / 1024
            #     / 1024
            #     + 0.4
            # )
            # if self.gpu_mem <= 4:
            #     with open(BASE_DIR / "src" / "trainset_preprocess_pipeline_print.py", "r") as f:
            #         strr = f.read().replace("3.7", "3.0")
            #     with open(BASE_DIR / "src" / "trainset_preprocess_pipeline_print.py", "w") as f:
            #         f.write(strr)
        elif torch.backends.mps.is_available():
            print("No supported Nvidia GPU found, using MPS for inference.")
            self.device = "mps"
        else:
            print("No supported Nvidia GPU found, using CPU for inference.")
            self.device = "cpu"

        if self.n_cpu == 0:
            self.n_cpu = cpu_count()

        # Pipeline chunking parameters (padding, query window, chunk center and
        # maximum chunk length, in seconds), sized to the available memory.
        if self.is_half:
            # 6 GB memory config
            x_pad = 3
            x_query = 10
            x_center = 60
            x_max = 65
        else:
            # 5 GB memory config
            x_pad = 1
            x_query = 6
            x_center = 38
            x_max = 41

        if self.gpu_mem is not None and self.gpu_mem <= 4:
            x_pad = 1
            x_query = 5
            x_center = 30
            x_max = 32

        return x_pad, x_query, x_center, x_max

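# A minimal sketch of how the class above resolves its settings (a single-GPU
# machine is assumed; "cuda:0" falls back to MPS or CPU when CUDA is absent):
#
#     configs = Configs("cuda:0", is_half=True)
#     print(configs.device, configs.x_pad, configs.x_query, configs.x_center, configs.x_max)
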
def get_model(voice_model):
    model_dir = os.path.join(os.getcwd(), "models", voice_model)
    model_filename, index_filename = None, None
    for file in os.listdir(model_dir):
        ext = os.path.splitext(file)[1]
        if ext == '.pth':
            model_filename = file
        if ext == '.index':
            index_filename = file

    if model_filename is None:
        print(f'No model file exists in {model_dir}.')
        return None, None

    return os.path.join(model_dir, model_filename), os.path.join(model_dir, index_filename) if index_filename else ''
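
# Example with a hypothetical layout: if models/MyVoice/ holds "MyVoice.pth"
# and "added_IVF256_Flat_nprobe_1.index", get_model("MyVoice") returns the two
# absolute paths; the index path comes back as '' when no .index file exists.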

def infer_audio(
    model_name,
    audio_path,
    f0_change=0,
    f0_method="rmvpe",
    min_pitch="50",
    max_pitch="1100",
    crepe_hop_length=128,
    index_rate=0.75,
    filter_radius=3,
    rms_mix_rate=0.25,
    protect=0.33,
    split_infer=False,
    min_silence=500,
    silence_threshold=-50,
    seek_step=1,
    keep_silence=100,
    do_formant=False,
    quefrency=0,
    timbre=1,
    f0_autotune=False,
    audio_format="wav",
    resample_sr=0,
    hubert_model_path="hubert_base.pt",
    rmvpe_model_path="rmvpe.pt",
    fcpe_model_path="fcpe.pt"
):
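    # Assumed semantics, inferred from how the values are used below: f0_change
    # is a pitch shift in semitones, min_pitch/max_pitch bound the detected f0,
    # and split_infer cuts the input on silence, infers each non-silent segment
    # separately, then stitches the results back together.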
    os.environ["rmvpe_model_path"] = rmvpe_model_path
    os.environ["fcpe_model_path"] = fcpe_model_path
    # Hard-coded to cuda:0 / half precision; Configs.device_config() falls
    # back to MPS or CPU when CUDA is unavailable.
    configs = Configs('cuda:0', True)
    vc = VC(configs)
    pth_path, index_path = get_model(model_name)
    vc_data = vc.get_vc(pth_path, protect, 0.5)

    if split_infer:
        inferred_files = []
        temp_dir = os.path.join(os.getcwd(), "separate", "temp")
        os.makedirs(temp_dir, exist_ok=True)
        print("Splitting audio into silent and non-silent segments.")
        silence_files, nonsilent_files = split_silence_nonsilent(audio_path, min_silence, silence_threshold, seek_step, keep_silence)
        print(f"Total silence segments: {len(silence_files)}.\nTotal nonsilent segments: {len(nonsilent_files)}.")
        for i, nonsilent_file in enumerate(nonsilent_files):
            print(f"Inferring nonsilent audio {i+1}")
            inference_info, audio_data, output_path = vc.vc_single(
                0,
                nonsilent_file,
                f0_change,
                f0_method,
                index_path,
                index_path,
                index_rate,
                filter_radius,
                resample_sr,
                rms_mix_rate,
                protect,
                audio_format,
                crepe_hop_length,
                do_formant,
                quefrency,
                timbre,
                min_pitch,
                max_pitch,
                f0_autotune,
                hubert_model_path
            )
            if inference_info[0] == "Success.":
                print("Inference ran successfully.")
                print(inference_info[1])
                print("Times:\nnpy: %.2fs f0: %.2fs infer: %.2fs\nTotal time: %.2fs" % (*inference_info[2],))
            else:
                print(f"An error occurred while processing.\n{inference_info[0]}")
                return None
            inferred_files.append(output_path)
        print("Adjusting inferred audio lengths.")
        adjusted_inferred_files = adjust_audio_lengths(nonsilent_files, inferred_files)
        print("Combining silence and inferred audios.")
        # Pick the first unused output filename.
        output_count = 1
        while True:
            output_path = os.path.join(os.getcwd(), "output", f"{os.path.splitext(os.path.basename(audio_path))[0]}{model_name}{f0_method.capitalize()}_{output_count}.{audio_format}")
            if not os.path.exists(output_path):
                break
            output_count += 1
        output_path = combine_silence_nonsilent(silence_files, adjusted_inferred_files, keep_silence, output_path)
        for inferred_file in inferred_files:
            shutil.move(inferred_file, temp_dir)
        shutil.rmtree(temp_dir)
    else:
        inference_info, audio_data, output_path = vc.vc_single(
            0,
            audio_path,
            f0_change,
            f0_method,
            index_path,
            index_path,
            index_rate,
            filter_radius,
            resample_sr,
            rms_mix_rate,
            protect,
            audio_format,
            crepe_hop_length,
            do_formant,
            quefrency,
            timbre,
            min_pitch,
            max_pitch,
            f0_autotune,
            hubert_model_path
        )
        if inference_info[0] == "Success.":
            print("Inference ran successfully.")
            print(inference_info[1])
            print("Times:\nnpy: %.2fs f0: %.2fs infer: %.2fs\nTotal time: %.2fs" % (*inference_info[2],))
        else:
            print(f"An error occurred while processing.\n{inference_info[0]}")
            del configs, vc
            gc.collect()
            return inference_info[0]

    del configs, vc
    gc.collect()
    return output_path
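
For reference, calling into the new module might look like the sketch below. The model name "MyVoice" and input "vocals.wav" are placeholders, and the call assumes hubert_base.pt and rmvpe.pt sit in the working directory (the defaults above):

    from lib.tts_infer import infer_audio

    # Hypothetical inputs: a voice model under models/MyVoice/ and a local WAV file.
    output = infer_audio(
        model_name="MyVoice",
        audio_path="vocals.wav",
        f0_change=0,           # pitch shift in semitones
        f0_method="rmvpe",
        split_infer=True,      # split on silence, infer segments, recombine
    )
    print(f"Converted audio written to: {output}")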