Create tts_infer.py
lib/tts_infer.py · ADDED (+221 −0)
# Inference helpers wrapping the RVC voice-conversion pipeline (lib.modules.VC).
import os
import shutil
import gc
import torch
from multiprocessing import cpu_count
from lib.modules import VC
from lib.split_audio import split_silence_nonsilent, adjust_audio_lengths, combine_silence_nonsilent


class Configs:
    def __init__(self, device, is_half):
        self.device = device
        self.is_half = is_half
        self.n_cpu = 0
        self.gpu_name = None
        self.gpu_mem = None
        self.x_pad, self.x_query, self.x_center, self.x_max = self.device_config()

    def device_config(self) -> tuple:
        if torch.cuda.is_available():
            i_device = int(self.device.split(":")[-1])
            self.gpu_name = torch.cuda.get_device_name(i_device)
            # Disabled upstream heuristics: forcing single precision on
            # 16-series/10-series/P40 cards and rewriting configs on low VRAM.
            # if (
            #     ("16" in self.gpu_name and "V100" not in self.gpu_name.upper())
            #     or "P40" in self.gpu_name.upper()
            #     or "1060" in self.gpu_name
            #     or "1070" in self.gpu_name
            #     or "1080" in self.gpu_name
            # ):
            #     print("16 series/10 series P40 forced single precision")
            #     self.is_half = False
            #     for config_file in ["32k.json", "40k.json", "48k.json"]:
            #         with open(BASE_DIR / "src" / "configs" / config_file, "r") as f:
            #             strr = f.read().replace("true", "false")
            #         with open(BASE_DIR / "src" / "configs" / config_file, "w") as f:
            #             f.write(strr)
            #     with open(BASE_DIR / "src" / "trainset_preprocess_pipeline_print.py", "r") as f:
            #         strr = f.read().replace("3.7", "3.0")
            #     with open(BASE_DIR / "src" / "trainset_preprocess_pipeline_print.py", "w") as f:
            #         f.write(strr)
            # else:
            #     self.gpu_name = None
            # self.gpu_mem = int(
            #     torch.cuda.get_device_properties(i_device).total_memory
            #     / 1024
            #     / 1024
            #     / 1024
            #     + 0.4
            # )
            # if self.gpu_mem <= 4:
            #     with open(BASE_DIR / "src" / "trainset_preprocess_pipeline_print.py", "r") as f:
            #         strr = f.read().replace("3.7", "3.0")
            #     with open(BASE_DIR / "src" / "trainset_preprocess_pipeline_print.py", "w") as f:
            #         f.write(strr)
        elif torch.backends.mps.is_available():
            print("No supported NVIDIA GPU found, using MPS for inference.")
            self.device = "mps"
        else:
            print("No supported NVIDIA GPU found, using CPU for inference.")
            self.device = "cpu"

        if self.n_cpu == 0:
            self.n_cpu = cpu_count()

        # Slicing window sizes (in seconds) used by the inference pipeline.
        if self.is_half:
            # 6 GB VRAM configuration
            x_pad = 3
            x_query = 10
            x_center = 60
            x_max = 65
        else:
            # 5 GB VRAM configuration
            x_pad = 1
            x_query = 6
            x_center = 38
            x_max = 41

        if self.gpu_mem is not None and self.gpu_mem <= 4:
            # Tighter windows for GPUs with 4 GB of VRAM or less.
            x_pad = 1
            x_query = 5
            x_center = 30
            x_max = 32

        return x_pad, x_query, x_center, x_max

def get_model(voice_model):
    # Locate the .pth weights (and optional .index file) inside models/<voice_model>/.
    model_dir = os.path.join(os.getcwd(), "models", voice_model)
    model_filename, index_filename = None, None
    for file in os.listdir(model_dir):
        ext = os.path.splitext(file)[1]
        if ext == ".pth":
            model_filename = file
        if ext == ".index":
            index_filename = file

    if model_filename is None:
        print(f"No model file exists in {model_dir}.")
        return None, None

    return os.path.join(model_dir, model_filename), os.path.join(model_dir, index_filename) if index_filename else ""

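# Example (hypothetical names): with models/MyVoice/MyVoice.pth and an optional
# models/MyVoice/MyVoice.index on disk,
#
#     pth_path, index_path = get_model("MyVoice")
#
# returns absolute paths to both files; index_path is "" when no .index file
# exists, and (None, None) is returned when no .pth file is found.
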
def infer_audio(
    model_name,
    audio_path,
    f0_change=0,
    f0_method="rmvpe",
    min_pitch="50",
    max_pitch="1100",
    crepe_hop_length=128,
    index_rate=0.75,
    filter_radius=3,
    rms_mix_rate=0.25,
    protect=0.33,
    split_infer=False,
    min_silence=500,
    silence_threshold=-50,
    seek_step=1,
    keep_silence=100,
    do_formant=False,
    quefrency=0,
    timbre=1,
    f0_autotune=False,
    audio_format="wav",
    resample_sr=0,
    hubert_model_path="hubert_base.pt",
    rmvpe_model_path="rmvpe.pt",
    fcpe_model_path="fcpe.pt"
):
    os.environ["rmvpe_model_path"] = rmvpe_model_path
    os.environ["fcpe_model_path"] = fcpe_model_path
    configs = Configs("cuda:0", True)
    vc = VC(configs)
    pth_path, index_path = get_model(model_name)
    if pth_path is None:
        # No .pth file was found for this voice model; nothing to infer with.
        return None
    vc_data = vc.get_vc(pth_path, protect, 0.5)

    if split_infer:
        inferred_files = []
        temp_dir = os.path.join(os.getcwd(), "seperate", "temp")
        os.makedirs(temp_dir, exist_ok=True)
        print("Splitting audio into silent and nonsilent segments.")
        silence_files, nonsilent_files = split_silence_nonsilent(audio_path, min_silence, silence_threshold, seek_step, keep_silence)
        print(f"Total silence segments: {len(silence_files)}.\nTotal nonsilent segments: {len(nonsilent_files)}.")
        for i, nonsilent_file in enumerate(nonsilent_files):
            print(f"Inferring nonsilent audio {i+1}")
            inference_info, audio_data, output_path = vc.vc_single(
                0,
                nonsilent_file,
                f0_change,
                f0_method,
                index_path,
                index_path,
                index_rate,
                filter_radius,
                resample_sr,
                rms_mix_rate,
                protect,
                audio_format,
                crepe_hop_length,
                do_formant,
                quefrency,
                timbre,
                min_pitch,
                max_pitch,
                f0_autotune,
                hubert_model_path
            )
            if inference_info[0] == "Success.":
                print("Inference ran successfully.")
                print(inference_info[1])
                print("Times:\nnpy: %.2fs f0: %.2fs infer: %.2fs\nTotal time: %.2fs" % (*inference_info[2],))
            else:
                print(f"An error occurred while processing.\n{inference_info[0]}")
                return None
            inferred_files.append(output_path)
        print("Adjusting inferred audio lengths.")
        adjusted_inferred_files = adjust_audio_lengths(nonsilent_files, inferred_files)
        print("Combining silence and inferred audios.")
        output_count = 1
        while True:
            # Pick the first output filename that does not already exist.
            output_path = os.path.join(os.getcwd(), "output", f"{os.path.splitext(os.path.basename(audio_path))[0]}_{model_name}_{f0_method.capitalize()}_{output_count}.{audio_format}")
            if not os.path.exists(output_path):
                break
            output_count += 1
        output_path = combine_silence_nonsilent(silence_files, adjusted_inferred_files, keep_silence, output_path)
        for inferred_file in inferred_files:
            shutil.move(inferred_file, temp_dir)
        shutil.rmtree(temp_dir)
    else:
        inference_info, audio_data, output_path = vc.vc_single(
            0,
            audio_path,
            f0_change,
            f0_method,
            index_path,
            index_path,
            index_rate,
            filter_radius,
            resample_sr,
            rms_mix_rate,
            protect,
            audio_format,
            crepe_hop_length,
            do_formant,
            quefrency,
            timbre,
            min_pitch,
            max_pitch,
            f0_autotune,
            hubert_model_path
        )
        if inference_info[0] == "Success.":
            print("Inference ran successfully.")
            print(inference_info[1])
            print("Times:\nnpy: %.2fs f0: %.2fs infer: %.2fs\nTotal time: %.2fs" % (*inference_info[2],))
        else:
            print(f"An error occurred while processing.\n{inference_info[0]}")
            del configs, vc
            gc.collect()
            return inference_info[0]

    del configs, vc
    gc.collect()
    return output_path
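Below is a minimal, hypothetical usage sketch (not part of the committed file). It assumes the script runs from the repository root, that models/MyVoice/ contains the voice model's .pth (and optionally .index) file, and that hubert_base.pt and rmvpe.pt sit in the working directory, as the parameter defaults expect.

# Hypothetical example: convert input.wav using the "MyVoice" model folder.
from lib.tts_infer import infer_audio

output = infer_audio(
    "MyVoice",         # model_name: folder under models/
    "input.wav",       # audio_path: file to convert
    f0_change=0,       # pitch shift in semitones
    f0_method="rmvpe",
    split_infer=True,  # split on silence, useful for long inputs
)
print("Converted file:", output)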