Eempostor committed · verified
Commit ce3f7f0 · 1 parent: a059a91

Delete lib/infer_libs/infer_pack/models.py

Files changed (1)
  1. lib/infer_libs/infer_pack/models.py +0 -1174
lib/infer_libs/infer_pack/models.py DELETED
@@ -1,1174 +0,0 @@
- import math
- import logging
-
- logger = logging.getLogger(__name__)
-
- import numpy as np
- import torch
- from torch import nn
- from torch.nn import Conv1d, Conv2d, ConvTranspose1d
- from torch.nn import functional as F
- from torch.nn.utils import remove_weight_norm, spectral_norm, weight_norm
-
- from lib.infer.infer_libs.infer_pack import attentions, commons, modules
- from lib.infer.infer_libs.infer_pack.commons import get_padding, init_weights
- has_xpu = bool(hasattr(torch, "xpu") and torch.xpu.is_available())
-
- class TextEncoder256(nn.Module):
-     def __init__(
-         self,
-         out_channels,
-         hidden_channels,
-         filter_channels,
-         n_heads,
-         n_layers,
-         kernel_size,
-         p_dropout,
-         f0=True,
-     ):
-         super().__init__()
-         self.out_channels = out_channels
-         self.hidden_channels = hidden_channels
-         self.filter_channels = filter_channels
-         self.n_heads = n_heads
-         self.n_layers = n_layers
-         self.kernel_size = kernel_size
-         self.p_dropout = p_dropout
-         self.emb_phone = nn.Linear(256, hidden_channels)
-         self.lrelu = nn.LeakyReLU(0.1, inplace=True)
-         if f0 == True:
-             self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
-         self.encoder = attentions.Encoder(
-             hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
-         )
-         self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
-
-     def forward(self, phone, pitch, lengths):
-         if pitch == None:
-             x = self.emb_phone(phone)
-         else:
-             x = self.emb_phone(phone) + self.emb_pitch(pitch)
-         x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
-         x = self.lrelu(x)
-         x = torch.transpose(x, 1, -1)  # [b, h, t]
-         x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
-             x.dtype
-         )
-         x = self.encoder(x * x_mask, x_mask)
-         stats = self.proj(x) * x_mask
-
-         m, logs = torch.split(stats, self.out_channels, dim=1)
-         return m, logs, x_mask
-
-
- class TextEncoder768(nn.Module):
-     def __init__(
-         self,
-         out_channels,
-         hidden_channels,
-         filter_channels,
-         n_heads,
-         n_layers,
-         kernel_size,
-         p_dropout,
-         f0=True,
-     ):
-         super().__init__()
-         self.out_channels = out_channels
-         self.hidden_channels = hidden_channels
-         self.filter_channels = filter_channels
-         self.n_heads = n_heads
-         self.n_layers = n_layers
-         self.kernel_size = kernel_size
-         self.p_dropout = p_dropout
-         self.emb_phone = nn.Linear(768, hidden_channels)
-         self.lrelu = nn.LeakyReLU(0.1, inplace=True)
-         if f0 == True:
-             self.emb_pitch = nn.Embedding(256, hidden_channels)  # pitch 256
-         self.encoder = attentions.Encoder(
-             hidden_channels, filter_channels, n_heads, n_layers, kernel_size, p_dropout
-         )
-         self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
-
-     def forward(self, phone, pitch, lengths):
-         if pitch == None:
-             x = self.emb_phone(phone)
-         else:
-             x = self.emb_phone(phone) + self.emb_pitch(pitch)
-         x = x * math.sqrt(self.hidden_channels)  # [b, t, h]
-         x = self.lrelu(x)
-         x = torch.transpose(x, 1, -1)  # [b, h, t]
-         x_mask = torch.unsqueeze(commons.sequence_mask(lengths, x.size(2)), 1).to(
-             x.dtype
-         )
-         x = self.encoder(x * x_mask, x_mask)
-         stats = self.proj(x) * x_mask
-
-         m, logs = torch.split(stats, self.out_channels, dim=1)
-         return m, logs, x_mask
-
-
- class ResidualCouplingBlock(nn.Module):
-     def __init__(
-         self,
-         channels,
-         hidden_channels,
-         kernel_size,
-         dilation_rate,
-         n_layers,
-         n_flows=4,
-         gin_channels=0,
-     ):
-         super().__init__()
-         self.channels = channels
-         self.hidden_channels = hidden_channels
-         self.kernel_size = kernel_size
-         self.dilation_rate = dilation_rate
-         self.n_layers = n_layers
-         self.n_flows = n_flows
-         self.gin_channels = gin_channels
-
-         self.flows = nn.ModuleList()
-         for i in range(n_flows):
-             self.flows.append(
-                 modules.ResidualCouplingLayer(
-                     channels,
-                     hidden_channels,
-                     kernel_size,
-                     dilation_rate,
-                     n_layers,
-                     gin_channels=gin_channels,
-                     mean_only=True,
-                 )
-             )
-             self.flows.append(modules.Flip())
-
-     def forward(self, x, x_mask, g=None, reverse=False):
-         if not reverse:
-             for flow in self.flows:
-                 x, _ = flow(x, x_mask, g=g, reverse=reverse)
-         else:
-             for flow in reversed(self.flows):
-                 x = flow(x, x_mask, g=g, reverse=reverse)
-         return x
-
-     def remove_weight_norm(self):
-         for i in range(self.n_flows):
-             self.flows[i * 2].remove_weight_norm()
-
-
- class PosteriorEncoder(nn.Module):
-     def __init__(
-         self,
-         in_channels,
-         out_channels,
-         hidden_channels,
-         kernel_size,
-         dilation_rate,
-         n_layers,
-         gin_channels=0,
-     ):
-         super().__init__()
-         self.in_channels = in_channels
-         self.out_channels = out_channels
-         self.hidden_channels = hidden_channels
-         self.kernel_size = kernel_size
-         self.dilation_rate = dilation_rate
-         self.n_layers = n_layers
-         self.gin_channels = gin_channels
-
-         self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
-         self.enc = modules.WN(
-             hidden_channels,
-             kernel_size,
-             dilation_rate,
-             n_layers,
-             gin_channels=gin_channels,
-         )
-         self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
-
-     def forward(self, x, x_lengths, g=None):
-         x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
-             x.dtype
-         )
-         x = self.pre(x) * x_mask
-         x = self.enc(x, x_mask, g=g)
-         stats = self.proj(x) * x_mask
-         m, logs = torch.split(stats, self.out_channels, dim=1)
-         z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
-         return z, m, logs, x_mask
-
-     def remove_weight_norm(self):
-         self.enc.remove_weight_norm()
-
-
- class Generator(torch.nn.Module):
-     def __init__(
-         self,
-         initial_channel,
-         resblock,
-         resblock_kernel_sizes,
-         resblock_dilation_sizes,
-         upsample_rates,
-         upsample_initial_channel,
-         upsample_kernel_sizes,
-         gin_channels=0,
-     ):
-         super(Generator, self).__init__()
-         self.num_kernels = len(resblock_kernel_sizes)
-         self.num_upsamples = len(upsample_rates)
-         self.conv_pre = Conv1d(
-             initial_channel, upsample_initial_channel, 7, 1, padding=3
-         )
-         resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
-
-         self.ups = nn.ModuleList()
-         for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
-             self.ups.append(
-                 weight_norm(
-                     ConvTranspose1d(
-                         upsample_initial_channel // (2**i),
-                         upsample_initial_channel // (2 ** (i + 1)),
-                         k,
-                         u,
-                         padding=(k - u) // 2,
-                     )
-                 )
-             )
-
-         self.resblocks = nn.ModuleList()
-         for i in range(len(self.ups)):
-             ch = upsample_initial_channel // (2 ** (i + 1))
-             for j, (k, d) in enumerate(
-                 zip(resblock_kernel_sizes, resblock_dilation_sizes)
-             ):
-                 self.resblocks.append(resblock(ch, k, d))
-
-         self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
-         self.ups.apply(init_weights)
-
-         if gin_channels != 0:
-             self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
-
-     def forward(self, x, g=None):
-         x = self.conv_pre(x)
-         if g is not None:
-             x = x + self.cond(g)
-
-         for i in range(self.num_upsamples):
-             x = F.leaky_relu(x, modules.LRELU_SLOPE)
-             x = self.ups[i](x)
-             xs = None
-             for j in range(self.num_kernels):
-                 if xs is None:
-                     xs = self.resblocks[i * self.num_kernels + j](x)
-                 else:
-                     xs += self.resblocks[i * self.num_kernels + j](x)
-             x = xs / self.num_kernels
-         x = F.leaky_relu(x)
-         x = self.conv_post(x)
-         x = torch.tanh(x)
-
-         return x
-
-     def remove_weight_norm(self):
-         for l in self.ups:
-             remove_weight_norm(l)
-         for l in self.resblocks:
-             l.remove_weight_norm()
-
-
- class SineGen(torch.nn.Module):
-     """Definition of sine generator
-     SineGen(samp_rate, harmonic_num = 0,
-             sine_amp = 0.1, noise_std = 0.003,
-             voiced_threshold = 0,
-             flag_for_pulse=False)
-     samp_rate: sampling rate in Hz
-     harmonic_num: number of harmonic overtones (default 0)
-     sine_amp: amplitude of sine waveform (default 0.1)
-     noise_std: std of Gaussian noise (default 0.003)
-     voiced_threshold: F0 threshold for U/V classification (default 0)
-     flag_for_pulse: this SineGen is used inside PulseGen (default False)
-     Note: when flag_for_pulse is True, the first time step of a voiced
-     segment is always sin(np.pi) or cos(0)
-     """
-
-     def __init__(
-         self,
-         samp_rate,
-         harmonic_num=0,
-         sine_amp=0.1,
-         noise_std=0.003,
-         voiced_threshold=0,
-         flag_for_pulse=False,
-     ):
-         super(SineGen, self).__init__()
-         self.sine_amp = sine_amp
-         self.noise_std = noise_std
-         self.harmonic_num = harmonic_num
-         self.dim = self.harmonic_num + 1
-         self.sampling_rate = samp_rate
-         self.voiced_threshold = voiced_threshold
-
-     def _f02uv(self, f0):
-         # generate uv signal
-         uv = torch.ones_like(f0)
-         uv = uv * (f0 > self.voiced_threshold)
-         if uv.device.type == "privateuseone":  # for DirectML
-             uv = uv.float()
-         return uv
-
-     def forward(self, f0, upp):
-         """sine_tensor, uv = forward(f0)
-         input F0: tensor(batchsize=1, length, dim=1)
-         f0 for unvoiced steps should be 0
-         output sine_tensor: tensor(batchsize=1, length, dim)
-         output uv: tensor(batchsize=1, length, 1)
-         """
-         with torch.no_grad():
-             f0 = f0[:, None].transpose(1, 2)
-             f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
-             # fundamental component
-             f0_buf[:, :, 0] = f0[:, :, 0]
-             for idx in np.arange(self.harmonic_num):
-                 f0_buf[:, :, idx + 1] = f0_buf[:, :, 0] * (
-                     idx + 2
-                 )  # idx + 2: the (idx+1)-th overtone, (idx+2)-th harmonic
-             rad_values = (f0_buf / self.sampling_rate) % 1  # the % 1 means the product over n_har cannot be optimized in post-processing
-             rand_ini = torch.rand(
-                 f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
-             )
-             rand_ini[:, 0] = 0
-             rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
-             tmp_over_one = torch.cumsum(rad_values, 1)  # % 1  # applying % 1 here would mean the cumsum below could not be optimized further
-             tmp_over_one *= upp
-             tmp_over_one = F.interpolate(
-                 tmp_over_one.transpose(2, 1),
-                 scale_factor=upp,
-                 mode="linear",
-                 align_corners=True,
-             ).transpose(2, 1)
-             rad_values = F.interpolate(
-                 rad_values.transpose(2, 1), scale_factor=upp, mode="nearest"
-             ).transpose(
-                 2, 1
-             )
-             tmp_over_one %= 1
-             tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
-             cumsum_shift = torch.zeros_like(rad_values)
-             cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
-             sine_waves = torch.sin(
-                 torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * np.pi
-             )
-             sine_waves = sine_waves * self.sine_amp
-             uv = self._f02uv(f0)
-             uv = F.interpolate(
-                 uv.transpose(2, 1), scale_factor=upp, mode="nearest"
-             ).transpose(2, 1)
-             noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
-             noise = noise_amp * torch.randn_like(sine_waves)
-             sine_waves = sine_waves * uv + noise
-             return sine_waves, uv, noise
-
-
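A minimal usage sketch for SineGen (not part of the deleted file): it assumes the module above is importable, and the 220 Hz contour and upp value of 400 are illustrative. forward() maps a frame-level F0 tensor of shape (batch, frames) to a sample-level sine excitation of shape (batch, frames * upp, harmonic_num + 1).

    import torch
    gen = SineGen(samp_rate=40000, harmonic_num=0)
    f0 = torch.full((1, 100), 220.0)      # frame-level F0 contour, shape (batch, frames)
    sine, uv, noise = gen(f0, upp=400)    # upp: assumed samples per frame (hop size)
    print(sine.shape, uv.shape)           # torch.Size([1, 40000, 1]) for both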
- class SourceModuleHnNSF(torch.nn.Module):
-     """SourceModule for hn-nsf
-     SourceModule(sampling_rate, harmonic_num=0, sine_amp=0.1,
-                  add_noise_std=0.003, voiced_threshod=0)
-     sampling_rate: sampling rate in Hz
-     harmonic_num: number of harmonics above F0 (default: 0)
-     sine_amp: amplitude of sine source signal (default: 0.1)
-     add_noise_std: std of additive Gaussian noise (default: 0.003)
-         note that amplitude of noise in unvoiced is decided
-         by sine_amp
-     voiced_threshold: threshold to set U/V given F0 (default: 0)
-     Sine_source, noise_source = SourceModuleHnNSF(F0_sampled)
-     F0_sampled (batchsize, length, 1)
-     Sine_source (batchsize, length, 1)
-     noise_source (batchsize, length, 1)
-     uv (batchsize, length, 1)
-     """
-
-     def __init__(
-         self,
-         sampling_rate,
-         harmonic_num=0,
-         sine_amp=0.1,
-         add_noise_std=0.003,
-         voiced_threshod=0,
-         is_half=True,
-     ):
-         super(SourceModuleHnNSF, self).__init__()
-
-         self.sine_amp = sine_amp
-         self.noise_std = add_noise_std
-         self.is_half = is_half
-         # to produce sine waveforms
-         self.l_sin_gen = SineGen(
-             sampling_rate, harmonic_num, sine_amp, add_noise_std, voiced_threshod
-         )
-
-         # to merge source harmonics into a single excitation
-         self.l_linear = torch.nn.Linear(harmonic_num + 1, 1)
-         self.l_tanh = torch.nn.Tanh()
-
-     def forward(self, x, upp=None):
-         if hasattr(self, "ddtype") == False:
-             self.ddtype = self.l_linear.weight.dtype
-         sine_wavs, uv, _ = self.l_sin_gen(x, upp)
-         # print(x.dtype, sine_wavs.dtype, self.l_linear.weight.dtype)
-         # if self.is_half:
-         #     sine_wavs = sine_wavs.half()
-         # sine_merge = self.l_tanh(self.l_linear(sine_wavs.to(x)))
-         # print(sine_wavs.dtype, self.ddtype)
-         if sine_wavs.dtype != self.ddtype:
-             sine_wavs = sine_wavs.to(self.ddtype)
-         sine_merge = self.l_tanh(self.l_linear(sine_wavs))
-         return sine_merge, None, None  # noise, uv
-
-
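SourceModuleHnNSF wraps SineGen and merges its harmonics through a (harmonic_num + 1)-to-1 linear layer plus tanh into the single-channel excitation consumed by GeneratorNSF below. A comparable sketch, again with illustrative values and assuming the module is importable:

    import torch
    src = SourceModuleHnNSF(sampling_rate=40000, harmonic_num=0, is_half=False)
    f0 = torch.full((1, 100), 220.0)      # frame-level F0, shape (batch, frames)
    excitation, _, _ = src(f0, upp=400)   # forward() returns (sine_merge, None, None)
    print(excitation.shape)               # torch.Size([1, 40000, 1])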
- class GeneratorNSF(torch.nn.Module):
-     def __init__(
-         self,
-         initial_channel,
-         resblock,
-         resblock_kernel_sizes,
-         resblock_dilation_sizes,
-         upsample_rates,
-         upsample_initial_channel,
-         upsample_kernel_sizes,
-         gin_channels,
-         sr,
-         is_half=False,
-     ):
-         super(GeneratorNSF, self).__init__()
-         self.num_kernels = len(resblock_kernel_sizes)
-         self.num_upsamples = len(upsample_rates)
-
-         self.f0_upsamp = torch.nn.Upsample(scale_factor=np.prod(upsample_rates))
-         self.m_source = SourceModuleHnNSF(
-             sampling_rate=sr, harmonic_num=0, is_half=is_half
-         )
-         self.noise_convs = nn.ModuleList()
-         self.conv_pre = Conv1d(
-             initial_channel, upsample_initial_channel, 7, 1, padding=3
-         )
-         resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
-
-         self.ups = nn.ModuleList()
-         for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
-             c_cur = upsample_initial_channel // (2 ** (i + 1))
-             self.ups.append(
-                 weight_norm(
-                     ConvTranspose1d(
-                         upsample_initial_channel // (2**i),
-                         upsample_initial_channel // (2 ** (i + 1)),
-                         k,
-                         u,
-                         padding=(k - u) // 2,
-                     )
-                 )
-             )
-             if i + 1 < len(upsample_rates):
-                 stride_f0 = np.prod(upsample_rates[i + 1 :])
-                 self.noise_convs.append(
-                     Conv1d(
-                         1,
-                         c_cur,
-                         kernel_size=stride_f0 * 2,
-                         stride=stride_f0,
-                         padding=stride_f0 // 2,
-                     )
-                 )
-             else:
-                 self.noise_convs.append(Conv1d(1, c_cur, kernel_size=1))
-
-         self.resblocks = nn.ModuleList()
-         for i in range(len(self.ups)):
-             ch = upsample_initial_channel // (2 ** (i + 1))
-             for j, (k, d) in enumerate(
-                 zip(resblock_kernel_sizes, resblock_dilation_sizes)
-             ):
-                 self.resblocks.append(resblock(ch, k, d))
-
-         self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
-         self.ups.apply(init_weights)
-
-         if gin_channels != 0:
-             self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
-
-         self.upp = np.prod(upsample_rates)
-
-     def forward(self, x, f0, g=None):
-         har_source, noi_source, uv = self.m_source(f0, self.upp)
-         har_source = har_source.transpose(1, 2)
-         x = self.conv_pre(x)
-         if g is not None:
-             x = x + self.cond(g)
-
-         for i in range(self.num_upsamples):
-             x = F.leaky_relu(x, modules.LRELU_SLOPE)
-             x = self.ups[i](x)
-             x_source = self.noise_convs[i](har_source)
-             x = x + x_source
-             xs = None
-             for j in range(self.num_kernels):
-                 if xs is None:
-                     xs = self.resblocks[i * self.num_kernels + j](x)
-                 else:
-                     xs += self.resblocks[i * self.num_kernels + j](x)
-             x = xs / self.num_kernels
-         x = F.leaky_relu(x)
-         x = self.conv_post(x)
-         x = torch.tanh(x)
-         return x
-
-     def remove_weight_norm(self):
-         for l in self.ups:
-             remove_weight_norm(l)
-         for l in self.resblocks:
-             l.remove_weight_norm()
-
-
- sr2sr = {
-     "32k": 32000,
-     "40k": 40000,
-     "48k": 48000,
- }
-
-
- class SynthesizerTrnMs256NSFsid(nn.Module):
-     def __init__(
-         self,
-         spec_channels,
-         segment_size,
-         inter_channels,
-         hidden_channels,
-         filter_channels,
-         n_heads,
-         n_layers,
-         kernel_size,
-         p_dropout,
-         resblock,
-         resblock_kernel_sizes,
-         resblock_dilation_sizes,
-         upsample_rates,
-         upsample_initial_channel,
-         upsample_kernel_sizes,
-         spk_embed_dim,
-         gin_channels,
-         sr,
-         **kwargs
-     ):
-         super().__init__()
-         if type(sr) == type("strr"):
-             sr = sr2sr[sr]
-         self.spec_channels = spec_channels
-         self.inter_channels = inter_channels
-         self.hidden_channels = hidden_channels
-         self.filter_channels = filter_channels
-         self.n_heads = n_heads
-         self.n_layers = n_layers
-         self.kernel_size = kernel_size
-         self.p_dropout = p_dropout
-         self.resblock = resblock
-         self.resblock_kernel_sizes = resblock_kernel_sizes
-         self.resblock_dilation_sizes = resblock_dilation_sizes
-         self.upsample_rates = upsample_rates
-         self.upsample_initial_channel = upsample_initial_channel
-         self.upsample_kernel_sizes = upsample_kernel_sizes
-         self.segment_size = segment_size
-         self.gin_channels = gin_channels
-         # self.hop_length = hop_length#
-         self.spk_embed_dim = spk_embed_dim
-         self.enc_p = TextEncoder256(
-             inter_channels,
-             hidden_channels,
-             filter_channels,
-             n_heads,
-             n_layers,
-             kernel_size,
-             p_dropout,
-         )
-         self.dec = GeneratorNSF(
-             inter_channels,
-             resblock,
-             resblock_kernel_sizes,
-             resblock_dilation_sizes,
-             upsample_rates,
-             upsample_initial_channel,
-             upsample_kernel_sizes,
-             gin_channels=gin_channels,
-             sr=sr,
-             is_half=kwargs["is_half"],
-         )
-         self.enc_q = PosteriorEncoder(
-             spec_channels,
-             inter_channels,
-             hidden_channels,
-             5,
-             1,
-             16,
-             gin_channels=gin_channels,
-         )
-         self.flow = ResidualCouplingBlock(
-             inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
-         )
-         self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
-         logger.debug(
-             "gin_channels: "
-             + str(gin_channels)
-             + ", self.spk_embed_dim: "
-             + str(self.spk_embed_dim)
-         )
-
-     def remove_weight_norm(self):
-         self.dec.remove_weight_norm()
-         self.flow.remove_weight_norm()
-         self.enc_q.remove_weight_norm()
-
-     def forward(
-         self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
-     ):  # ds is the speaker id, [bs, 1]
-         # print(1, pitch.shape)  # [bs, t]
-         g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is t, broadcast
-         m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
-         z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
-         z_p = self.flow(z, y_mask, g=g)
-         z_slice, ids_slice = commons.rand_slice_segments(
-             z, y_lengths, self.segment_size
-         )
-         # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
-         pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
-         # print(-2,pitchf.shape,z_slice.shape)
-         o = self.dec(z_slice, pitchf, g=g)
-         return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
-
-     def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None):
-         g = self.emb_g(sid).unsqueeze(-1)
-         m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
-         z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
-         if rate:
-             head = int(z_p.shape[2] * rate)
-             z_p = z_p[:, :, -head:]
-             x_mask = x_mask[:, :, -head:]
-             nsff0 = nsff0[:, -head:]
-         z = self.flow(z_p, x_mask, g=g, reverse=True)
-         o = self.dec(z * x_mask, nsff0, g=g)
-         return o, x_mask, (z, z_p, m_p, logs_p)
-
-
- class SynthesizerTrnMs768NSFsid(nn.Module):
-     def __init__(
-         self,
-         spec_channels,
-         segment_size,
-         inter_channels,
-         hidden_channels,
-         filter_channels,
-         n_heads,
-         n_layers,
-         kernel_size,
-         p_dropout,
-         resblock,
-         resblock_kernel_sizes,
-         resblock_dilation_sizes,
-         upsample_rates,
-         upsample_initial_channel,
-         upsample_kernel_sizes,
-         spk_embed_dim,
-         gin_channels,
-         sr,
-         **kwargs
-     ):
-         super().__init__()
-         if type(sr) == type("strr"):
-             sr = sr2sr[sr]
-         self.spec_channels = spec_channels
-         self.inter_channels = inter_channels
-         self.hidden_channels = hidden_channels
-         self.filter_channels = filter_channels
-         self.n_heads = n_heads
-         self.n_layers = n_layers
-         self.kernel_size = kernel_size
-         self.p_dropout = p_dropout
-         self.resblock = resblock
-         self.resblock_kernel_sizes = resblock_kernel_sizes
-         self.resblock_dilation_sizes = resblock_dilation_sizes
-         self.upsample_rates = upsample_rates
-         self.upsample_initial_channel = upsample_initial_channel
-         self.upsample_kernel_sizes = upsample_kernel_sizes
-         self.segment_size = segment_size
-         self.gin_channels = gin_channels
-         # self.hop_length = hop_length#
-         self.spk_embed_dim = spk_embed_dim
-         self.enc_p = TextEncoder768(
-             inter_channels,
-             hidden_channels,
-             filter_channels,
-             n_heads,
-             n_layers,
-             kernel_size,
-             p_dropout,
-         )
-         self.dec = GeneratorNSF(
-             inter_channels,
-             resblock,
-             resblock_kernel_sizes,
-             resblock_dilation_sizes,
-             upsample_rates,
-             upsample_initial_channel,
-             upsample_kernel_sizes,
-             gin_channels=gin_channels,
-             sr=sr,
-             is_half=kwargs["is_half"],
-         )
-         self.enc_q = PosteriorEncoder(
-             spec_channels,
-             inter_channels,
-             hidden_channels,
-             5,
-             1,
-             16,
-             gin_channels=gin_channels,
-         )
-         self.flow = ResidualCouplingBlock(
-             inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
-         )
-         self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
-         logger.debug(
-             "gin_channels: "
-             + str(gin_channels)
-             + ", self.spk_embed_dim: "
-             + str(self.spk_embed_dim)
-         )
-
-     def remove_weight_norm(self):
-         self.dec.remove_weight_norm()
-         self.flow.remove_weight_norm()
-         self.enc_q.remove_weight_norm()
-
-     def forward(
-         self, phone, phone_lengths, pitch, pitchf, y, y_lengths, ds
-     ):  # ds is the speaker id, [bs, 1]
-         # print(1, pitch.shape)  # [bs, t]
-         g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is t, broadcast
-         m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
-         z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
-         z_p = self.flow(z, y_mask, g=g)
-         z_slice, ids_slice = commons.rand_slice_segments(
-             z, y_lengths, self.segment_size
-         )
-         # print(-1,pitchf.shape,ids_slice,self.segment_size,self.hop_length,self.segment_size//self.hop_length)
-         pitchf = commons.slice_segments2(pitchf, ids_slice, self.segment_size)
-         # print(-2,pitchf.shape,z_slice.shape)
-         o = self.dec(z_slice, pitchf, g=g)
-         return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
-
-     def infer(self, phone, phone_lengths, pitch, nsff0, sid, rate=None):
-         g = self.emb_g(sid).unsqueeze(-1)
-         m_p, logs_p, x_mask = self.enc_p(phone, pitch, phone_lengths)
-         z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
-         if rate:
-             head = int(z_p.shape[2] * rate)
-             z_p = z_p[:, :, -head:]
-             x_mask = x_mask[:, :, -head:]
-             nsff0 = nsff0[:, -head:]
-         z = self.flow(z_p, x_mask, g=g, reverse=True)
-         o = self.dec(z * x_mask, nsff0, g=g)
-         return o, x_mask, (z, z_p, m_p, logs_p)
-
-
- class SynthesizerTrnMs256NSFsid_nono(nn.Module):
-     def __init__(
-         self,
-         spec_channels,
-         segment_size,
-         inter_channels,
-         hidden_channels,
-         filter_channels,
-         n_heads,
-         n_layers,
-         kernel_size,
-         p_dropout,
-         resblock,
-         resblock_kernel_sizes,
-         resblock_dilation_sizes,
-         upsample_rates,
-         upsample_initial_channel,
-         upsample_kernel_sizes,
-         spk_embed_dim,
-         gin_channels,
-         sr=None,
-         **kwargs
-     ):
-         super().__init__()
-         self.spec_channels = spec_channels
-         self.inter_channels = inter_channels
-         self.hidden_channels = hidden_channels
-         self.filter_channels = filter_channels
-         self.n_heads = n_heads
-         self.n_layers = n_layers
-         self.kernel_size = kernel_size
-         self.p_dropout = p_dropout
-         self.resblock = resblock
-         self.resblock_kernel_sizes = resblock_kernel_sizes
-         self.resblock_dilation_sizes = resblock_dilation_sizes
-         self.upsample_rates = upsample_rates
-         self.upsample_initial_channel = upsample_initial_channel
-         self.upsample_kernel_sizes = upsample_kernel_sizes
-         self.segment_size = segment_size
-         self.gin_channels = gin_channels
-         # self.hop_length = hop_length#
-         self.spk_embed_dim = spk_embed_dim
-         self.enc_p = TextEncoder256(
-             inter_channels,
-             hidden_channels,
-             filter_channels,
-             n_heads,
-             n_layers,
-             kernel_size,
-             p_dropout,
-             f0=False,
-         )
-         self.dec = Generator(
-             inter_channels,
-             resblock,
-             resblock_kernel_sizes,
-             resblock_dilation_sizes,
-             upsample_rates,
-             upsample_initial_channel,
-             upsample_kernel_sizes,
-             gin_channels=gin_channels,
-         )
-         self.enc_q = PosteriorEncoder(
-             spec_channels,
-             inter_channels,
-             hidden_channels,
-             5,
-             1,
-             16,
-             gin_channels=gin_channels,
-         )
-         self.flow = ResidualCouplingBlock(
-             inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
-         )
-         self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
-         logger.debug(
-             "gin_channels: "
-             + str(gin_channels)
-             + ", self.spk_embed_dim: "
-             + str(self.spk_embed_dim)
-         )
-
-     def remove_weight_norm(self):
-         self.dec.remove_weight_norm()
-         self.flow.remove_weight_norm()
-         self.enc_q.remove_weight_norm()
-
-     def forward(self, phone, phone_lengths, y, y_lengths, ds):  # ds is the speaker id, [bs, 1]
-         g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is t, broadcast
-         m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
-         z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
-         z_p = self.flow(z, y_mask, g=g)
-         z_slice, ids_slice = commons.rand_slice_segments(
-             z, y_lengths, self.segment_size
-         )
-         o = self.dec(z_slice, g=g)
-         return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
-
-     def infer(self, phone, phone_lengths, sid, rate=None):
-         g = self.emb_g(sid).unsqueeze(-1)
-         m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
-         z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
-         if rate:
-             head = int(z_p.shape[2] * rate)
-             z_p = z_p[:, :, -head:]
-             x_mask = x_mask[:, :, -head:]
-         z = self.flow(z_p, x_mask, g=g, reverse=True)
-         o = self.dec(z * x_mask, g=g)
-         return o, x_mask, (z, z_p, m_p, logs_p)
-
-
- class SynthesizerTrnMs768NSFsid_nono(nn.Module):
-     def __init__(
-         self,
-         spec_channels,
-         segment_size,
-         inter_channels,
-         hidden_channels,
-         filter_channels,
-         n_heads,
-         n_layers,
-         kernel_size,
-         p_dropout,
-         resblock,
-         resblock_kernel_sizes,
-         resblock_dilation_sizes,
-         upsample_rates,
-         upsample_initial_channel,
-         upsample_kernel_sizes,
-         spk_embed_dim,
-         gin_channels,
-         sr=None,
-         **kwargs
-     ):
-         super().__init__()
-         self.spec_channels = spec_channels
-         self.inter_channels = inter_channels
-         self.hidden_channels = hidden_channels
-         self.filter_channels = filter_channels
-         self.n_heads = n_heads
-         self.n_layers = n_layers
-         self.kernel_size = kernel_size
-         self.p_dropout = p_dropout
-         self.resblock = resblock
-         self.resblock_kernel_sizes = resblock_kernel_sizes
-         self.resblock_dilation_sizes = resblock_dilation_sizes
-         self.upsample_rates = upsample_rates
-         self.upsample_initial_channel = upsample_initial_channel
-         self.upsample_kernel_sizes = upsample_kernel_sizes
-         self.segment_size = segment_size
-         self.gin_channels = gin_channels
-         # self.hop_length = hop_length#
-         self.spk_embed_dim = spk_embed_dim
-         self.enc_p = TextEncoder768(
-             inter_channels,
-             hidden_channels,
-             filter_channels,
-             n_heads,
-             n_layers,
-             kernel_size,
-             p_dropout,
-             f0=False,
-         )
-         self.dec = Generator(
-             inter_channels,
-             resblock,
-             resblock_kernel_sizes,
-             resblock_dilation_sizes,
-             upsample_rates,
-             upsample_initial_channel,
-             upsample_kernel_sizes,
-             gin_channels=gin_channels,
-         )
-         self.enc_q = PosteriorEncoder(
-             spec_channels,
-             inter_channels,
-             hidden_channels,
-             5,
-             1,
-             16,
-             gin_channels=gin_channels,
-         )
-         self.flow = ResidualCouplingBlock(
-             inter_channels, hidden_channels, 5, 1, 3, gin_channels=gin_channels
-         )
-         self.emb_g = nn.Embedding(self.spk_embed_dim, gin_channels)
-         logger.debug(
-             "gin_channels: "
-             + str(gin_channels)
-             + ", self.spk_embed_dim: "
-             + str(self.spk_embed_dim)
-         )
-
-     def remove_weight_norm(self):
-         self.dec.remove_weight_norm()
-         self.flow.remove_weight_norm()
-         self.enc_q.remove_weight_norm()
-
-     def forward(self, phone, phone_lengths, y, y_lengths, ds):  # ds is the speaker id, [bs, 1]
-         g = self.emb_g(ds).unsqueeze(-1)  # [b, 256, 1]; the trailing 1 is t, broadcast
-         m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
-         z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
-         z_p = self.flow(z, y_mask, g=g)
-         z_slice, ids_slice = commons.rand_slice_segments(
-             z, y_lengths, self.segment_size
-         )
-         o = self.dec(z_slice, g=g)
-         return o, ids_slice, x_mask, y_mask, (z, z_p, m_p, logs_p, m_q, logs_q)
-
-     def infer(self, phone, phone_lengths, sid, rate=None):
-         g = self.emb_g(sid).unsqueeze(-1)
-         m_p, logs_p, x_mask = self.enc_p(phone, None, phone_lengths)
-         z_p = (m_p + torch.exp(logs_p) * torch.randn_like(m_p) * 0.66666) * x_mask
-         if rate:
-             head = int(z_p.shape[2] * rate)
-             z_p = z_p[:, :, -head:]
-             x_mask = x_mask[:, :, -head:]
-         z = self.flow(z_p, x_mask, g=g, reverse=True)
-         o = self.dec(z * x_mask, g=g)
-         return o, x_mask, (z, z_p, m_p, logs_p)
-
-
- class MultiPeriodDiscriminator(torch.nn.Module):
-     def __init__(self, use_spectral_norm=False):
-         super(MultiPeriodDiscriminator, self).__init__()
-         periods = [2, 3, 5, 7, 11, 17]
-         # periods = [3, 5, 7, 11, 17, 23, 37]
-
-         discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
-         discs = discs + [
-             DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
-         ]
-         self.discriminators = nn.ModuleList(discs)
-
-     def forward(self, y, y_hat):
-         y_d_rs = []
-         y_d_gs = []
-         fmap_rs = []
-         fmap_gs = []
-         for i, d in enumerate(self.discriminators):
-             y_d_r, fmap_r = d(y)
-             y_d_g, fmap_g = d(y_hat)
-             # for j in range(len(fmap_r)):
-             #     print(i, j, y.shape, y_hat.shape, fmap_r[j].shape, fmap_g[j].shape)
-             y_d_rs.append(y_d_r)
-             y_d_gs.append(y_d_g)
-             fmap_rs.append(fmap_r)
-             fmap_gs.append(fmap_g)
-
-         return y_d_rs, y_d_gs, fmap_rs, fmap_gs
-
-
- class MultiPeriodDiscriminatorV2(torch.nn.Module):
-     def __init__(self, use_spectral_norm=False):
-         super(MultiPeriodDiscriminatorV2, self).__init__()
-         # periods = [2, 3, 5, 7, 11, 17]
-         periods = [2, 3, 5, 7, 11, 17, 23, 37]
-
-         discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
-         discs = discs + [
-             DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
-         ]
-         self.discriminators = nn.ModuleList(discs)
-
-     def forward(self, y, y_hat):
-         y_d_rs = []
-         y_d_gs = []
-         fmap_rs = []
-         fmap_gs = []
-         for i, d in enumerate(self.discriminators):
-             y_d_r, fmap_r = d(y)
-             y_d_g, fmap_g = d(y_hat)
-             # for j in range(len(fmap_r)):
-             #     print(i, j, y.shape, y_hat.shape, fmap_r[j].shape, fmap_g[j].shape)
-             y_d_rs.append(y_d_r)
-             y_d_gs.append(y_d_g)
-             fmap_rs.append(fmap_r)
-             fmap_gs.append(fmap_g)
-
-         return y_d_rs, y_d_gs, fmap_rs, fmap_gs
-
-
- class DiscriminatorS(torch.nn.Module):
-     def __init__(self, use_spectral_norm=False):
-         super(DiscriminatorS, self).__init__()
-         norm_f = weight_norm if use_spectral_norm == False else spectral_norm
-         self.convs = nn.ModuleList(
-             [
-                 norm_f(Conv1d(1, 16, 15, 1, padding=7)),
-                 norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
-                 norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
-                 norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
-                 norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
-                 norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
-             ]
-         )
-         self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
-
-     def forward(self, x):
-         fmap = []
-
-         for l in self.convs:
-             x = l(x)
-             x = F.leaky_relu(x, modules.LRELU_SLOPE)
-             fmap.append(x)
-         x = self.conv_post(x)
-         fmap.append(x)
-         x = torch.flatten(x, 1, -1)
-
-         return x, fmap
-
-
- class DiscriminatorP(torch.nn.Module):
-     def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
-         super(DiscriminatorP, self).__init__()
-         self.period = period
-         self.use_spectral_norm = use_spectral_norm
-         norm_f = weight_norm if use_spectral_norm == False else spectral_norm
-         self.convs = nn.ModuleList(
-             [
-                 norm_f(
-                     Conv2d(
-                         1,
-                         32,
-                         (kernel_size, 1),
-                         (stride, 1),
-                         padding=(get_padding(kernel_size, 1), 0),
-                     )
-                 ),
-                 norm_f(
-                     Conv2d(
-                         32,
-                         128,
-                         (kernel_size, 1),
-                         (stride, 1),
-                         padding=(get_padding(kernel_size, 1), 0),
-                     )
-                 ),
-                 norm_f(
-                     Conv2d(
-                         128,
-                         512,
-                         (kernel_size, 1),
-                         (stride, 1),
-                         padding=(get_padding(kernel_size, 1), 0),
-                     )
-                 ),
-                 norm_f(
-                     Conv2d(
-                         512,
-                         1024,
-                         (kernel_size, 1),
-                         (stride, 1),
-                         padding=(get_padding(kernel_size, 1), 0),
-                     )
-                 ),
-                 norm_f(
-                     Conv2d(
-                         1024,
-                         1024,
-                         (kernel_size, 1),
-                         1,
-                         padding=(get_padding(kernel_size, 1), 0),
-                     )
-                 ),
-             ]
-         )
-         self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
-
-     def forward(self, x):
-         fmap = []
-
-         # 1d to 2d
-         b, c, t = x.shape
-         if t % self.period != 0:  # pad first
-             n_pad = self.period - (t % self.period)
-             if has_xpu and x.dtype == torch.bfloat16:
-                 x = F.pad(x.to(dtype=torch.float16), (0, n_pad), "reflect").to(dtype=torch.bfloat16)
-             else:
-                 x = F.pad(x, (0, n_pad), "reflect")
-             t = t + n_pad
-         x = x.view(b, c, t // self.period, self.period)
-
-         for l in self.convs:
-             x = l(x)
-             x = F.leaky_relu(x, modules.LRELU_SLOPE)
-             fmap.append(x)
-         x = self.conv_post(x)
-         fmap.append(x)
-         x = torch.flatten(x, 1, -1)
-
-         return x, fmap
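For context, the synthesizer classes above are what RVC-style voice-conversion checkpoints are typically loaded into. A rough, illustrative sketch of wiring one up for inference; the checkpoint keys ("config", "weight") and the is_half keyword are assumptions about the exporter, not something this file guarantees:

    import torch

    cpt = torch.load("model.pth", map_location="cpu")            # hypothetical checkpoint path
    net_g = SynthesizerTrnMs768NSFsid(*cpt["config"], is_half=False)
    net_g.load_state_dict(cpt["weight"], strict=False)
    net_g.eval()
    net_g.remove_weight_norm()                                   # fuse weight norm before inference
    # infer() expects HuBERT features, quantized pitch, raw F0 and a speaker id:
    # audio, _, _ = net_g.infer(phone, phone_lengths, pitch, nsff0, sid)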