ylacombe committed on
Commit 69e573b · 1 Parent(s): d6e38a6

Create utils/voco_bark.py

Files changed (1)
  1. utils/voco_bark.py +207 -0
utils/voco_bark.py ADDED
@@ -0,0 +1,207 @@
+ from typing import Dict, Optional
+
+ import torch
+
+ from vocos import Vocos
+ from transformers import BarkConfig
+ from transformers.models.bark import BarkSemanticModel, BarkCoarseModel, BarkFineModel
+ from transformers.models.bark.modeling_bark import BarkPreTrainedModel
+ from transformers.models.bark.generation_configuration_bark import (
+     BarkCoarseGenerationConfig,
+     BarkFineGenerationConfig,
+     BarkSemanticGenerationConfig,
+ )
+ from transformers.modeling_utils import get_parameter_device
+ from transformers.utils import is_accelerate_available
+
+ class BarkModel(BarkPreTrainedModel):
+     config_class = BarkConfig
+
+     def __init__(self, config):
+         super().__init__(config)
+
+         self.semantic = BarkSemanticModel(config.semantic_config)
+         self.coarse_acoustics = BarkCoarseModel(config.coarse_acoustics_config)
+         self.fine_acoustics = BarkFineModel(config.fine_acoustics_config)
+
+         # Vocos replaces Bark's default EnCodec decoder as the vocoder
+         self.vocos = Vocos.from_pretrained("hubertsiuzdak/vocos-encodec-24khz-v2")
+
+         self.config = config
+
+     @property
+     def device(self) -> torch.device:
+         """
+         `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
+         device).
+         """
+         # for bark_model, the device must be verified on its sub-models
+         # if it has an _hf_hook, it has been offloaded, so the device has to be found in the hook
+         if not hasattr(self.semantic, "_hf_hook"):
+             return get_parameter_device(self)
+         for module in self.semantic.modules():
+             if (
+                 hasattr(module, "_hf_hook")
+                 and hasattr(module._hf_hook, "execution_device")
+                 and module._hf_hook.execution_device is not None
+             ):
+                 return torch.device(module._hf_hook.execution_device)
+
+     def enable_cpu_offload(self, gpu_id: Optional[int] = 0):
+         r"""
+         Offloads all sub-models to CPU using accelerate, reducing memory usage with a low impact on performance. This
+         method moves one whole sub-model at a time to the GPU when it is used, and the sub-model remains on the GPU
+         until the next sub-model runs.
+
+         Args:
+             gpu_id (`int`, *optional*, defaults to 0):
+                 GPU id on which the sub-models will be loaded and offloaded.
+         """
+         if is_accelerate_available():
+             from accelerate import cpu_offload_with_hook
+         else:
+             raise ImportError("`enable_cpu_offload` requires `accelerate`.")
+
+         device = torch.device(f"cuda:{gpu_id}")
+
+         if self.device.type != "cpu":
+             self.to("cpu")
+             torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
+
+         # this layer is used outside the first forward pass of semantic, so it needs to be loaded before semantic
+         self.semantic.input_embeds_layer, _ = cpu_offload_with_hook(self.semantic.input_embeds_layer, device)
+
+         hook = None
+         for cpu_offloaded_model in [
+             self.semantic,
+             self.coarse_acoustics,
+             self.fine_acoustics,
+         ]:
+             _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
+
+         self.fine_acoustics_hook = hook
+
+         _, hook = cpu_offload_with_hook(self.vocos, device, prev_module_hook=hook)
+
+         # We'll offload the last model manually.
+         self.codec_model_hook = hook
+
+     @torch.no_grad()
+     def generate(
+         self,
+         input_ids: Optional[torch.Tensor] = None,
+         history_prompt: Optional[Dict[str, torch.Tensor]] = None,
+         **kwargs,
+     ) -> torch.Tensor:
+         """
+         Generates audio from an input prompt and an additional optional `Bark` speaker prompt.
+
+         Args:
+             input_ids (`Optional[torch.Tensor]` of shape (batch_size, seq_len), *optional*):
+                 Input ids. Will be truncated up to 256 tokens. Note that the output audios will be as long as the
+                 longest generation among the batch.
+             history_prompt (`Optional[Dict[str, torch.Tensor]]`, *optional*):
+                 Optional `Bark` speaker prompt. Note that for now, this model takes only one speaker prompt per batch.
+             kwargs (*optional*): Remaining dictionary of keyword arguments. Keyword arguments are of two types:
+
+                 - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model.
+                 - With a *semantic_*, *coarse_* or *fine_* prefix, they will be input for the `generate` method of
+                   the semantic, coarse and fine sub-models respectively. They take priority over the keywords
+                   without a prefix.
+
+                 This means you can, for example, specify a generation strategy for all sub-models except one.
+         Returns:
+             torch.Tensor: Output generated audio.
+
+         Example:
+
+         ```python
+         >>> from transformers import AutoProcessor, BarkModel
+
+         >>> processor = AutoProcessor.from_pretrained("suno/bark-small")
+         >>> model = BarkModel.from_pretrained("suno/bark-small")
+
+         >>> # To add a voice preset, you can pass `voice_preset` to `BarkProcessor.__call__(...)`
+         >>> voice_preset = "v2/en_speaker_6"
+
+         >>> inputs = processor("Hello, my dog is cute, I need him in my life", voice_preset=voice_preset)
+
+         >>> audio_array = model.generate(**inputs, semantic_max_new_tokens=100)
+         >>> audio_array = audio_array.cpu().numpy().squeeze()
+         ```
+         """
+         # TODO (joao): workaround until nested generation config is compatible with PreTrainedModel
+         # TODO: dict
+         semantic_generation_config = BarkSemanticGenerationConfig(**self.generation_config.semantic_config)
+         coarse_generation_config = BarkCoarseGenerationConfig(**self.generation_config.coarse_acoustics_config)
+         fine_generation_config = BarkFineGenerationConfig(**self.generation_config.fine_acoustics_config)
+
+         kwargs_semantic = {
+             # if "attention_mask" is set, it should not be passed to CoarseModel and FineModel
+             "attention_mask": kwargs.pop("attention_mask", None)
+         }
+         kwargs_coarse = {}
+         kwargs_fine = {}
+         for key, value in kwargs.items():
+             if key.startswith("semantic_"):
+                 key = key[len("semantic_") :]
+                 kwargs_semantic[key] = value
+             elif key.startswith("coarse_"):
+                 key = key[len("coarse_") :]
+                 kwargs_coarse[key] = value
+             elif key.startswith("fine_"):
+                 key = key[len("fine_") :]
+                 kwargs_fine[key] = value
+             else:
+                 # If the key is already in a specific config, then it's been set with a
+                 # submodule-specific value and we don't override it
+                 if key not in kwargs_semantic:
+                     kwargs_semantic[key] = value
+                 if key not in kwargs_coarse:
+                     kwargs_coarse[key] = value
+                 if key not in kwargs_fine:
+                     kwargs_fine[key] = value
+
+         # 1. Generate from the semantic model
+         semantic_output = self.semantic.generate(
+             input_ids,
+             history_prompt=history_prompt,
+             semantic_generation_config=semantic_generation_config,
+             **kwargs_semantic,
+         )
+
+         # 2. Generate from the coarse model
+         coarse_output = self.coarse_acoustics.generate(
+             semantic_output,
+             history_prompt=history_prompt,
+             semantic_generation_config=semantic_generation_config,
+             coarse_generation_config=coarse_generation_config,
+             codebook_size=self.generation_config.codebook_size,
+             **kwargs_coarse,
+         )
+
+         # 3. "generate" from the fine model
+         output = self.fine_acoustics.generate(
+             coarse_output,
+             history_prompt=history_prompt,
+             semantic_generation_config=semantic_generation_config,
+             coarse_generation_config=coarse_generation_config,
+             fine_generation_config=fine_generation_config,
+             codebook_size=self.generation_config.codebook_size,
+             **kwargs_fine,
+         )
+
187
+
188
+ if getattr(self, "fine_acoustics_hook", None) is not None:
189
+ # Manually offload fine_acoustics to CPU
190
+ # and load codec_model to GPU
191
+ # since bark doesn't use codec_model forward pass
192
+ self.fine_acoustics_hook.offload()
193
+ self.vocos = self.vocos.to(self.device)
194
+
195
+ # 4. Decode the output and generate audio array
196
+ bandwidth_id = torch.tensor([2]).to(self.device)
197
+ # transpose
198
+ value = value.transpose(0,1)
199
+ value = self.vocos.codes_to_features(value)
200
+ value = self.vocos.decode(value, bandwidth_id=bandwidth_id)
201
+
202
+ if getattr(self, "codec_model_hook", None) is not None:
203
+ # Offload codec_model to CPU
204
+ self.vocos.offload()
205
+
206
+
207
+ return audio
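
For reference, a minimal end-to-end sketch of how the class above would be used, assuming the module is importable as `utils.voco_bark` (the file path added in this commit); the checkpoint names mirror the docstring example, and `scipy` is only used here to write the waveform to disk:

```python
import scipy.io.wavfile
from transformers import AutoProcessor

# assumed import path for the Vocos-backed BarkModel defined in this commit
from utils.voco_bark import BarkModel

processor = AutoProcessor.from_pretrained("suno/bark-small")
model = BarkModel.from_pretrained("suno/bark-small")

# optional, requires CUDA and `accelerate`: keep one sub-model at a time on the GPU
# model.enable_cpu_offload()

inputs = processor("Hello, my dog is cute", voice_preset="v2/en_speaker_6")

# prefixed kwargs are routed to the matching sub-model, here the semantic model
audio = model.generate(**inputs, semantic_max_new_tokens=100)

# Bark's generation config reports 24 kHz, matching the Vocos EnCodec checkpoint
sample_rate = model.generation_config.sample_rate
scipy.io.wavfile.write("bark_vocos.wav", rate=sample_rate, data=audio.cpu().numpy().squeeze())
```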