ylacombe committed on
Commit 69e573b · 1 Parent(s): d6e38a6

Create utils/voco_bark.py

Files changed (1)
  1. utils/voco_bark.py +207 -0
utils/voco_bark.py ADDED
@@ -0,0 +1,207 @@
+ from typing import Dict, Optional
+
+ import torch
+
+ from vocos import Vocos
+ from transformers import BarkConfig
+ from transformers.models.bark import BarkSemanticModel, BarkCoarseModel, BarkFineModel
+ from transformers.models.bark.modeling_bark import BarkPreTrainedModel
+ from transformers.models.bark.generation_configuration_bark import (
+     BarkCoarseGenerationConfig,
+     BarkFineGenerationConfig,
+     BarkSemanticGenerationConfig,
+ )
+ from transformers.modeling_utils import get_parameter_device
+ from transformers.utils import is_accelerate_available
+
+ class BarkModel(BarkPreTrainedModel):
+     config_class = BarkConfig
+
+     def __init__(self, config):
+         super().__init__(config)
+
+         self.semantic = BarkSemanticModel(config.semantic_config)
+         self.coarse_acoustics = BarkCoarseModel(config.coarse_acoustics_config)
+         self.fine_acoustics = BarkFineModel(config.fine_acoustics_config)
+
+         # Vocos replaces Bark's default EnCodec decoder as the vocoder
+         self.vocos = Vocos.from_pretrained("hubertsiuzdak/vocos-encodec-24khz-v2")
+
+         self.config = config
+
+     @property
+     def device(self) -> torch.device:
+         """
+         `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
+         device).
+         """
+         # for bark_model, the device must be verified on its sub-models
+         # if it has an _hf_hook, it has been offloaded, so the device has to be found in the hook
+         if not hasattr(self.semantic, "_hf_hook"):
+             return get_parameter_device(self)
+         for module in self.semantic.modules():
+             if (
+                 hasattr(module, "_hf_hook")
+                 and hasattr(module._hf_hook, "execution_device")
+                 and module._hf_hook.execution_device is not None
+             ):
+                 return torch.device(module._hf_hook.execution_device)
+
+     def enable_cpu_offload(self, gpu_id: Optional[int] = 0):
+         r"""
+         Offloads all sub-models to CPU using accelerate, reducing memory usage with a low impact on performance. This
+         method moves one whole sub-model at a time to the GPU when it is used, and the sub-model remains on the GPU
+         until the next sub-model runs.
+
+         Args:
+             gpu_id (`int`, *optional*, defaults to 0):
+                 GPU id on which the sub-models will be loaded and offloaded.
+         """
+         if is_accelerate_available():
+             from accelerate import cpu_offload_with_hook
+         else:
+             raise ImportError("`enable_cpu_offload` requires `accelerate`.")
+
+         device = torch.device(f"cuda:{gpu_id}")
+
+         if self.device.type != "cpu":
+             self.to("cpu")
+             torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)
+
+         # this layer is used outside the first forward pass of semantic, so it needs to be loaded before semantic
+         self.semantic.input_embeds_layer, _ = cpu_offload_with_hook(self.semantic.input_embeds_layer, device)
+
+         hook = None
+         for cpu_offloaded_model in [
+             self.semantic,
+             self.coarse_acoustics,
+             self.fine_acoustics,
+         ]:
+             _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook)
+
+         self.fine_acoustics_hook = hook
+
+         _, hook = cpu_offload_with_hook(self.vocos, device, prev_module_hook=hook)
+
+         # We'll offload the last model manually.
+         self.codec_model_hook = hook
+
+     @torch.no_grad()
+     def generate(
+         self,
+         input_ids: Optional[torch.Tensor] = None,
+         history_prompt: Optional[Dict[str, torch.Tensor]] = None,
+         **kwargs,
+     ) -> torch.Tensor:
+         """
+         Generates audio from an input prompt and an additional optional `Bark` speaker prompt.
+
+         Args:
+             input_ids (`Optional[torch.Tensor]` of shape (batch_size, seq_len), *optional*):
+                 Input ids. Will be truncated up to 256 tokens. Note that the output audios will be as long as the
+                 longest generation among the batch.
+             history_prompt (`Optional[Dict[str, torch.Tensor]]`, *optional*):
+                 Optional `Bark` speaker prompt. Note that for now, this model takes only one speaker prompt per batch.
+             kwargs (*optional*): Remaining dictionary of keyword arguments. Keyword arguments are of two types:
+
+                 - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model.
+                 - With a *semantic_*, *coarse_* or *fine_* prefix, they will be input for the `generate` method of
+                   the semantic, coarse and fine sub-models respectively. They take priority over the keywords
+                   without a prefix.
+
+                 This means you can, for example, specify a generation strategy for all sub-models except one.
+         Returns:
+             torch.Tensor: Output generated audio.
+
+         Example:
+
+         ```python
+         >>> from transformers import AutoProcessor, BarkModel
+
+         >>> processor = AutoProcessor.from_pretrained("suno/bark-small")
+         >>> model = BarkModel.from_pretrained("suno/bark-small")
+
+         >>> # To add a voice preset, you can pass `voice_preset` to `BarkProcessor.__call__(...)`
+         >>> voice_preset = "v2/en_speaker_6"
+
+         >>> inputs = processor("Hello, my dog is cute, I need him in my life", voice_preset=voice_preset)
+
+         >>> audio_array = model.generate(**inputs, semantic_max_new_tokens=100)
+         >>> audio_array = audio_array.cpu().numpy().squeeze()
+         ```
+         """
+         # TODO (joao): workaround until nested generation config is compatible with PreTrainedModel
+         # TODO: dict
+         semantic_generation_config = BarkSemanticGenerationConfig(**self.generation_config.semantic_config)
+         coarse_generation_config = BarkCoarseGenerationConfig(**self.generation_config.coarse_acoustics_config)
+         fine_generation_config = BarkFineGenerationConfig(**self.generation_config.fine_acoustics_config)
+
+         kwargs_semantic = {
+             # if "attention_mask" is set, it should not be passed to CoarseModel and FineModel
+             "attention_mask": kwargs.pop("attention_mask", None)
+         }
+         kwargs_coarse = {}
+         kwargs_fine = {}
+         for key, value in kwargs.items():
+             if key.startswith("semantic_"):
+                 key = key[len("semantic_") :]
+                 kwargs_semantic[key] = value
+             elif key.startswith("coarse_"):
+                 key = key[len("coarse_") :]
+                 kwargs_coarse[key] = value
+             elif key.startswith("fine_"):
+                 key = key[len("fine_") :]
+                 kwargs_fine[key] = value
+             else:
+                 # If the key is already in a specific config, then it's been set with a
+                 # submodule-specific value and we don't override it
+                 if key not in kwargs_semantic:
+                     kwargs_semantic[key] = value
+                 if key not in kwargs_coarse:
+                     kwargs_coarse[key] = value
+                 if key not in kwargs_fine:
+                     kwargs_fine[key] = value
+
+         # 1. Generate from the semantic model
+         semantic_output = self.semantic.generate(
+             input_ids,
+             history_prompt=history_prompt,
+             semantic_generation_config=semantic_generation_config,
+             **kwargs_semantic,
+         )
+
+         # 2. Generate from the coarse model
+         coarse_output = self.coarse_acoustics.generate(
+             semantic_output,
+             history_prompt=history_prompt,
+             semantic_generation_config=semantic_generation_config,
+             coarse_generation_config=coarse_generation_config,
+             codebook_size=self.generation_config.codebook_size,
+             **kwargs_coarse,
+         )
+
+         # 3. "generate" from the fine model
+         output = self.fine_acoustics.generate(
+             coarse_output,
+             history_prompt=history_prompt,
+             semantic_generation_config=semantic_generation_config,
+             coarse_generation_config=coarse_generation_config,
+             fine_generation_config=fine_generation_config,
+             codebook_size=self.generation_config.codebook_size,
+             **kwargs_fine,
+         )
+
187
+
188
+ if getattr(self, "fine_acoustics_hook", None) is not None:
189
+ # Manually offload fine_acoustics to CPU
190
+ # and load codec_model to GPU
191
+ # since bark doesn't use codec_model forward pass
192
+ self.fine_acoustics_hook.offload()
193
+ self.vocos = self.vocos.to(self.device)
194
+
195
+ # 4. Decode the output and generate audio array
196
+ bandwidth_id = torch.tensor([2]).to(self.device)
197
+ # transpose
198
+ value = value.transpose(0,1)
199
+ value = self.vocos.codes_to_features(value)
200
+ value = self.vocos.decode(value, bandwidth_id=bandwidth_id)
201
+
202
+ if getattr(self, "codec_model_hook", None) is not None:
203
+ # Offload codec_model to CPU
204
+ self.vocos.offload()
205
+
206
+
207
+ return audio
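
For reference, a minimal end-to-end sketch of how the class above would be used, assuming the module is importable as `utils.voco_bark` (the file path added in this commit); the checkpoint names mirror the docstring example, and `scipy` is only used here to write the waveform to disk:

```python
import scipy.io.wavfile
from transformers import AutoProcessor

# assumed import path for the Vocos-backed BarkModel defined in this commit
from utils.voco_bark import BarkModel

processor = AutoProcessor.from_pretrained("suno/bark-small")
model = BarkModel.from_pretrained("suno/bark-small")

# optional, requires CUDA and `accelerate`: keep one sub-model at a time on the GPU
# model.enable_cpu_offload()

inputs = processor("Hello, my dog is cute", voice_preset="v2/en_speaker_6")

# prefixed kwargs are routed to the matching sub-model, here the semantic model
audio = model.generate(**inputs, semantic_max_new_tokens=100)

# Bark's generation config reports 24 kHz, matching the Vocos EnCodec checkpoint
sample_rate = model.generation_config.sample_rate
scipy.io.wavfile.write("bark_vocos.wav", rate=sample_rate, data=audio.cpu().numpy().squeeze())
```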