Upload tokenizer

Browse files

Files changed (5) hide show

README.md +199 -0
config.json +151 -0
configuration_emova_speech_tokenizer.py +111 -0
model.safetensors +3 -0
modeling_emova_speech_tokenizer.py +78 -0

README.md ADDED Viewed

	@@ -0,0 +1,199 @@

+---
+library_name: transformers
+tags: []
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+This is the model card of a 🤗 transformers model that has been pushed to the Hub. This model card has been automatically generated.
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]

config.json ADDED Viewed

	@@ -0,0 +1,151 @@

+{
+  "_name_or_path": "hf/",
+  "architectures": [
+    "EMOVASpeechTokenizer"
+  ],
+  "auto_map": {
+    "AutoConfig": [
+      "configuration_emova_speech_tokenizer.EMOVASpeechTokenizerConfig",
+      null
+    ],
+    "AutoModel": [
+      "modeling_emova_speech_tokenizer.EMOVASpeechTokenizer",
+      null
+    ]
+  },
+  "model_type": "EMOVASpeechTokenizer",
+  "s2u_unit_type": "40ms_multilingual_8888",
+  "torch_dtype": "float32",
+  "transformers_version": "4.44.2",
+  "u2s_dim_styles": 256,
+  "u2s_num_styles": 126,
+  "u2s_style2idx": {
+    "gender-female_emotion-angry_speed-fast_pitch-high": 0,
+    "gender-female_emotion-angry_speed-fast_pitch-low": 1,
+    "gender-female_emotion-angry_speed-fast_pitch-normal": 2,
+    "gender-female_emotion-angry_speed-normal_pitch-high": 3,
+    "gender-female_emotion-angry_speed-normal_pitch-low": 4,
+    "gender-female_emotion-angry_speed-normal_pitch-normal": 5,
+    "gender-female_emotion-angry_speed-slow_pitch-high": 6,
+    "gender-female_emotion-angry_speed-slow_pitch-low": 7,
+    "gender-female_emotion-angry_speed-slow_pitch-normal": 8,
+    "gender-female_emotion-disgusted_speed-fast_pitch-high": 9,
+    "gender-female_emotion-disgusted_speed-fast_pitch-low": 10,
+    "gender-female_emotion-disgusted_speed-fast_pitch-normal": 11,
+    "gender-female_emotion-disgusted_speed-normal_pitch-high": 12,
+    "gender-female_emotion-disgusted_speed-normal_pitch-low": 13,
+    "gender-female_emotion-disgusted_speed-normal_pitch-normal": 14,
+    "gender-female_emotion-disgusted_speed-slow_pitch-high": 15,
+    "gender-female_emotion-disgusted_speed-slow_pitch-low": 16,
+    "gender-female_emotion-disgusted_speed-slow_pitch-normal": 17,
+    "gender-female_emotion-fearful_speed-fast_pitch-high": 18,
+    "gender-female_emotion-fearful_speed-fast_pitch-low": 19,
+    "gender-female_emotion-fearful_speed-fast_pitch-normal": 20,
+    "gender-female_emotion-fearful_speed-normal_pitch-high": 21,
+    "gender-female_emotion-fearful_speed-normal_pitch-low": 22,
+    "gender-female_emotion-fearful_speed-normal_pitch-normal": 23,
+    "gender-female_emotion-fearful_speed-slow_pitch-high": 24,
+    "gender-female_emotion-fearful_speed-slow_pitch-low": 25,
+    "gender-female_emotion-fearful_speed-slow_pitch-normal": 26,
+    "gender-female_emotion-happy_speed-fast_pitch-high": 27,
+    "gender-female_emotion-happy_speed-fast_pitch-low": 28,
+    "gender-female_emotion-happy_speed-fast_pitch-normal": 29,
+    "gender-female_emotion-happy_speed-normal_pitch-high": 30,
+    "gender-female_emotion-happy_speed-normal_pitch-low": 31,
+    "gender-female_emotion-happy_speed-normal_pitch-normal": 32,
+    "gender-female_emotion-happy_speed-slow_pitch-high": 33,
+    "gender-female_emotion-happy_speed-slow_pitch-low": 34,
+    "gender-female_emotion-happy_speed-slow_pitch-normal": 35,
+    "gender-female_emotion-neutral_speed-fast_pitch-high": 36,
+    "gender-female_emotion-neutral_speed-fast_pitch-low": 37,
+    "gender-female_emotion-neutral_speed-fast_pitch-normal": 38,
+    "gender-female_emotion-neutral_speed-normal_pitch-high": 39,
+    "gender-female_emotion-neutral_speed-normal_pitch-low": 40,
+    "gender-female_emotion-neutral_speed-normal_pitch-normal": 41,
+    "gender-female_emotion-neutral_speed-slow_pitch-high": 42,
+    "gender-female_emotion-neutral_speed-slow_pitch-low": 43,
+    "gender-female_emotion-neutral_speed-slow_pitch-normal": 44,
+    "gender-female_emotion-sad_speed-fast_pitch-high": 45,
+    "gender-female_emotion-sad_speed-fast_pitch-low": 46,
+    "gender-female_emotion-sad_speed-fast_pitch-normal": 47,
+    "gender-female_emotion-sad_speed-normal_pitch-high": 48,
+    "gender-female_emotion-sad_speed-normal_pitch-low": 49,
+    "gender-female_emotion-sad_speed-normal_pitch-normal": 50,
+    "gender-female_emotion-sad_speed-slow_pitch-high": 51,
+    "gender-female_emotion-sad_speed-slow_pitch-low": 52,
+    "gender-female_emotion-sad_speed-slow_pitch-normal": 53,
+    "gender-female_emotion-surprised_speed-fast_pitch-high": 54,
+    "gender-female_emotion-surprised_speed-fast_pitch-low": 55,
+    "gender-female_emotion-surprised_speed-fast_pitch-normal": 56,
+    "gender-female_emotion-surprised_speed-normal_pitch-high": 57,
+    "gender-female_emotion-surprised_speed-normal_pitch-low": 58,
+    "gender-female_emotion-surprised_speed-normal_pitch-normal": 59,
+    "gender-female_emotion-surprised_speed-slow_pitch-high": 60,
+    "gender-female_emotion-surprised_speed-slow_pitch-low": 61,
+    "gender-female_emotion-surprised_speed-slow_pitch-normal": 62,
+    "gender-male_emotion-angry_speed-fast_pitch-high": 63,
+    "gender-male_emotion-angry_speed-fast_pitch-low": 64,
+    "gender-male_emotion-angry_speed-fast_pitch-normal": 65,
+    "gender-male_emotion-angry_speed-normal_pitch-high": 66,
+    "gender-male_emotion-angry_speed-normal_pitch-low": 67,
+    "gender-male_emotion-angry_speed-normal_pitch-normal": 68,
+    "gender-male_emotion-angry_speed-slow_pitch-high": 69,
+    "gender-male_emotion-angry_speed-slow_pitch-low": 70,
+    "gender-male_emotion-angry_speed-slow_pitch-normal": 71,
+    "gender-male_emotion-disgusted_speed-fast_pitch-high": 72,
+    "gender-male_emotion-disgusted_speed-fast_pitch-low": 73,
+    "gender-male_emotion-disgusted_speed-fast_pitch-normal": 74,
+    "gender-male_emotion-disgusted_speed-normal_pitch-high": 75,
+    "gender-male_emotion-disgusted_speed-normal_pitch-low": 76,
+    "gender-male_emotion-disgusted_speed-normal_pitch-normal": 77,
+    "gender-male_emotion-disgusted_speed-slow_pitch-high": 78,
+    "gender-male_emotion-disgusted_speed-slow_pitch-low": 79,
+    "gender-male_emotion-disgusted_speed-slow_pitch-normal": 80,
+    "gender-male_emotion-fearful_speed-fast_pitch-high": 81,
+    "gender-male_emotion-fearful_speed-fast_pitch-low": 82,
+    "gender-male_emotion-fearful_speed-fast_pitch-normal": 83,
+    "gender-male_emotion-fearful_speed-normal_pitch-high": 84,
+    "gender-male_emotion-fearful_speed-normal_pitch-low": 85,
+    "gender-male_emotion-fearful_speed-normal_pitch-normal": 86,
+    "gender-male_emotion-fearful_speed-slow_pitch-high": 87,
+    "gender-male_emotion-fearful_speed-slow_pitch-low": 88,
+    "gender-male_emotion-fearful_speed-slow_pitch-normal": 89,
+    "gender-male_emotion-happy_speed-fast_pitch-high": 90,
+    "gender-male_emotion-happy_speed-fast_pitch-low": 91,
+    "gender-male_emotion-happy_speed-fast_pitch-normal": 92,
+    "gender-male_emotion-happy_speed-normal_pitch-high": 93,
+    "gender-male_emotion-happy_speed-normal_pitch-low": 94,
+    "gender-male_emotion-happy_speed-normal_pitch-normal": 95,
+    "gender-male_emotion-happy_speed-slow_pitch-high": 96,
+    "gender-male_emotion-happy_speed-slow_pitch-low": 97,
+    "gender-male_emotion-happy_speed-slow_pitch-normal": 98,
+    "gender-male_emotion-neutral_speed-fast_pitch-high": 99,
+    "gender-male_emotion-neutral_speed-fast_pitch-low": 100,
+    "gender-male_emotion-neutral_speed-fast_pitch-normal": 101,
+    "gender-male_emotion-neutral_speed-normal_pitch-high": 102,
+    "gender-male_emotion-neutral_speed-normal_pitch-low": 103,
+    "gender-male_emotion-neutral_speed-normal_pitch-normal": 104,
+    "gender-male_emotion-neutral_speed-slow_pitch-high": 105,
+    "gender-male_emotion-neutral_speed-slow_pitch-low": 106,
+    "gender-male_emotion-neutral_speed-slow_pitch-normal": 107,
+    "gender-male_emotion-sad_speed-fast_pitch-high": 108,
+    "gender-male_emotion-sad_speed-fast_pitch-low": 109,
+    "gender-male_emotion-sad_speed-fast_pitch-normal": 110,
+    "gender-male_emotion-sad_speed-normal_pitch-high": 111,
+    "gender-male_emotion-sad_speed-normal_pitch-low": 112,
+    "gender-male_emotion-sad_speed-normal_pitch-normal": 113,
+    "gender-male_emotion-sad_speed-slow_pitch-high": 114,
+    "gender-male_emotion-sad_speed-slow_pitch-low": 115,
+    "gender-male_emotion-sad_speed-slow_pitch-normal": 116,
+    "gender-male_emotion-surprised_speed-fast_pitch-high": 117,
+    "gender-male_emotion-surprised_speed-fast_pitch-low": 118,
+    "gender-male_emotion-surprised_speed-fast_pitch-normal": 119,
+    "gender-male_emotion-surprised_speed-normal_pitch-high": 120,
+    "gender-male_emotion-surprised_speed-normal_pitch-low": 121,
+    "gender-male_emotion-surprised_speed-normal_pitch-normal": 122,
+    "gender-male_emotion-surprised_speed-slow_pitch-high": 123,
+    "gender-male_emotion-surprised_speed-slow_pitch-low": 124,
+    "gender-male_emotion-surprised_speed-slow_pitch-normal": 125
+  },
+  "u2s_unit_type": "40ms_multilingual_8888_xujing_cosyvoice_FT"
+}

configuration_emova_speech_tokenizer.py ADDED Viewed

	@@ -0,0 +1,111 @@

+# coding=utf-8
+# Copyright 2024 The EMOVA team and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" EMOVASpeechTokenizer model configuration """
+import copy
+from typing import List
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+U2S_STYLES = [
+    'gender-female_emotion-angry_speed-fast_pitch-high', 'gender-female_emotion-angry_speed-fast_pitch-low', 'gender-female_emotion-angry_speed-fast_pitch-normal',
+    'gender-female_emotion-angry_speed-normal_pitch-high', 'gender-female_emotion-angry_speed-normal_pitch-low', 'gender-female_emotion-angry_speed-normal_pitch-normal',
+    'gender-female_emotion-angry_speed-slow_pitch-high', 'gender-female_emotion-angry_speed-slow_pitch-low', 'gender-female_emotion-angry_speed-slow_pitch-normal',
+    'gender-female_emotion-disgusted_speed-fast_pitch-high', 'gender-female_emotion-disgusted_speed-fast_pitch-low', 'gender-female_emotion-disgusted_speed-fast_pitch-normal',
+    'gender-female_emotion-disgusted_speed-normal_pitch-high', 'gender-female_emotion-disgusted_speed-normal_pitch-low', 'gender-female_emotion-disgusted_speed-normal_pitch-normal',
+    'gender-female_emotion-disgusted_speed-slow_pitch-high', 'gender-female_emotion-disgusted_speed-slow_pitch-low', 'gender-female_emotion-disgusted_speed-slow_pitch-normal',
+    'gender-female_emotion-fearful_speed-fast_pitch-high', 'gender-female_emotion-fearful_speed-fast_pitch-low', 'gender-female_emotion-fearful_speed-fast_pitch-normal',
+    'gender-female_emotion-fearful_speed-normal_pitch-high', 'gender-female_emotion-fearful_speed-normal_pitch-low', 'gender-female_emotion-fearful_speed-normal_pitch-normal',
+    'gender-female_emotion-fearful_speed-slow_pitch-high', 'gender-female_emotion-fearful_speed-slow_pitch-low', 'gender-female_emotion-fearful_speed-slow_pitch-normal',
+    'gender-female_emotion-happy_speed-fast_pitch-high', 'gender-female_emotion-happy_speed-fast_pitch-low', 'gender-female_emotion-happy_speed-fast_pitch-normal',
+    'gender-female_emotion-happy_speed-normal_pitch-high', 'gender-female_emotion-happy_speed-normal_pitch-low', 'gender-female_emotion-happy_speed-normal_pitch-normal',
+    'gender-female_emotion-happy_speed-slow_pitch-high', 'gender-female_emotion-happy_speed-slow_pitch-low', 'gender-female_emotion-happy_speed-slow_pitch-normal',
+    'gender-female_emotion-neutral_speed-fast_pitch-high', 'gender-female_emotion-neutral_speed-fast_pitch-low', 'gender-female_emotion-neutral_speed-fast_pitch-normal',
+    'gender-female_emotion-neutral_speed-normal_pitch-high', 'gender-female_emotion-neutral_speed-normal_pitch-low', 'gender-female_emotion-neutral_speed-normal_pitch-normal',
+    'gender-female_emotion-neutral_speed-slow_pitch-high', 'gender-female_emotion-neutral_speed-slow_pitch-low', 'gender-female_emotion-neutral_speed-slow_pitch-normal',
+    'gender-female_emotion-sad_speed-fast_pitch-high', 'gender-female_emotion-sad_speed-fast_pitch-low', 'gender-female_emotion-sad_speed-fast_pitch-normal',
+    'gender-female_emotion-sad_speed-normal_pitch-high', 'gender-female_emotion-sad_speed-normal_pitch-low', 'gender-female_emotion-sad_speed-normal_pitch-normal',
+    'gender-female_emotion-sad_speed-slow_pitch-high', 'gender-female_emotion-sad_speed-slow_pitch-low', 'gender-female_emotion-sad_speed-slow_pitch-normal',
+    'gender-female_emotion-surprised_speed-fast_pitch-high', 'gender-female_emotion-surprised_speed-fast_pitch-low', 'gender-female_emotion-surprised_speed-fast_pitch-normal',
+    'gender-female_emotion-surprised_speed-normal_pitch-high', 'gender-female_emotion-surprised_speed-normal_pitch-low', 'gender-female_emotion-surprised_speed-normal_pitch-normal',
+    'gender-female_emotion-surprised_speed-slow_pitch-high', 'gender-female_emotion-surprised_speed-slow_pitch-low', 'gender-female_emotion-surprised_speed-slow_pitch-normal',
+    'gender-male_emotion-angry_speed-fast_pitch-high', 'gender-male_emotion-angry_speed-fast_pitch-low', 'gender-male_emotion-angry_speed-fast_pitch-normal',
+    'gender-male_emotion-angry_speed-normal_pitch-high', 'gender-male_emotion-angry_speed-normal_pitch-low', 'gender-male_emotion-angry_speed-normal_pitch-normal',
+    'gender-male_emotion-angry_speed-slow_pitch-high', 'gender-male_emotion-angry_speed-slow_pitch-low', 'gender-male_emotion-angry_speed-slow_pitch-normal',
+    'gender-male_emotion-disgusted_speed-fast_pitch-high', 'gender-male_emotion-disgusted_speed-fast_pitch-low', 'gender-male_emotion-disgusted_speed-fast_pitch-normal',
+    'gender-male_emotion-disgusted_speed-normal_pitch-high', 'gender-male_emotion-disgusted_speed-normal_pitch-low', 'gender-male_emotion-disgusted_speed-normal_pitch-normal',
+    'gender-male_emotion-disgusted_speed-slow_pitch-high', 'gender-male_emotion-disgusted_speed-slow_pitch-low', 'gender-male_emotion-disgusted_speed-slow_pitch-normal',
+    'gender-male_emotion-fearful_speed-fast_pitch-high', 'gender-male_emotion-fearful_speed-fast_pitch-low', 'gender-male_emotion-fearful_speed-fast_pitch-normal',
+    'gender-male_emotion-fearful_speed-normal_pitch-high', 'gender-male_emotion-fearful_speed-normal_pitch-low', 'gender-male_emotion-fearful_speed-normal_pitch-normal',
+    'gender-male_emotion-fearful_speed-slow_pitch-high', 'gender-male_emotion-fearful_speed-slow_pitch-low', 'gender-male_emotion-fearful_speed-slow_pitch-normal',
+    'gender-male_emotion-happy_speed-fast_pitch-high', 'gender-male_emotion-happy_speed-fast_pitch-low', 'gender-male_emotion-happy_speed-fast_pitch-normal',
+    'gender-male_emotion-happy_speed-normal_pitch-high', 'gender-male_emotion-happy_speed-normal_pitch-low', 'gender-male_emotion-happy_speed-normal_pitch-normal',
+    'gender-male_emotion-happy_speed-slow_pitch-high', 'gender-male_emotion-happy_speed-slow_pitch-low', 'gender-male_emotion-happy_speed-slow_pitch-normal',
+    'gender-male_emotion-neutral_speed-fast_pitch-high', 'gender-male_emotion-neutral_speed-fast_pitch-low', 'gender-male_emotion-neutral_speed-fast_pitch-normal',
+    'gender-male_emotion-neutral_speed-normal_pitch-high', 'gender-male_emotion-neutral_speed-normal_pitch-low', 'gender-male_emotion-neutral_speed-normal_pitch-normal',
+    'gender-male_emotion-neutral_speed-slow_pitch-high', 'gender-male_emotion-neutral_speed-slow_pitch-low', 'gender-male_emotion-neutral_speed-slow_pitch-normal',
+    'gender-male_emotion-sad_speed-fast_pitch-high', 'gender-male_emotion-sad_speed-fast_pitch-low', 'gender-male_emotion-sad_speed-fast_pitch-normal',
+    'gender-male_emotion-sad_speed-normal_pitch-high', 'gender-male_emotion-sad_speed-normal_pitch-low', 'gender-male_emotion-sad_speed-normal_pitch-normal',
+    'gender-male_emotion-sad_speed-slow_pitch-high', 'gender-male_emotion-sad_speed-slow_pitch-low', 'gender-male_emotion-sad_speed-slow_pitch-normal',
+    'gender-male_emotion-surprised_speed-fast_pitch-high', 'gender-male_emotion-surprised_speed-fast_pitch-low', 'gender-male_emotion-surprised_speed-fast_pitch-normal',
+    'gender-male_emotion-surprised_speed-normal_pitch-high', 'gender-male_emotion-surprised_speed-normal_pitch-low', 'gender-male_emotion-surprised_speed-normal_pitch-normal',
+    'gender-male_emotion-surprised_speed-slow_pitch-high', 'gender-male_emotion-surprised_speed-slow_pitch-low', 'gender-male_emotion-surprised_speed-slow_pitch-normal'
+]
+class EMOVASpeechTokenizerConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`EMOVASpeechTokenizer`]. It is used to instantiate
+    a EMOVASpeechTokenizer model especially designed for training the EMOVA (https://arxiv.org/abs/2409.18042)
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a configuration to the speech tokenizer model presented in EMOVA paper.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        s2u_unit_type (`str`, defaults to `40ms_multilingual_8888`):
+            Unit type to specify model configurations for the speech-to-unit (S2U) encoder. Detailed configs will be found accordingly.
+        u2s_unit_type (`str`, defaults to `40ms_multilingual_8888_xujing_cosyvoice_FT`):
+            Unit type to specify model configurations for the unit-to-speech (U2S) decoder. Detailed configs will be found accordingly.
+        u2s_num_styles, u2s_dim_styles (`int`, defaults to 126 and 256):
+            Size of the style embedding matrix.
+    ```python
+    >>> from transformers import EMOVASpeechTokenizerConfig, EMOVASpeechTokenizer
+    >>> # Initializing a EMOVA speech tokenizer configuration
+    >>> configuration = EMOVASpeechTokenizerConfig()
+    >>> # Initializing a model from the EMOVA speech tokenizer configuration
+    >>> model = EMOVASpeechTokenizer(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+    model_type = "EMOVASpeechTokenizer"
+    def __init__(
+        self,
+        s2u_unit_type="40ms_multilingual_8888",
+        u2s_unit_type="40ms_multilingual_8888_xujing_cosyvoice_FT",
+        u2s_num_styles=126,
+        u2s_dim_styles=256,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.s2u_unit_type = s2u_unit_type
+        self.u2s_unit_type = u2s_unit_type
+        self.u2s_num_styles = u2s_num_styles
+        self.u2s_dim_styles = u2s_dim_styles
+        self.u2s_style2idx = {each:i for i, each in enumerate(U2S_STYLES)}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0f01ab4b6cbe65d477f5d57e704f1324104f5437b1517c8938a194be2d98a7a6
+size 546644908

modeling_emova_speech_tokenizer.py ADDED Viewed

	@@ -0,0 +1,78 @@

+# coding=utf-8
+# Copyright 2024 The EMOVA team and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" EMOVASpeechTokenizer model """
+import math
+from typing import Optional, Tuple, Union
+import torch
+import torch.nn as nn
+from transformers.modeling_utils import PreTrainedModel
+try:
+    from emova_speech_tokenizer.speech_utils import get_S2U_ckpt_config_path, load_config, VQCTCFinetuneModel, s2u_extract_unit_demo
+    from emova_speech_tokenizer.speech_utils import get_U2S_config_checkpoint_file, load_U2S_config, SynthesizerTrn, synthesis
+except:
+    raise ImportError('Dependencies of emova speech tokenizer are not installed properly. Check https://huggingface.co/Emova-ollm/emova_speech_tokenizer#install for detailed instructions.')
+from .configuration_emova_speech_tokenizer import EMOVASpeechTokenizerConfig
+class EMOVASpeechTokenizer(PreTrainedModel):
+    config_class = EMOVASpeechTokenizerConfig
+    base_model_prefix = "emova_speech_tokenizer"
+    def __init__(self, config: EMOVASpeechTokenizerConfig):
+        super().__init__(config)
+        self.config = config
+        # s2u encoder configs
+        _, S2U_config_path = get_S2U_ckpt_config_path(config.s2u_unit_type)
+        s2u_cfg = load_config(config=S2U_config_path)
+        s2u_cfg.model.pretrain_chkpt_path = None
+        # u2s decoder configs
+        U2S_config_file, _ = get_U2S_config_checkpoint_file(config.u2s_unit_type)
+        u2s_cfg = load_U2S_config(U2S_config_file)
+        # construct models
+        self.s2u_config = s2u_cfg.model
+        self.u2s_config = u2s_cfg
+        self.encoder = VQCTCFinetuneModel(s2u_cfg.model, trainer=None)
+        self.decoder = SynthesizerTrn(
+            u2s_cfg.num_symbols,
+            u2s_cfg.data.filter_length // 2 + 1,
+            u2s_cfg.train.segment_size // u2s_cfg.data.hop_length,
+            n_speakers=u2s_cfg.data.n_speakers,
+            **u2s_cfg.model
+        )
+        self.style_embedding = nn.Embedding(config.u2s_num_styles, config.u2s_dim_styles)
+    @property
+    def device(self):
+        return next(self.encoder.parameters()).device
+    @property
+    def dtype(self):
+        return next(self.encoder.parameters()).dtype
+    def encode(self, wav_file):
+        speech_unit = s2u_extract_unit_demo(self.encoder, wav_file, model_name='SPIRAL-FSQ-CTC', reduced=True)
+        return speech_unit
+    def decode(self, speech_unit, condition=None, output_wav_file='output.wav'):
+        content_unit = speech_unit.replace('<|speech_', '').replace('|>', ' ').strip()
+        style_centroid_embedding = self.style_embedding(torch.LongTensor([self.config.u2s_style2idx[condition]]).to(self.device)).unsqueeze(-1) if condition else None
+        audio = synthesis(content_unit, style_centroid_embedding, self.u2s_config, self.decoder, output_wav_file)
+        return audio