|
--- |
|
base_model: unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit |
|
tags: |
|
- text-generation-inference |
|
- transformers |
|
- unsloth |
|
- llama |
|
- trl |
|
license: apache-2.0 |
|
language: |
|
- en |
|
--- |
|
|
|
# Uploaded Model |
|
|
|
- **Developed by:** AquaLabs |
|
- **License:** apache-2.0 |
|
- **Finetuned from model:** unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit
|
|
|
This Llama model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Hugging Face's TRL library.
|
|
|
# Installation

First, install the requirements:
|
```py |
|
import os

# Meant to be run in a notebook: the `!pip` syntax requires IPython/Colab.
# On Colab, install pinned dependencies first, then Unsloth itself without
# dependencies to avoid version conflicts; elsewhere a plain install works.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

# SNAC is needed to decode the generated audio codes into a waveform.
!pip install snac
|
``` |
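
After installation, a quick sanity check (a minimal sketch, not part of the original card) can confirm that the imports resolve and a GPU is visible; note that Unsloth itself expects a CUDA GPU:

```py
# Minimal environment check: assumes the packages above installed cleanly.
import torch
from unsloth import FastLanguageModel  # import check only
from snac import SNAC  # import check only

print("torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
```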
|
|
|
# Pipeline |
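
The pipeline below wraps the text prompt in Orpheus's special tokens, generates audio codes with the fine-tuned model, and decodes them into a 24 kHz waveform with SNAC: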
|
```py |
|
import torch
from unsloth import FastLanguageModel
from snac import SNAC
from IPython.display import Audio, display
import numpy as np

# Special token IDs appended to the base Llama 3 tokenizer vocabulary.
TOKENISER_LENGTH = 128256
START_OF_TEXT = 128000
END_OF_TEXT = 128009
START_OF_HUMAN = TOKENISER_LENGTH + 3
END_OF_HUMAN = TOKENISER_LENGTH + 4
START_OF_AI = TOKENISER_LENGTH + 5
END_OF_AI = TOKENISER_LENGTH + 6

# Tokens that frame the prompt and delimit the generated audio-code stream.
GEN_START_TOKEN = 128259       # prepended before the text prompt
GEN_EOS_TOKEN = 128258         # ends generation
GEN_END_EXTRA_TOKEN = 128260   # appended after the prompt, together with END_OF_TEXT
GEN_REMOVE_TOKEN = 128258      # same ID as GEN_EOS_TOKEN; stripped before decoding
CODE_OFFSET = 128266           # first audio-code ID; subtracted to recover raw SNAC codes
|
|
|
def load_models(HF_TOKEN):
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="AquaLabs/Orpheus-3B-0.1-ft-Elise",
        max_seq_length=2048,
        token=HF_TOKEN
    )
    FastLanguageModel.for_inference(model)

    # SNAC decodes the generated audio codes back into a 24 kHz waveform.
    snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz", token=HF_TOKEN)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    snac_model = snac_model.to(device).eval()

    return model, tokenizer, snac_model, device
|
|
|
def redistribute_codes(code_list, snac_model, device):
    # Each frame is 7 codes, interleaved across SNAC's three codebook
    # layers (1 code to layer 1, 2 to layer 2, 4 to layer 3). Position k
    # within a frame is stored with an offset of k * 4096, undone here.
    layer_1, layer_2, layer_3 = [], [], []
    num_groups = len(code_list) // 7
    for i in range(num_groups):
        group = code_list[7 * i: 7 * i + 7]
        layer_1.append(group[0])
        layer_2.append(group[1] - 4096)
        layer_3.append(group[2] - (2 * 4096))
        layer_3.append(group[3] - (3 * 4096))
        layer_2.append(group[4] - (4 * 4096))
        layer_3.append(group[5] - (5 * 4096))
        layer_3.append(group[6] - (6 * 4096))

    codes = [
        torch.tensor(layer_1).unsqueeze(0).to(device),
        torch.tensor(layer_2).unsqueeze(0).to(device),
        torch.tensor(layer_3).unsqueeze(0).to(device)
    ]

    audio_waveform = snac_model.decode(codes)
    return audio_waveform
|
|
|
def tts_pipeline(prompt, model, tokenizer, snac_model, device):
    input_ids_tensor = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    # Wrap the prompt: GEN_START_TOKEN ... END_OF_TEXT GEN_END_EXTRA_TOKEN
    start_token = torch.tensor([[GEN_START_TOKEN]], dtype=torch.int64, device=device)
    end_tokens = torch.tensor([[END_OF_TEXT, GEN_END_EXTRA_TOKEN]], dtype=torch.int64, device=device)
    modified_input_ids = torch.cat([start_token, input_ids_tensor, end_tokens], dim=1)

    attention_mask = torch.ones_like(modified_input_ids, device=device)

    generated_ids = model.generate(
        input_ids=modified_input_ids,
        attention_mask=attention_mask,
        max_new_tokens=1200,
        do_sample=True,
        temperature=0.6,
        top_p=0.95,
        repetition_penalty=1.1,
        num_return_sequences=1,
        eos_token_id=GEN_EOS_TOKEN,
        use_cache=True
    )

    # Token 128257 marks the start of the audio-code stream; keep only
    # the tokens after its last occurrence.
    marker_token = 128257
    token_indices = (generated_ids == marker_token).nonzero(as_tuple=True)
    if len(token_indices[1]) > 0:
        last_marker = token_indices[1][-1].item()
        cropped_tensor = generated_ids[:, last_marker + 1:]
    else:
        cropped_tensor = generated_ids

    # Drop the end-of-speech token and trim to a multiple of 7 so the
    # codes split cleanly into SNAC frames.
    processed_tokens = cropped_tensor[cropped_tensor != GEN_REMOVE_TOKEN]
    row_length = processed_tokens.size(0)
    new_length = (row_length // 7) * 7
    trimmed_tokens = processed_tokens[:new_length]

    # Map token IDs back to raw SNAC codes.
    code_list = (trimmed_tokens - CODE_OFFSET).tolist()

    audio_waveform = redistribute_codes(code_list, snac_model, device)
    return audio_waveform
|
|
|
if __name__ == "__main__":
    HF_TOKEN = "YOUR_TOKEN"
    model, tokenizer, snac_model, device = load_models(HF_TOKEN)

    prompt = "In the image, there are two men riding bikes."
    audio_output = tts_pipeline(prompt, model, tokenizer, snac_model, device)

    # SNAC returns a (1, 1, T) tensor; squeeze it to 1D for playback.
    audio_array = audio_output.detach().cpu().numpy()
    audio_array = np.squeeze(audio_array)

    if audio_array.ndim not in [1, 2]:
        raise ValueError(f"Audio array must be 1D or 2D, but got shape: {audio_array.shape}")

    display(Audio(audio_array, rate=24000))
    print("Audio generation complete.")
|
``` |
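
`IPython.display.Audio` only works in a notebook; writing the waveform to a WAV file is one alternative. A minimal sketch, assuming SciPy is available (it is not installed by the steps above) and that `audio_array` is the 1D float array produced by the pipeline:

```py
# Hypothetical follow-up: save the generated 24 kHz waveform to disk.
# Assumes SciPy is installed and `audio_array` holds floats in [-1, 1].
import numpy as np
from scipy.io import wavfile

pcm = (np.clip(audio_array, -1.0, 1.0) * 32767).astype(np.int16)
wavfile.write("output.wav", 24000, pcm)
```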
|
|
|
## Contributors |
|
|
|
- Ahmet Erdem Pamuk - [GitHub](https://github.com/ahmeterdempmk) | [Hugging Face](https://huggingface.co/ahmeterdempmk) |
|
- Emir Kaan Özdemir - [GitHub](https://github.com/emirkaanozdemr) | [Hugging Face](https://huggingface.co/emirkaanozdemr) |
|
- Şuayp Talha Kocabay - [GitHub](https://github.com/suayptalha) | [Hugging Face](https://huggingface.co/suayptalha) |
|
|
|
Details are provided in the [paper](). |