---
base_model: unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit
tags:
- text-generation-inference
- transformers
- unsloth
- llama
- trl
license: apache-2.0
language:
- en
---
# Uploaded Model

- **Developed by:** AquaLabs
- **License:** apache-2.0
- **Finetuned from model:** unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit

This Llama model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Hugging Face's TRL library.
# First, install the requirements with the following code:
```py
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Colab-specific installation with pinned dependencies
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth
!pip install snac
```
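Before running the pipeline, an optional sanity check can confirm the environment is usable (nothing here is required by the pipeline itself):

```py
# Optional: confirm the key packages import and whether a GPU is visible.
import torch
import unsloth  # noqa: F401
import snac     # noqa: F401

print("PyTorch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
```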
# Pipeline

The script below loads the fine-tuned model, generates Orpheus audio tokens from a text prompt, and decodes them into a 24 kHz waveform with the [SNAC](https://github.com/hubertsiuzdak/snac) codec.
```py
import torch
from unsloth import FastLanguageModel
from snac import SNAC
from IPython.display import Audio, display
import numpy as np

# Special token IDs from the Orpheus token layout
TOKENISER_LENGTH = 128256
START_OF_TEXT = 128000
END_OF_TEXT = 128009
START_OF_HUMAN = TOKENISER_LENGTH + 3   # part of the chat format; unused below
END_OF_HUMAN = TOKENISER_LENGTH + 4     # part of the chat format; unused below
START_OF_AI = TOKENISER_LENGTH + 5      # part of the chat format; unused below
END_OF_AI = TOKENISER_LENGTH + 6        # part of the chat format; unused below
GEN_START_TOKEN = 128259                # prepended before the text prompt
GEN_EOS_TOKEN = 128258                  # stops generation (end of speech)
GEN_END_EXTRA_TOKEN = 128260            # appended after the text prompt
GEN_REMOVE_TOKEN = 128258               # same ID as GEN_EOS_TOKEN; stripped from the output
CODE_OFFSET = 128266                    # first token ID of the audio codebook

def load_models(HF_TOKEN):
    # Load the fine-tuned Orpheus model and put it in inference mode
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="AquaLabs/Orpheus-3B-0.1-ft-Elise",
        max_seq_length=2048,
        token=HF_TOKEN
    )
    FastLanguageModel.for_inference(model)

    # SNAC decodes the generated audio codes into a 24 kHz waveform
    snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz", token=HF_TOKEN)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    snac_model = snac_model.to(device)
    return model, tokenizer, snac_model, device

def redistribute_codes(code_list, snac_model, device):
    # Orpheus emits audio codes in flat groups of 7; redistribute each group
    # across SNAC's three codebook layers (1 + 2 + 4 codes per group),
    # removing the per-position offset of 4096
    layer_1, layer_2, layer_3 = [], [], []
    num_groups = len(code_list) // 7
    for i in range(num_groups):
        group = code_list[7 * i: 7 * i + 7]
        layer_1.append(group[0])
        layer_2.append(group[1] - 4096)
        layer_3.append(group[2] - (2 * 4096))
        layer_3.append(group[3] - (3 * 4096))
        layer_2.append(group[4] - (4 * 4096))
        layer_3.append(group[5] - (5 * 4096))
        layer_3.append(group[6] - (6 * 4096))
    codes = [
        torch.tensor(layer_1).unsqueeze(0).to(device),
        torch.tensor(layer_2).unsqueeze(0).to(device),
        torch.tensor(layer_3).unsqueeze(0).to(device)
    ]
    audio_waveform = snac_model.decode(codes)
    return audio_waveform

def tts_pipeline(prompt, model, tokenizer, snac_model, device):
    # Wrap the tokenized prompt in the generation start and end tokens
    input_ids_tensor = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    start_token = torch.tensor([[GEN_START_TOKEN]], dtype=torch.int64, device=device)
    end_tokens = torch.tensor([[END_OF_TEXT, GEN_END_EXTRA_TOKEN]], dtype=torch.int64, device=device)
    modified_input_ids = torch.cat([start_token, input_ids_tensor, end_tokens], dim=1)
    attention_mask = torch.ones_like(modified_input_ids, device=device)

    generated_ids = model.generate(
        input_ids=modified_input_ids,
        attention_mask=attention_mask,
        max_new_tokens=1200,
        do_sample=True,
        temperature=0.6,
        top_p=0.95,
        repetition_penalty=1.1,
        num_return_sequences=1,
        eos_token_id=GEN_EOS_TOKEN,
        use_cache=True
    )

    # Keep only the tokens after the last marker token (128257),
    # which precedes the audio codes
    marker_token = 128257
    token_indices = (generated_ids == marker_token).nonzero(as_tuple=True)
    if len(token_indices[1]) > 0:
        last_marker = token_indices[1][-1].item()
        cropped_tensor = generated_ids[:, last_marker + 1:]
    else:
        cropped_tensor = generated_ids

    # Drop EOS tokens and trim to a whole number of 7-code groups
    processed_tokens = cropped_tensor[cropped_tensor != GEN_REMOVE_TOKEN]
    row_length = processed_tokens.size(0)
    new_length = (row_length // 7) * 7
    trimmed_tokens = processed_tokens[:new_length]

    # Shift token IDs back into codebook indices and decode to audio
    code_list = (trimmed_tokens - CODE_OFFSET).tolist()
    audio_waveform = redistribute_codes(code_list, snac_model, device)
    return audio_waveform

if __name__ == "__main__":
    HF_TOKEN = "YOUR_TOKEN"
    model, tokenizer, snac_model, device = load_models(HF_TOKEN)

    prompt = "In the image, there are two men riding bikes."
    audio_output = tts_pipeline(prompt, model, tokenizer, snac_model, device)

    # Move the waveform to the CPU and flatten it for playback
    audio_array = audio_output.detach().cpu().numpy()
    audio_array = np.squeeze(audio_array)
    if audio_array.ndim not in [1, 2]:
        raise ValueError("Array audio input must be a 1D or 2D array, but got shape: " + str(audio_array.shape))
    display(Audio(audio_array, rate=24000))
    print("Audio generation complete.")
```
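If you want to keep the generated audio rather than only play it inline, you can write the waveform to disk. A minimal sketch assuming the `soundfile` package, which is not installed by the steps above:

```py
# Optional: save the decoded waveform as a WAV file.
# Requires an extra dependency:  pip install soundfile
import soundfile as sf

# audio_array is the 1D NumPy array produced by the pipeline above;
# SNAC outputs audio at a 24 kHz sample rate.
sf.write("output.wav", audio_array, samplerate=24000)
```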
## Contributors
- Ahmet Erdem Pamuk - [GitHub](https://github.com/ahmeterdempmk) | [Hugging Face](https://huggingface.co/ahmeterdempmk)
- Emir Kaan Özdemir - [GitHub](https://github.com/emirkaanozdemr) | [Hugging Face](https://huggingface.co/emirkaanozdemr)
- Şuayp Talha Kocabay - [GitHub](https://github.com/suayptalha) | [Hugging Face](https://huggingface.co/suayptalha)

Details are provided in the [paper]().