---
base_model: unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit
tags:
- text-generation-inference
- transformers
- unsloth
- llama
- trl
license: apache-2.0
language:
- en
---

# Uploaded Model

- **Developed by:** AquaLabs
- **License:** apache-2.0
- **Finetuned from model:** unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit

This Llama model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Hugging Face's TRL library.

# First, install the requirements with the following code:

```py
import os

# Colab needs a dependency-pinned install; elsewhere a plain install suffices
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth
!pip install snac
```

# Pipeline

```py
import torch
import numpy as np
from unsloth import FastLanguageModel
from snac import SNAC
from IPython.display import Audio, display

# Special-token IDs used by Orpheus
TOKENISER_LENGTH = 128256
START_OF_TEXT = 128000
END_OF_TEXT = 128009
START_OF_HUMAN = TOKENISER_LENGTH + 3
END_OF_HUMAN = TOKENISER_LENGTH + 4
START_OF_AI = TOKENISER_LENGTH + 5
END_OF_AI = TOKENISER_LENGTH + 6
GEN_START_TOKEN = 128259      # prepended to every generation request
GEN_EOS_TOKEN = 128258        # end-of-speech token that stops generation
GEN_END_EXTRA_TOKEN = 128260  # closes the text portion of the prompt
GEN_REMOVE_TOKEN = 128258     # stripped from the output before decoding
CODE_OFFSET = 128266          # audio-code token IDs start here


def load_models(HF_TOKEN):
    # Load the fine-tuned Orpheus model and its tokenizer
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="AquaLabs/Orpheus-3B-0.1-ft-Elise",
        max_seq_length=2048,
        token=HF_TOKEN
    )
    FastLanguageModel.for_inference(model)

    # Load the SNAC codec that decodes audio codes back into a waveform
    snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz", token=HF_TOKEN)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    snac_model = snac_model.to(device)

    return model, tokenizer, snac_model, device


def redistribute_codes(code_list, snac_model, device):
    # Each 7-code frame is de-interleaved across SNAC's three hierarchical
    # layers (1 code for layer 1, 2 for layer 2, 4 for layer 3); the k-th
    # code in a frame carries an offset of k * 4096 that must be removed.
    layer_1, layer_2, layer_3 = [], [], []
    num_groups = len(code_list) // 7
    for i in range(num_groups):
        group = code_list[7 * i: 7 * i + 7]
        layer_1.append(group[0])
        layer_2.append(group[1] - 4096)
        layer_3.append(group[2] - (2 * 4096))
        layer_3.append(group[3] - (3 * 4096))
        layer_2.append(group[4] - (4 * 4096))
        layer_3.append(group[5] - (5 * 4096))
        layer_3.append(group[6] - (6 * 4096))

    codes = [
        torch.tensor(layer_1).unsqueeze(0).to(device),
        torch.tensor(layer_2).unsqueeze(0).to(device),
        torch.tensor(layer_3).unsqueeze(0).to(device)
    ]
    audio_waveform = snac_model.decode(codes)
    return audio_waveform


def tts_pipeline(prompt, model, tokenizer, snac_model, device):
    # Wrap the tokenized prompt in the markers Orpheus expects
    input_ids_tensor = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
    start_token = torch.tensor([[GEN_START_TOKEN]], dtype=torch.int64, device=device)
    end_tokens = torch.tensor([[END_OF_TEXT, GEN_END_EXTRA_TOKEN]], dtype=torch.int64, device=device)
    modified_input_ids = torch.cat([start_token, input_ids_tensor, end_tokens], dim=1)
    attention_mask = torch.ones_like(modified_input_ids, device=device)

    generated_ids = model.generate(
        input_ids=modified_input_ids,
        attention_mask=attention_mask,
        max_new_tokens=1200,
        do_sample=True,
        temperature=0.6,
        top_p=0.95,
        repetition_penalty=1.1,
        num_return_sequences=1,
        eos_token_id=GEN_EOS_TOKEN,
        use_cache=True
    )

    # Keep only the tokens after the last audio-start marker (128257)
    marker_token = 128257
    token_indices = (generated_ids == marker_token).nonzero(as_tuple=True)
    if len(token_indices[1]) > 0:
        last_marker = token_indices[1][-1].item()
        cropped_tensor = generated_ids[:, last_marker + 1:]
    else:
        cropped_tensor = generated_ids

    # Drop the end-of-speech token and trim to a whole number of 7-code frames
    processed_tokens = cropped_tensor[cropped_tensor != GEN_REMOVE_TOKEN]
    row_length = processed_tokens.size(0)
    new_length = (row_length // 7) * 7
    trimmed_tokens = processed_tokens[:new_length]
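    # Everything left is audio tokens: subtracting CODE_OFFSET maps each token
    # ID back to a raw SNAC code in [0, 7 * 4096), which redistribute_codes
    # then splits across the codec's three layers.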
    code_list = (trimmed_tokens - CODE_OFFSET).tolist()
    audio_waveform = redistribute_codes(code_list, snac_model, device)
    return audio_waveform


if __name__ == "__main__":
    HF_TOKEN = "YOUR_TOKEN"
    model, tokenizer, snac_model, device = load_models(HF_TOKEN)

    prompt = "In the image, there are two men riding bikes."
    audio_output = tts_pipeline(prompt, model, tokenizer, snac_model, device)

    # Convert the waveform tensor to a NumPy array for inline playback
    audio_array = audio_output.detach().cpu().numpy()
    audio_array = np.squeeze(audio_array)
    if audio_array.ndim not in [1, 2]:
        raise ValueError("Audio input must be a 1D or 2D array, but got shape: " + str(audio_array.shape))

    display(Audio(audio_array, rate=24000))
    print("Audio generation complete.")
```
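
# Saving the Output

A minimal sketch for writing the generated waveform to disk instead of only playing it inline. It assumes the optional `soundfile` package (`pip install soundfile`) and reuses the `audio_array` from the script above; the `output.wav` filename is just a placeholder.

```py
import soundfile as sf

# SNAC decodes at 24 kHz, so write the file at the same sample rate
sf.write("output.wav", audio_array, samplerate=24000)
```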