|
--- |
|
base_model: unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit |
|
tags: |
|
- text-generation-inference |
|
- transformers |
|
- unsloth |
|
- llama |
|
- trl |
|
license: apache-2.0 |
|
language: |
|
- en |
|
--- |
|
|
|
# Uploaded Model |
|
|
|
- **Developed by:** AquaLabs |
|
- **License:** apache-2.0 |
|
- **Finetuned from model:** unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit
|
|
|
This Llama model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Hugging Face's TRL library.
|
|
|
# Installation

First, install the requirements:
|
```py |
|
import os

# Meant to be run in a notebook: the `!pip` syntax requires IPython/Colab.
# On Colab, install pinned dependencies first, then Unsloth itself without
# dependencies to avoid version conflicts; elsewhere a plain install works.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

# SNAC is needed to decode the generated audio codes into a waveform.
!pip install snac
|
``` |
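
After installation, a quick sanity check (a minimal sketch, not part of the original card) can confirm that the imports resolve and a GPU is visible; note that Unsloth itself expects a CUDA GPU:

```py
# Minimal environment check: assumes the packages above installed cleanly.
import torch
from unsloth import FastLanguageModel  # import check only
from snac import SNAC  # import check only

print("torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
```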
|
|
|
# Pipeline |
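
The pipeline below wraps the text prompt in Orpheus's special tokens, generates audio codes with the fine-tuned model, and decodes them into a 24 kHz waveform with SNAC: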
|
```py |
|
import torch
from unsloth import FastLanguageModel
from snac import SNAC
from IPython.display import Audio, display
import numpy as np

# Special token IDs appended to the base Llama 3 tokenizer vocabulary.
TOKENISER_LENGTH = 128256
START_OF_TEXT = 128000
END_OF_TEXT = 128009
START_OF_HUMAN = TOKENISER_LENGTH + 3
END_OF_HUMAN = TOKENISER_LENGTH + 4
START_OF_AI = TOKENISER_LENGTH + 5
END_OF_AI = TOKENISER_LENGTH + 6

# Tokens that frame the prompt and delimit the generated audio-code stream.
GEN_START_TOKEN = 128259       # prepended before the text prompt
GEN_EOS_TOKEN = 128258         # ends generation
GEN_END_EXTRA_TOKEN = 128260   # appended after the prompt, together with END_OF_TEXT
GEN_REMOVE_TOKEN = 128258      # same ID as GEN_EOS_TOKEN; stripped before decoding
CODE_OFFSET = 128266           # first audio-code ID; subtracted to recover raw SNAC codes
|
|
|
def load_models(HF_TOKEN):
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="AquaLabs/Orpheus-3B-0.1-ft-Elise",
        max_seq_length=2048,
        token=HF_TOKEN
    )
    FastLanguageModel.for_inference(model)

    # SNAC decodes the generated audio codes back into a 24 kHz waveform.
    snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz", token=HF_TOKEN)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    snac_model = snac_model.to(device).eval()

    return model, tokenizer, snac_model, device
|
|
|
def redistribute_codes(code_list, snac_model, device):
    # Each frame is 7 codes, interleaved across SNAC's three codebook
    # layers (1 code to layer 1, 2 to layer 2, 4 to layer 3). Position k
    # within a frame is stored with an offset of k * 4096, undone here.
    layer_1, layer_2, layer_3 = [], [], []
    num_groups = len(code_list) // 7
    for i in range(num_groups):
        group = code_list[7 * i: 7 * i + 7]
        layer_1.append(group[0])
        layer_2.append(group[1] - 4096)
        layer_3.append(group[2] - (2 * 4096))
        layer_3.append(group[3] - (3 * 4096))
        layer_2.append(group[4] - (4 * 4096))
        layer_3.append(group[5] - (5 * 4096))
        layer_3.append(group[6] - (6 * 4096))

    codes = [
        torch.tensor(layer_1).unsqueeze(0).to(device),
        torch.tensor(layer_2).unsqueeze(0).to(device),
        torch.tensor(layer_3).unsqueeze(0).to(device)
    ]

    audio_waveform = snac_model.decode(codes)
    return audio_waveform
|
|
|
def tts_pipeline(prompt, model, tokenizer, snac_model, device):
    input_ids_tensor = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    # Wrap the prompt: GEN_START_TOKEN ... END_OF_TEXT GEN_END_EXTRA_TOKEN
    start_token = torch.tensor([[GEN_START_TOKEN]], dtype=torch.int64, device=device)
    end_tokens = torch.tensor([[END_OF_TEXT, GEN_END_EXTRA_TOKEN]], dtype=torch.int64, device=device)
    modified_input_ids = torch.cat([start_token, input_ids_tensor, end_tokens], dim=1)

    attention_mask = torch.ones_like(modified_input_ids, device=device)

    generated_ids = model.generate(
        input_ids=modified_input_ids,
        attention_mask=attention_mask,
        max_new_tokens=1200,
        do_sample=True,
        temperature=0.6,
        top_p=0.95,
        repetition_penalty=1.1,
        num_return_sequences=1,
        eos_token_id=GEN_EOS_TOKEN,
        use_cache=True
    )

    # Token 128257 marks the start of the audio-code stream; keep only
    # the tokens after its last occurrence.
    marker_token = 128257
    token_indices = (generated_ids == marker_token).nonzero(as_tuple=True)
    if len(token_indices[1]) > 0:
        last_marker = token_indices[1][-1].item()
        cropped_tensor = generated_ids[:, last_marker + 1:]
    else:
        cropped_tensor = generated_ids

    # Drop the end-of-speech token and trim to a multiple of 7 so the
    # codes split cleanly into SNAC frames.
    processed_tokens = cropped_tensor[cropped_tensor != GEN_REMOVE_TOKEN]
    row_length = processed_tokens.size(0)
    new_length = (row_length // 7) * 7
    trimmed_tokens = processed_tokens[:new_length]

    # Map token IDs back to raw SNAC codes.
    code_list = (trimmed_tokens - CODE_OFFSET).tolist()

    audio_waveform = redistribute_codes(code_list, snac_model, device)
    return audio_waveform
|
|
|
if __name__ == "__main__":
    HF_TOKEN = "YOUR_TOKEN"
    model, tokenizer, snac_model, device = load_models(HF_TOKEN)

    prompt = "In the image, there are two men riding bikes."
    audio_output = tts_pipeline(prompt, model, tokenizer, snac_model, device)

    # SNAC returns a (1, 1, T) tensor; squeeze it to 1D for playback.
    audio_array = audio_output.detach().cpu().numpy()
    audio_array = np.squeeze(audio_array)

    if audio_array.ndim not in [1, 2]:
        raise ValueError(f"Audio array must be 1D or 2D, but got shape: {audio_array.shape}")

    display(Audio(audio_array, rate=24000))
    print("Audio generation complete.")
|
``` |
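
`IPython.display.Audio` only works in a notebook; writing the waveform to a WAV file is one alternative. A minimal sketch, assuming SciPy is available (it is not installed by the steps above) and that `audio_array` is the 1D float array produced by the pipeline:

```py
# Hypothetical follow-up: save the generated 24 kHz waveform to disk.
# Assumes SciPy is installed and `audio_array` holds floats in [-1, 1].
import numpy as np
from scipy.io import wavfile

pcm = (np.clip(audio_array, -1.0, 1.0) * 32767).astype(np.int16)
wavfile.write("output.wav", 24000, pcm)
```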
|
|
|
## Contributors |
|
|
|
- Ahmet Erdem Pamuk - [GitHub](https://github.com/ahmeterdempmk) | [Hugging Face](https://huggingface.co/ahmeterdempmk) |
|
- Emir Kaan Özdemir - [GitHub](https://github.com/emirkaanozdemr) | [Hugging Face](https://huggingface.co/emirkaanozdemr) |
|
- Şuayp Talha Kocabay - [GitHub](https://github.com/suayptalha) | [Hugging Face](https://huggingface.co/suayptalha) |
|
|
|
Details are provided in the [paper](). |