AquaLabs
/

Orpheus-3B-0.1-ft-Elise

@@ -11,12 +11,125 @@ language:
 - en
 ---
-# Uploaded  model
 - **Developed by:** ahmeterdempmk
 - **License:** apache-2.0
 - **Finetuned from model :** unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit
-This llama model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Huggingface's TRL library.
-[<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20made%20with%20love.png" width="200"/>](https://github.com/unslothai/unsloth)

 - en
 ---
+# Uploaded Model
 - **Developed by:** ahmeterdempmk
 - **License:** apache-2.0
 - **Finetuned from model :** unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit
+This Llama model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Hugging Face's TRL library.
+# Pipeline
+```py
+import torch
+from unsloth import FastLanguageModel
+from snac import SNAC
+from IPython.display import Audio, display
+import numpy as np
+TOKENISER_LENGTH = 128256
+START_OF_TEXT = 128000
+END_OF_TEXT = 128009
+START_OF_HUMAN = TOKENISER_LENGTH + 3
+END_OF_HUMAN = TOKENISER_LENGTH + 4
+START_OF_AI = TOKENISER_LENGTH + 5
+END_OF_AI = TOKENISER_LENGTH + 6
+GEN_START_TOKEN = 128259
+GEN_EOS_TOKEN = 128258
+GEN_END_EXTRA_TOKEN = 128260
+GEN_REMOVE_TOKEN = 128258
+CODE_OFFSET = 128266
+def load_models(HF_TOKEN):
+    model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name="ahmeterdempmk/Orpheus-3B-0.1-ft-Elise",
+        max_seq_length=2048,
+        token=HF_TOKEN
+    )
+    FastLanguageModel.for_inference(model)
+    snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz", token=HF_TOKEN)
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    snac_model = snac_model.to(device)
+    return model, tokenizer, snac_model, device
+def redistribute_codes(code_list, snac_model, device):
+    layer_1, layer_2, layer_3 = [], [], []
+    num_groups = len(code_list) // 7
+    for i in range(num_groups):
+        group = code_list[7 * i: 7 * i + 7]
+        layer_1.append(group[0])
+        layer_2.append(group[1] - 4096)
+        layer_3.append(group[2] - (2 * 4096))
+        layer_3.append(group[3] - (3 * 4096))
+        layer_2.append(group[4] - (4 * 4096))
+        layer_3.append(group[5] - (5 * 4096))
+        layer_3.append(group[6] - (6 * 4096))
+    codes = [
+        torch.tensor(layer_1).unsqueeze(0).to(device),
+        torch.tensor(layer_2).unsqueeze(0).to(device),
+        torch.tensor(layer_3).unsqueeze(0).to(device)
+    ]
+    audio_waveform = snac_model.decode(codes)
+    return audio_waveform
+def tts_pipeline(prompt, model, tokenizer, snac_model, device):
+    input_ids_tensor = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
+    start_token = torch.tensor([[GEN_START_TOKEN]], dtype=torch.int64, device=device)
+    end_tokens = torch.tensor([[END_OF_TEXT, GEN_END_EXTRA_TOKEN]], dtype=torch.int64, device=device)
+    modified_input_ids = torch.cat([start_token, input_ids_tensor, end_tokens], dim=1)
+    attention_mask = torch.ones_like(modified_input_ids, device=device)
+    generated_ids = model.generate(
+        input_ids=modified_input_ids,
+        attention_mask=attention_mask,
+        max_new_tokens=1200,
+        do_sample=True,
+        temperature=0.6,
+        top_p=0.95,
+        repetition_penalty=1.1,
+        num_return_sequences=1,
+        eos_token_id=GEN_EOS_TOKEN,
+        use_cache=True
+    )
+    marker_token = 128257
+    token_indices = (generated_ids == marker_token).nonzero(as_tuple=True)
+    if len(token_indices[1]) > 0:
+        last_marker = token_indices[1][-1].item()
+        cropped_tensor = generated_ids[:, last_marker + 1:]
+    else:
+        cropped_tensor = generated_ids
+    processed_tokens = cropped_tensor[cropped_tensor != GEN_REMOVE_TOKEN]
+    row_length = processed_tokens.size(0)
+    new_length = (row_length // 7) * 7
+    trimmed_tokens = processed_tokens[:new_length]
+    code_list = (trimmed_tokens - CODE_OFFSET).tolist()
+    audio_waveform = redistribute_codes(code_list, snac_model, device)
+    return audio_waveform
+if __name__ == "__main__":
+    HF_TOKEN = "YOUR_TOKEN"
+    model, tokenizer, snac_model, device = load_models(HF_TOKEN)
+    prompt = "In the image, there is 2 man riding bike."
+    audio_output = tts_pipeline(prompt, model, tokenizer, snac_model, device)
+    audio_array = audio_output.detach().cpu().numpy()
+    audio_array = np.squeeze(audio_array)
+    if audio_array.ndim not in [1, 2]:
+        raise ValueError("Array audio input must be a 1D or 2D array, but got shape: " + str(audio_array.shape))
+    display(Audio(audio_array, rate=24000))
+    print("Audio generation complete.")
+```